# Data Analysis using SVC and KNN

In [None]:
# Importing libraries
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

%matplotlib inline

In [None]:
# Load the chosen dataset into a DataFrame.
# The path is relative, so the CSV must be in the notebook's working directory.

df = pd.read_csv('Maltreatment_Types_of_Victims.csv', encoding='utf-8')

## Understanding data

In [None]:
# List the column labels of the DataFrame.

df.columns 

In [None]:
df.dtypes # the data type of every column in the DataFrame

In [None]:
df.shape # (rows, columns) of the raw data - the original run reported 54 rows (samples)
# and 21 columns (variables)

In [None]:
df['State'].value_counts() # a Series counting how often each distinct value occurs in 'State'; in the original run every value had a count of 1

In [None]:
df['Medical Neglect Only'].value_counts() # same check for a different column; the original run showed
# a count of 2 for '25.0' and '49.0', and a count of 1 for every other value

In [None]:
df['Sexual Abuse Only'].value_counts() # same check; the original run showed a count of 2 for '45.0' and '39.0',
# and a count of 1 for every other value

In [None]:
df['Neglect Only'].value_counts() # same check; the original run showed a count of 2 for '3444' and a count of 1 for every other value


In [None]:
df['Other Only'].value_counts() # same check; the original run showed a count of 35 for '1.0', a count of 2 for '3.0' and '21.0',
# and a count of 1 for every other value


In [None]:
df['Physical Abuse Only'].value_counts() # same check; in the original run every value had a count of 1
 

In [None]:
df['Psychological Maltreatment Only'].value_counts() # same check; the original run showed a count of 8 for '1.0',
# a count of 2 for '2.0', '29.0' and '3.0', and a count of 1 for every other value


In [None]:
df['Sex Trafficking Only'].value_counts() # the original run showed a count of 35 for '1.0', a count of 2 for '3.0', '7.0' and '14.0',
# and a count of 1 for every other value


In [None]:
df['Unknown Only'].value_counts()   # same check; the original run showed a count of 51 for '3.0' and a count of 1 for every other value

In [None]:
df['Multiple Maltreatment Types'].value_counts() # same check for the 'Multiple Maltreatment Types' column


To investigate the dataset, a correlation matrix and K-best feature selection will be used to select the appropriate variables.

In [None]:
df.head() # the first few rows of the DataFrame as loaded from the original CSV file;
# no data processing has been performed yet

In [None]:
df.info() # summary of each column's data type and non-null count - in the original run most columns
# were integers or floats, except 'State', which was an object column

In [None]:
df.isnull().sum() # the number of null values in each column - these are replaced later
# by the mode of the respective column

In [None]:
# Correlation matrix of the variables, drawn as an annotated heatmap.
# Only numeric columns are selected first: df still contains the text column
# 'State', and recent pandas releases raise an error when corr() is given a
# non-numeric column instead of silently skipping it.

corr = df.select_dtypes(include='number').corr()
plt.figure(figsize=(15,10))
sns.heatmap(corr, annot = True, cmap="BuPu")


## Processing data

In [None]:
# Remove the percentage columns, which appear redundant next to the count
# columns, along with 'Total Victims', which provides no additional information
# other than the sum.
#
# df is rebound (no inplace=True) and errors='ignore' is passed so the cell can
# be re-run safely: once the columns are gone, a second run is a no-op instead
# of raising KeyError.

df = df.drop(
    columns=['Medical Neglect Only Percent', 'Neglect Only Percent', 'Other Only Percent',
             'Physical Abuse Only Percent', 'Psychological Maltreatment Only Percent',
             'Sexual Abuse Only Percent', 'Sex Trafficking Only Percent', 'Unknown Only Percent',
             'Multiple Maltreatment Types Percent', 'Total Victims Percent', 'Total Victims'],
    errors='ignore',
)


        

In [None]:
df.dtypes # run again to reflect the changes after removing several columns


In [None]:
# Correlation matrix recomputed now that only the purposeful variables remain.
# Numeric columns are selected explicitly because df still holds the text
# column 'State', which recent pandas releases refuse to correlate.

corr = df.select_dtypes(include='number').corr()
plt.figure(figsize=(15,10))
sns.heatmap(corr, annot = True, cmap="BuPu")


Conclusion drawn so far:

- The 'Unknown Only' variable is less strongly correlated with the other variables. However, since its coefficients are still quite high (around 0.72-0.74), it seems valuable to keep it in the dataset. Moreover, it suggests that unexplored types of child abuse could be investigated further with more qualitative research.

- 'Other' is also quite high despite being unspecified. It is also interesting that 'Unknown Only' and 'Sex Trafficking Only' share a positive correlation coefficient of 1.

- Although it is clear why sex trafficking and medical neglect are associated (0.98), more research may be useful, since they might share a common variable that connects them.


In [None]:
df.head() # the first few rows of the DataFrame, to reflect the changes made so far

In [None]:
# Count the missing values per column again, now that the set of columns has been reduced.
df.isnull().sum()

In [None]:
# Fill the missing values in each listed column with that column's mode.
# The mode was chosen because it is the most frequent value, i.e. a number
# that recurs in the column. mode() can return several values; [0] takes the
# first one it returns.

for column in (
    'Medical Neglect Only',
    'Neglect Only',
    'Other Only',
    'Psychological Maltreatment Only',
    'Sexual Abuse Only',
    'Sex Trafficking Only',
    'Unknown Only',
):
    most_frequent = df[column].mode()[0]
    df[column] = df[column].fillna(most_frequent)

In [None]:
df.isnull().sum() # displayed again to check that the missing values were replaced (in the original run they were)

In [None]:
df.shape # rechecking the shape now that the desired columns have been selected;
# the original run reported 54 rows (samples) and 10 columns (variables)

In [None]:
df.head() # the first few rows of the DataFrame, now with the missing values replaced

In [None]:
# Recheck the correlation matrix to see whether replacing the missing values
# with the mode affected the correlations (it did).
# Numeric columns are selected explicitly because df still holds the text
# column 'State', which recent pandas releases refuse to correlate.

corr = df.select_dtypes(include='number').corr()
plt.figure(figsize=(15,10))
sns.heatmap(corr, annot = True, cmap="BuPu")

# Observed in the original run: the correlation coefficient between sex
# trafficking and unknown dropped to .69, but is still relatively high.

In [None]:
df.info() # checked once again to note the differences from the earlier summary

In [None]:
# 'State' is used as the target (y) variable; every other column is a feature.

y = df['State']
X = df.drop(columns=['State'])

In [None]:
# After choosing the target variable, feature selection is used to find the features that are most predictive
# of the target 'y' value; for that I have chosen the chi2 scoring function and the k-best method

In [None]:
# this imports the libraries relevant for the task outline above 

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2     

In [None]:
# Score every feature against the target with the chi2 test; k='all' keeps all
# features, so the selector is used only to obtain the scores.
kbest = SelectKBest(score_func=chi2, k='all')
ordered_features = kbest.fit(X, y)  # fit() returns the fitted selector

# One single-column frame for the scores and one for the feature names, then
# placed side by side (both share the same default row index).
df_scores = pd.DataFrame({'Scoring': ordered_features.scores_})
df_columns = pd.DataFrame({'Variable_Name': list(X.columns)})

feature_rank = pd.concat([df_scores, df_columns], axis=1)


In [None]:
feature_rank.nlargest(54,'Scoring') # the variables ranked by their score, highest first; n=54 is a cap on the number of rows returned

In [None]:
# Convert the processed DataFrame into a NumPy array for the supervised learning steps below.
# This array is built from the whole of df, so it still includes the 'State' column.

data = df.to_numpy()


In [None]:
data # the data that was previously held in a DataFrame, now as a NumPy array

In [None]:
data.shape # (rows, columns) of the array; the original run reported 54 rows (samples)
# and 10 columns (variables)


## Preparing data

In [None]:
# Rebuild the feature matrix X (every column except 'State') and the target y
# ('State') from df, and convert both to NumPy arrays for the steps that follow.

X = df.drop(columns=['State']).to_numpy()
y = df['State'].to_numpy()

In [None]:
X.shape # with the target column 'State' dropped, X holds the remaining feature columns (9 in the original run)

In [None]:
y.shape # y holds one value per row from the single column 'State' (54 data points in the original run)

In [None]:
# Shuffle the row indices with a fixed seed so that the split, and every
# result that depends on it, is identical on each run of the notebook.
i = np.random.default_rng(42).permutation(len(X))

# Hold out the last 10 shuffled rows for testing and train on all the others
# (that is 44 training rows if the data has the 54 rows reported earlier).
x_train = X[i[:-10]]    # features of every row except the last 10 shuffled rows
y_train = y[i[:-10]]    # target values for those same training rows
x_test = X[i[-10:]]     # features of the last 10 shuffled rows
y_test = y[i[-10:]]     # target values for those same 10 test rows


In [None]:
x_train.shape # the training features: every shuffled row except the last 10 (44 rows if the data has 54)

In [None]:
y_train.shape # the training targets: every shuffled row except the last 10 (44 values if the data has 54)

In [None]:
x_test.shape # the test features: the last 10 shuffled rows

In [None]:
y_test.shape # the test targets: the last 10 shuffled rows

In [None]:
x_train # the training samples (all shuffled rows except the last 10) that the supervised models are fitted on

## K-Nearest Neighbors (KNN)

In [None]:
# Apply the KNN classifier with the library's default settings: fit it on the
# training split, then generate predictions for the held-out samples.

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier().fit(x_train, y_train)  # fit() returns the estimator itself

# Last expression of the cell, so the predicted labels are displayed.
knn.predict(x_test)

In [None]:
y_test # these are the correct outcomes

In [None]:
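# Recorded result: the KNN classifier has 11 out of 11 correct.
# NOTE(review): the split above keeps only 10 test samples (i[-10:]), and if every 'State' label occurs once
# (as noted earlier) the held-out labels never appear in training - re-check this figure against a fresh run.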

In [None]:
# Helper that measures the accuracy of a classifier (used below with KNN).

def classify(model, x, y, test_size=0.5, random_state=40):
    """Fit ``model`` on one part of (x, y) and print its accuracy on the rest.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``score(X, y)``.
    x : feature matrix.
    y : target labels aligned with ``x``.
    test_size : share of the rows held out for scoring; forwarded to
        ``train_test_split`` (default 0.5).
    random_state : seed forwarded to ``train_test_split`` so the split is
        repeatable (default 40).

    Returns
    -------
    None. The accuracy, as a percentage, is printed.
    """
    # Split the arguments that were passed in rather than the notebook-level X.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )
    model.fit(x_train, y_train)
    # Score only on rows the model did not see while fitting; scoring on the
    # full data would mix training rows into the accuracy figure.
    print('Accuracy is: ', model.score(x_test, y_test)*100)
 

In [None]:
# Build a KNN model with 5 neighbours and print its accuracy via classify(),
# passing the full feature matrix X and target y.
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier(n_neighbors=5)
classify(knn_model,X,y)


In [None]:
# Recorded result: the accuracy is 11% (value from the original run; re-run the cell above to refresh it)

## Support Vector Machine (SVM)

In [None]:
# The second supervised learning algorithm used is the Support Vector Machine
# (SVM): a classifier with a linear kernel and C = 1, fitted on the training
# split and then used to predict the held-out samples.

from sklearn.svm import SVC

SVC_model = SVC(C=1, kernel='linear').fit(x_train, y_train)  # fit() returns the estimator itself
y_prediction = SVC_model.predict(x_test)

In [None]:
y_prediction = SVC_model.predict(x_test) # save predictions in y_prediction (repeats the prediction already made in the cell that fits the model)

In [None]:
# Compare the true test labels with the SVC predictions using scikit-learn's
# confusion_matrix function.

from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true=y_test, y_pred=y_prediction)

In [None]:
sns.heatmap(cm,annot=True) # visualise the confusion matrix as a heatmap with each cell's value written in it

In [None]:
# these are the libraries needed to visualise the results from above in a scatter graph
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
style.use("ggplot")
from sklearn import svm

In [None]:
# Scatter plot of the true test labels (x-axis) against the SVC predictions (y-axis).
plt.scatter(y_test,y_prediction)
plt.show()

In [None]:
y_test # the true test labels, plotted on the x-axis of the scatter graph

In [None]:
y_prediction # the SVC predictions, plotted on the y-axis of the scatter graph

In [None]:
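# Recorded result: the SVC model correctly identified 6 out of 11.
# NOTE(review): the split above keeps only 10 test samples (i[-10:]) - re-check this figure against a fresh run.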

Data Analysis using SVC and KNN (C) 17223-SS025, (2022). All rights reserved.