In [None]:
from scipy.io import arff
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection

%matplotlib inline

In [None]:
import os

print(os.getcwd())  # show the directory the notebook is running from

# Location of the ARFF data set. The default is the original hard-coded location;
# set the AUTISM_DATA_PATH environment variable to load the file from elsewhere
# without editing this cell.
DATA_PATH = os.environ.get('AUTISM_DATA_PATH', 'C:/Users/Alex JS/Autism-Adult-Data.arff')

data = arff.loadarff(DATA_PATH)
df = pd.DataFrame(data[0])   # first element of loadarff's result holds the records; convert to pandas df
df.head()


In [None]:
# Convert column data types.
# The ten questionnaire answers (A1_Score .. A10_Score) and 'result' are cast to int.
for score_col in ['A%d_Score' % i for i in range(1, 11)] + ['result']:
    df[score_col] = df[score_col].astype(int)

# The remaining nominal columns are decoded from bytes to text as UTF-8.
for text_col in ['gender', 'ethnicity', 'jundice', 'austim', 'contry_of_res',
                 'used_app_before', 'age_desc', 'relation', 'Class/ASD']:
    df[text_col] = df[text_col].str.decode('utf-8')
df.head()

In [None]:
# Print pandas' summary of the frame (columns, non-null counts, dtypes).
df.info()   # check datatypes

In [None]:
# Display pandas' descriptive statistics for the frame.
df.describe()

In [None]:
df.replace("?", np.nan, inplace=True)   # treat the "?" placeholder as a missing value

# Per-column number of missing values and the percentage of rows they represent,
# both sorted from most to least missing.
null_mask = df.isnull()
missing_data = null_mask.sum().sort_values(ascending=False)
percent_missing = (null_mask.sum() / null_mask.count() * 100).sort_values(ascending=False)
m_data = pd.concat([missing_data, percent_missing], axis=1, keys=['Total', 'Percent'])
m_data.head()

In [None]:
df['age'].max(), df['age'].min()    # largest and smallest recorded age

In [None]:
# Remove the person recorded as aged 383. Take an explicit copy so that the column
# assignments made in later cells act on an independent frame rather than on a
# slice of the original (avoids pandas' SettingWithCopyWarning).
df = df[df.age != 383].copy()

In [None]:
df['age'].max(), df['age'].min()   # largest and smallest age currently in the frame

In [None]:
# Replace null age fields with the average age. The result is assigned back to the
# column instead of calling fillna(inplace=True) on df['age']: the in-place form
# operates on an intermediate Series and does not update df under pandas
# copy-on-write (and emits a chained-assignment warning in pandas 2.x).
df['age'] = df['age'].fillna(df['age'].mean())

In [None]:
# Recalculate the total and percentage of missing data per column,
# sorted from most to least missing.
null_mask = df.isnull()
missing_data = null_mask.sum().sort_values(ascending=False)
percent_missing = (null_mask.sum() / null_mask.count() * 100).sort_values(ascending=False)
m_data = pd.concat([missing_data, percent_missing], axis=1, keys=['Total', 'Percent'])
m_data.head()

In [None]:
# Normalise column names: strip surrounding whitespace and convert to lower case.
df.columns = [column_name.strip().lower() for column_name in df.columns]

In [None]:
# Convert yes/no answers into 1/0: 'no' becomes 0, every other value becomes 1.
df.jundice = df.jundice.ne('no').astype('int64')
df.austim = df.austim.ne('no').astype('int64')
df.used_app_before = df.used_app_before.ne('no').astype('int64')
df.rename(columns={'class/asd': 'classification'}, inplace=True)
# Gender: 'f' becomes 0, every other value becomes 1 (females 0, males 1).
df.gender = df.gender.ne('f').astype('int64')

In [None]:
# Encode the autism class label: 'NO' becomes 0, every other value becomes 1.
df.classification = df.classification.ne('NO').astype('int64')
df.head()

In [None]:
# Cleaning categorical data: remove single quotes, then trim surrounding whitespace.
for text_col in ('contry_of_res', 'relation', 'ethnicity'):
    df[text_col] = df[text_col].str.replace("'", "").str.strip()

In [None]:
df.loc[df.classification == 0, 'gender'].value_counts()  # males (1) vs females (0) without ASD

In [None]:
df.loc[df.classification == 1, 'gender'].value_counts()  # males (1) vs females (0) with ASD

In [None]:
# Bar plot of ethnicity counts among rows classified as not having ASD.
df.loc[df.classification == 0, 'ethnicity'].value_counts().plot.bar()

In [None]:
# Bar plot of ethnicity counts among rows classified as having ASD.
df.loc[df.classification == 1, 'ethnicity'].value_counts().plot.bar()

In [None]:
fig = plt.figure()
ax = plt.subplot(111)
title = 'Countries with most people classified not having ASD'
# Bar plot of the number of people without ASD per country, limited to the ten most
# frequent countries. value_counts() sorts in descending order, so head(10) keeps the
# top ten. Restricting the view with ax.set_xlim(1, 10) instead would hide the bar at
# position 0 (the most frequent country) and clip the bar at position 1.
df.contry_of_res[df.classification == 0].value_counts().head(10).plot(kind='bar', ax=ax, title=title)


In [None]:
fig = plt.figure()
ax = plt.subplot(111)
title = 'Countries with most people classified as having ASD'

# Bar plot of the number of people with ASD per country, limited to the ten most
# frequent countries. value_counts() sorts in descending order, so head(10) keeps the
# top ten. Restricting the view with ax.set_xlim(1, 10) instead would hide the bar at
# position 0 (the most frequent country) and clip the bar at position 1.
df.contry_of_res[df.classification == 1].value_counts().head(10).plot(kind='bar', ax=ax, title=title)

In [None]:
# Drop the categorical columns in a single call.
df.drop(columns=['relation', 'age_desc', 'contry_of_res', 'ethnicity'], inplace=True)

In [None]:
# Re-encode the listed columns as integer codes with a single LabelEncoder that is
# refitted on each column in turn.
# NOTE(review): for 'age' this replaces the actual ages with their position among the
# distinct age values -- confirm that is intended.
labelEnc = LabelEncoder()
columns = ['gender', 'age', 'classification', 'jundice']
for cols in columns:
    encoded = labelEnc.fit_transform(df[cols])
    df[cols] = encoded
    

In [None]:
x = df.drop(['classification'], axis=1)   # features: every column except the target
y = df['classification']                  # target column
# Split the data 50/50. random_state is fixed so the split, and every score computed
# from it, is the same on each run of the notebook.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=42)

In [None]:
# Classifiers used, as (display name, estimator) pairs.
# random_state is fixed on the randomised estimators (decision tree, random forest)
# so that repeated runs of the notebook give the same scores.
classifiers = [
    ('Logistic Regression', LogisticRegressionCV(cv=10, max_iter=2000)),
    ('Decision Trees', DecisionTreeClassifier(random_state=42)),
    ('Random Forests', RandomForestClassifier(n_estimators=5, random_state=42)),
    ('K Neighbours', KNeighborsClassifier(n_neighbors=5)),
    ('SVM', SVC(kernel='linear')),
]

In [None]:
classes = ["No", "Yes"]   # display names passed to classification_report
results = []              # one array of cross-validation accuracies per classifier
for name, classifier in classifiers:
    # Cross-validated accuracy on the full x / y using 5 unshuffled folds.
    kfold = model_selection.KFold(n_splits=5)
    result = model_selection.cross_val_score(classifier, x, y, cv=kfold, scoring='accuracy')
    results.append(result)

    # Hold-out evaluation: fit on the training split, predict the test split.
    classifier.fit(x_train, y_train)
    pred = classifier.predict(x_test)
    acc_score = accuracy_score(y_test, pred)

    print(name)
    print('Accuracy Score: ', acc_score)
    for metric_label, metric in (('Recall Score: ', recall_score), ('F1 Score: ', f1_score)):
        print(metric_label, metric(y_test, pred))
    print(classification_report(y_test, pred, target_names=classes))
    print(confusion_matrix(y_test, pred))
    # separator between classifiers
    print("------------------------------------------------------------------------------")
    
    

In [None]:
names = ['LR', 'DT', 'RF', 'KNN', 'SVM']   # tick labels, one per entry in results
fig = plt.figure()
fig.suptitle('Classifier comparison')
ax = fig.add_subplot(111)
ax.boxplot(results)   # one box per classifier's cross-validation scores
ax.set_xticklabels(names)
plt.show()   # call the function; a bare `plt.show` only echoes the function object

In [None]:
# Grid-search C and gamma for a default SVC, scored by 10-fold cross-validated
# accuracy on the training split.
svc = SVC()
parameters = dict(
    C=[0.1, 0.3, 0.4, 0.6, 1.1, 1.2, 1.3, 1.4, 1.9],
    gamma=[0.1, 0.8, 0.9, 1, 1.1, 1.2, 1.3, 1.4],
)

gsc = GridSearchCV(estimator=svc, param_grid=parameters, scoring='accuracy', cv=10)
gsc.fit(x_train, y_train)
gsc.best_params_

In [None]:
# Fit an SVM with fixed hyper-parameters and score it on the test split.
# NOTE(review): gamma is a coefficient for the 'rbf', 'poly' and 'sigmoid' kernels, so it
# has no effect with kernel='linear' -- confirm which kernel is intended here.
svc2 = SVC(C=1.1, gamma=0.1, kernel='linear')
svc2.fit(x_train, y_train)
prediction = svc2.predict(x_test)
# Score svc2's own predictions. Using `pred` here would score the leftover predictions
# of the last classifier in the comparison loop instead of this model.
print(accuracy_score(y_test, prediction))

In [None]:
# Drop all individual features (and the 'result' column) into a new frame.
# DataFrame.drop returns a new frame, so df itself is left untouched. A plain
# `df2 = df` would only create an alias, and in-place drops through it would also
# remove the columns from df and make this cell fail when run a second time.
df2 = df.drop(columns=['age', 'gender', 'jundice', 'austim', 'used_app_before', 'result'])

In [None]:
x = df2.drop(['classification'], axis=1)   # features: every remaining column except the target
y = df2['classification']                  # target column
# Create new 50/50 training and test sets. random_state is fixed so the split, and
# every score computed from it, is the same on each run of the notebook.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=42)

In [None]:
results = []              # one array of cross-validation accuracies per classifier
classes = ["No", "Yes"]   # display names passed to classification_report
for name, classifier in classifiers:
    # Cross-validated accuracy on the full x / y using 5 unshuffled folds.
    kfold = model_selection.KFold(n_splits=5)
    result = model_selection.cross_val_score(classifier, x, y, cv=kfold, scoring='accuracy')
    results.append(result)

    # Hold-out evaluation: fit on the training split, predict the test split.
    classifier.fit(x_train, y_train)
    pred = classifier.predict(x_test)

    print(name)
    for metric_label, metric in (('Accuracy Score: ', accuracy_score),
                                 ('Recall Score: ', recall_score),
                                 ('F1 Score: ', f1_score)):
        print(metric_label, metric(y_test, pred))
    print(classification_report(y_test, pred, target_names=classes))
    print(confusion_matrix(y_test, pred))
    # separator between classifiers
    print("------------------------------------------------------------------------------")
    

In [None]:
names = ['LR', 'DT', 'RF', 'KNN', 'SVM']   # tick labels, one per entry in results
fig = plt.figure()
fig.suptitle('Classifier comparison')
ax = fig.add_subplot(111)
ax.boxplot(results)   # one box per classifier's cross-validation scores
ax.set_xticklabels(names)
plt.show()   # call the function; a bare `plt.show` only echoes the function object
