In [1]:
# Necessary imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Supress warnings
import warnings
warnings.filterwarnings("ignore")

# Classification
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis , QuadraticDiscriminantAnalysis
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

In [2]:
dataset = pd.read_csv("Toddler Autism dataset July 2018.csv")

In [3]:
dataset.head()

Unnamed: 0,Case_No,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Age_Mons,Qchat-10-Score,Sex,Ethnicity,Jaundice,Family_mem_with_ASD,Who completed the test,Class/ASD Traits
0,1,0,0,0,0,0,0,1,1,0,1,28,3,f,middle eastern,yes,no,family member,No
1,2,1,1,0,0,0,1,1,0,0,0,36,4,m,White European,yes,no,family member,Yes
2,3,1,0,0,0,0,0,1,1,0,1,36,4,m,middle eastern,yes,no,family member,Yes
3,4,1,1,1,1,1,1,1,1,1,1,24,10,m,Hispanic,no,no,family member,Yes
4,5,1,1,0,1,1,1,1,1,1,1,20,9,f,White European,no,yes,family member,Yes


In [4]:
df = dataset.drop(["Case_No","Qchat-10-Score"], axis=1)

In [5]:
df.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Age_Mons,Sex,Ethnicity,Jaundice,Family_mem_with_ASD,Who completed the test,Class/ASD Traits
0,0,0,0,0,0,0,1,1,0,1,28,f,middle eastern,yes,no,family member,No
1,1,1,0,0,0,1,1,0,0,0,36,m,White European,yes,no,family member,Yes
2,1,0,0,0,0,0,1,1,0,1,36,m,middle eastern,yes,no,family member,Yes
3,1,1,1,1,1,1,1,1,1,1,24,m,Hispanic,no,no,family member,Yes
4,1,1,0,1,1,1,1,1,1,1,20,f,White European,no,yes,family member,Yes


In [6]:
df = df.dropna()

In [7]:
df.shape

(1054, 17)

In [8]:
# Printing out unique values for each column
for column_name in df.columns:
    print(
        """
    {column_name}:
    {unique_values}""".format(
            column_name=column_name,
            unique_values=", ".join(
                map(str, df[column_name].unique())
            ),
        )
    )
    
print(
    """
NUMBER OF EXAMPLES:{}
NUMBER OF COLUMNS: {}
""".format(
        df.shape[0], df.shape[1]
    )
)


    A1:
    0, 1

    A2:
    0, 1

    A3:
    0, 1

    A4:
    0, 1

    A5:
    0, 1

    A6:
    0, 1

    A7:
    1, 0

    A8:
    1, 0

    A9:
    0, 1

    A10:
    1, 0

    Age_Mons:
    28, 36, 24, 20, 21, 33, 22, 17, 25, 15, 18, 12, 29, 35, 32, 19, 14, 13, 30, 23, 34, 26, 31, 27, 16

    Sex:
    f, m

    Ethnicity:
    middle eastern, White European, Hispanic, black, asian, south asian, Native Indian, Others, Latino, mixed, Pacifica

    Jaundice:
    yes, no

    Family_mem_with_ASD:
    no, yes

    Who completed the test:
    family member, Health Care Professional, Health care professional, Self, Others

    Class/ASD Traits :
    No, Yes

NUMBER OF EXAMPLES:1054
NUMBER OF COLUMNS: 17



In [9]:
df.columns

Index(['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'Age_Mons',
       'Sex', 'Ethnicity', 'Jaundice', 'Family_mem_with_ASD',
       'Who completed the test', 'Class/ASD Traits '],
      dtype='object')

In [10]:
# Some columns need to be labeled and some need to be one hot encoded while some are in goodshape
# for our prediction
org_data = df[
    [
        "A1",
        "A2",
        "A3",
        "A4",
        "A5",
        "A6",
        "A7",
        "A8",
        "A9",
        "A10",
        "Age_Mons",
    ]
]

In [11]:
label_data = df[
    [
        "Sex",
        "Jaundice",
        "Family_mem_with_ASD",
        "Class/ASD Traits ",
    ]
]

In [12]:
one_hot_encoded_data = df[
    ["Ethnicity", "Who completed the test"]
]

In [13]:
for column in label_data.columns:
    label_data[column] = label_data[column].apply(lambda x:1 if x=="yes" or x=="Yes" or x=="m" else 0)

In [14]:
one_hot_encoded_data = pd.get_dummies(one_hot_encoded_data)

In [15]:
one_hot_encoded_data.head()

Unnamed: 0,Ethnicity_Hispanic,Ethnicity_Latino,Ethnicity_Native Indian,Ethnicity_Others,Ethnicity_Pacifica,Ethnicity_White European,Ethnicity_asian,Ethnicity_black,Ethnicity_middle eastern,Ethnicity_mixed,Ethnicity_south asian,Who completed the test_Health Care Professional,Who completed the test_Health care professional,Who completed the test_Others,Who completed the test_Self,Who completed the test_family member
0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1


In [16]:
final_data = pd.concat([org_data, label_data, one_hot_encoded_data], axis=1)

In [17]:
final_data.columns

Index(['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'Age_Mons',
       'Sex', 'Jaundice', 'Family_mem_with_ASD', 'Class/ASD Traits ',
       'Ethnicity_Hispanic', 'Ethnicity_Latino', 'Ethnicity_Native Indian',
       'Ethnicity_Others', 'Ethnicity_Pacifica', 'Ethnicity_White European',
       'Ethnicity_asian', 'Ethnicity_black', 'Ethnicity_middle eastern',
       'Ethnicity_mixed', 'Ethnicity_south asian',
       'Who completed the test_Health Care Professional',
       'Who completed the test_Health care professional',
       'Who completed the test_Others', 'Who completed the test_Self',
       'Who completed the test_family member'],
      dtype='object')

In [18]:
X = final_data.drop("Class/ASD Traits ", axis=1)
y = final_data["Class/ASD Traits "]

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=101)

In [20]:
X_train.shape

(706, 30)

In [21]:
X_test.shape

(348, 30)

In [22]:
y_train.head()

975     1
996     0
958     0
1032    1
473     0
Name: Class/ASD Traits , dtype: int64

In [23]:
y_train.unique()
y_test.unique()

array([1, 0], dtype=int64)

In [35]:
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
models.append(("Random Forest", RandomForestClassifier()))
models.append(("XGB", XGBClassifier()))

for name, model in models:
    model.fit(X_train, y_train)
    pred = model.predict(X_test).astype(int)
    print(name, accuracy_score(y_test, pred))

LR 1.0
LDA 0.9482758620689655
KNN 0.9022988505747126
CART 0.8850574712643678
NB 0.6695402298850575
SVM 0.7614942528735632
Random Forest 0.9511494252873564
XGB 0.9741379310344828


In [36]:
from sklearn.ensemble import VotingClassifier

In [37]:
classifier = VotingClassifier(models, voting="hard")

In [38]:
classifier.named_estimators

{'LR': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False),
 'LDA': LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
                            solver='svd', store_covariance=False, tol=0.0001),
 'KNN': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                      metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                      weights='uniform'),
 'CART': DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=None, max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=

In [39]:
classifier.fit(X_train, y_train)

VotingClassifier(estimators=[('LR',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('LDA',
                              LinearDiscriminantAnalysis(n_components=None,
                                                         priors=None,
                                                         shrinkage=None...
                                

In [40]:
pred = classifier.predict(X_test)

In [41]:
accuracy_score(y_test, pred)

0.9597701149425287

In [44]:
import pickle
file = "model.sav"
pickle.dump(classifier, open(file, "wb"))

In [47]:
loaded = pickle.load(open("model.sav", "rb"))