## **IMPORT LIBRARY**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model

## **LOAD DATA**

In [None]:
# Read the screening dataset into `df` and preview its first rows.
# NOTE(review): absolute path, presumably a Colab upload location; adjust when running elsewhere.
DATA_PATH = "/content/autism_screening.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD
0,1,1,1,1,0,0,1,1,0,0,...,f,White-European,no,no,United States,no,6.0,18 and more,Self,NO
1,1,1,0,1,0,0,0,1,0,1,...,m,Latino,no,yes,Brazil,no,5.0,18 and more,Self,NO
2,1,1,0,1,1,0,1,1,1,1,...,m,Latino,yes,yes,Spain,no,8.0,18 and more,Parent,YES
3,1,1,0,1,0,0,1,1,0,1,...,f,White-European,no,yes,United States,no,6.0,18 and more,Self,NO
4,1,0,0,0,0,0,0,1,0,0,...,f,?,no,no,Egypt,no,2.0,18 and more,?,NO


In [None]:
# Report the (rows, columns) dimensions of the loaded frame.
frame_shape = df.shape
print(f'Shape of dataframe is: {frame_shape}')

Shape of dataframe is: (704, 21)


## **ANALISA STATISTIK**

In [None]:
# Summary statistics of `df`; the cell's last expression is rendered as a table.
summary_stats = df.describe()
summary_stats

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,result
count,704.0,704.0,704.0,704.0,704.0,704.0,704.0,704.0,704.0,704.0,702.0,704.0
mean,0.721591,0.453125,0.457386,0.495739,0.49858,0.284091,0.417614,0.649148,0.323864,0.573864,29.698006,4.875
std,0.448535,0.498152,0.498535,0.500337,0.500353,0.451301,0.493516,0.477576,0.468281,0.494866,16.507465,2.501493
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,3.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,27.0,4.0
75%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,35.0,7.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,383.0,10.0


## **PREPROCESSING**

Check missing Values

In [None]:
# Count null entries per column and render the counts with in-cell bars.
missing_counts = df.isnull().sum().to_frame(name="Missing Values")
missing_counts.style.bar(color="#84A9AC")

Unnamed: 0,Missing Values
A1_Score,0
A2_Score,0
A3_Score,0
A4_Score,0
A5_Score,0
A6_Score,0
A7_Score,0
A8_Score,0
A9_Score,0
A10_Score,0


Cek rentang nilai kolom age (untuk mendeteksi outlier)

In [None]:
# Show the extremes of the `age` column to spot implausible values.
oldest = df['age'].max()
youngest = df['age'].min()

print(f"Maximum age is data: {oldest}\n")
print(f"Minimum age is data: {youngest}")

Maximum age is data: 383.0

Minimum age is data: 17.0


Remove outlier dan Change missing values

In [None]:
# Remove records with an implausible age by value instead of by a hard-coded
# row label. The age check above reports a maximum of 383; the original cell
# dropped row label 52, presumably that record.
# TODO confirm it is the only age above the bound.
# Rows with a missing age are kept here so they can be imputed below.
MAX_PLAUSIBLE_AGE = 120
has_plausible_age = df['age'].isna() | (df['age'] <= MAX_PLAUSIBLE_AGE)

# Renumber the rows. On the first run the old labels are preserved in an
# 'index' column (a later cell drops it). If that column already exists the old
# labels are discarded instead, so re-running this cell does not fail.
df = df[has_plausible_age].reset_index(drop = 'index' in df.columns)

# Impute missing ages with the rounded mean of the remaining ages.
df['age'] = df['age'].fillna(np.round(df['age'].mean(), 0))

# Re-check the per-column null counts after imputation.
pd.DataFrame(df.isnull().sum(), columns=["Missing Values"])

Unnamed: 0,Missing Values
index,0
A1_Score,0
A2_Score,0
A3_Score,0
A4_Score,0
A5_Score,0
A6_Score,0
A7_Score,0
A8_Score,0
A9_Score,0


Mengecek Unique Values pada Data yang bertipe Object


In [None]:
# Print the distinct values of every object-typed column, one section per column.
separator = "-------------------------------"
object_columns = df.select_dtypes(include='O').columns

for col in object_columns:
    distinct_values = df[col].unique()
    print(separator)
    print(f'Column name: {col}\n')
    print(f'Unique values:\n{distinct_values}\n\n')

-------------------------------
Column name: gender

Unique values:
['f' 'm']


-------------------------------
Column name: ethnicity

Unique values:
['White-European' 'Latino' '?' 'Others' 'Black' 'Asian' 'Middle Eastern '
 'Pasifika' 'South Asian' 'Hispanic' 'Turkish' 'others']


-------------------------------
Column name: jundice

Unique values:
['no' 'yes']


-------------------------------
Column name: austim

Unique values:
['no' 'yes']


-------------------------------
Column name: contry_of_res

Unique values:
['United States' 'Brazil' 'Spain' 'Egypt' 'New Zealand' 'Bahamas'
 'Burundi' 'Austria' 'Argentina' 'Jordan' 'Ireland' 'United Arab Emirates'
 'Afghanistan' 'Lebanon' 'United Kingdom' 'South Africa' 'Italy'
 'Pakistan' 'Bangladesh' 'Chile' 'France' 'China' 'Australia' 'Canada'
 'Saudi Arabia' 'Netherlands' 'Romania' 'Sweden' 'Tonga' 'Oman' 'India'
 'Philippines' 'Sri Lanka' 'Sierra Leone' 'Ethiopia' 'Viet Nam' 'Iran'
 'Costa Rica' 'Germany' 'Mexico' 'Russia' 'Armenia' 'I

Mengganti Unique Values pada Kolom Ethnicity

In [None]:
# Fold the '?' placeholder and the lower-case 'others' spelling into 'Others'.
# NOTE(review): 'Middle Eastern ' (trailing space) is left untouched, as in the original.
ethnicity_fixes = {'?': 'Others', 'others': 'Others'}
df['ethnicity'] = df['ethnicity'].replace(ethnicity_fixes)

df['ethnicity'].unique()

array(['White-European', 'Latino', 'Others', 'Black', 'Asian',
       'Middle Eastern ', 'Pasifika', 'South Asian', 'Hispanic',
       'Turkish'], dtype=object)

Mengganti Unique Values pada Kolom Relation

In [None]:
# Replace the '?' placeholder in `relation` with the most frequent real category.
# The mode is taken over the non-placeholder rows only, so '?' can never be
# selected as its own replacement.
known_relation = df.loc[df['relation'] != '?', 'relation']
df['relation'] = df['relation'].replace('?', known_relation.mode()[0])
df['relation'].unique()

array(['Self', 'Parent', 'Health care professional', 'Relative', 'Others'],
      dtype=object)

## **VISUALISASI DATA YANG SUDAH DI PREPROCESSING**

In [None]:
# Histogram of record counts per gender value.
gender_layout = dict(
    title="<b>Counts of Male and Female</b>",
    title_x=0.5,
    title_font=dict(size=20),
    uniformtext_minsize=15,
)

fig = px.histogram(
    df,
    x="gender",
    template='plotly_dark',
    color_discrete_sequence=["#84A9AC"],
)
fig.update_layout(**gender_layout)
fig.show()

In [None]:
# Number of records per target label.
class_counts = df['Class/ASD'].value_counts()
class_counts

NO     514
YES    189
Name: Class/ASD, dtype: int64

In [None]:
# Count ASD-positive records per country of residence.
# The count column is named explicitly instead of renaming 'contry_of_res':
# since pandas 2.0 value_counts() names its result 'count', so the old
# column rename silently did nothing there. The index name is cleared so the
# frame looks the same on every pandas version.
asd_country_counts = df.loc[df['Class/ASD'] == "YES", 'contry_of_res'].value_counts()
asd_patients_country_wise = (
    asd_country_counts
    .rename("ASD_Patient_Counts")
    .rename_axis(None)
    .to_frame()
)
asd_patients_country_wise.style.bar(color="#84A9AC")

Unnamed: 0,ASD_Patient_Counts
United States,53
United Kingdom,29
New Zealand,15
Australia,12
Canada,10
India,6
France,5
Brazil,5
Malaysia,4
Mexico,4


In [None]:
# Bar chart of the per-country ASD counts, using the frame's index on the x axis.
title_settings = {
    'text': "<b>Counts of ASD Patients Country Wise</b>",
    'y': 0.93,
    'x': 0.5,
    'xanchor': 'center',
    'yanchor': 'top',
}

fig = px.bar(
    asd_patients_country_wise,
    x=asd_patients_country_wise.index,
    y="ASD_Patient_Counts",
    labels={"index": "Country"},
    color_discrete_sequence=px.colors.qualitative.D3_r,
    template='plotly_dark',
)
fig.update_xaxes(tickangle=310)
fig.update_layout(title=title_settings)
fig.show()

## **MODELING DAN DATA TRAINING**

Drop kolom yang tidak penting

In [None]:
# Drop the helper 'index' column and 'age_desc' before building the feature matrix.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps the cell
# re-runnable: a second execution no longer raises KeyError for columns that
# are already gone.
df = df.drop(columns = ['index', 'age_desc'], errors = 'ignore')

Memisahkan variable dependent dan independent

In [None]:
# Target: the 'Class/ASD' label. Features: every remaining column.
# NOTE(review): in the rows shown by df.head(), `result` equals the sum of
# A1_Score..A10_Score and it stays in X; confirm whether this leaks the label.
Y = df['Class/ASD']
X = df.drop(columns="Class/ASD")

In [None]:
# One-hot encode the categorical features and the target.
# dtype is pinned to uint8 (the default before pandas 2.0). Newer pandas emits
# bool dummy columns, and a frame mixing bool with numeric columns can convert
# to an object-dtype array that Keras cannot turn into a tensor.
X = pd.get_dummies(X, dtype = "uint8")
Y = pd.get_dummies(Y, dtype = "uint8")

Train Test Split

In [None]:
# Hold out 25% of the rows for testing.
# random_state makes the split (and every metric derived from it) reproducible.
# stratify keeps the class ratio the same in both partitions, since the class
# counts shown above are imbalanced.
SEED = 42
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size = 0.25, random_state = SEED, stratify = Y)
print(f"Shape of X_train is: {X_train.shape}")
print(f"Shape of Y_train is: {Y_train.shape}\n")
print(f"Shape of X_test is: {X_test.shape}")
print(f"Shape of Y_test is: {Y_test.shape}")

Shape of X_train is: (527, 102)
Shape of Y_train is: (527, 2)

Shape of X_test is: (176, 102)
Shape of Y_test is: (176, 2)


Membuat Model

In [None]:
# Small fully connected classifier over the encoded feature matrix.
input_dim = X.shape[1]
model = Sequential()
model.add(Dense(8, input_dim = input_dim, kernel_initializer='normal', activation='relu'))
model.add(Dense(5, activation = "relu", kernel_initializer='normal'))
# The target is one-hot encoded over two mutually exclusive classes, so the
# output layer uses softmax (the two probabilities sum to 1) rather than two
# independent sigmoids.
model.add(Dense(2, activation = 'softmax'))

# categorical_crossentropy is the loss that pairs with a softmax output and
# one-hot labels. With it, 'accuracy' is computed on the arg-max class, which
# matches how predictions are decoded at evaluation time.
model.compile(optimizer = Adam(learning_rate = 0.001),
              loss = 'categorical_crossentropy',
              metrics = ['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 8)                 824       
                                                                 
 dense_4 (Dense)             (None, 5)                 45        
                                                                 
 dense_5 (Dense)             (None, 2)                 12        
                                                                 
Total params: 881
Trainable params: 881
Non-trainable params: 0
_________________________________________________________________


Training Data

In [None]:
# Train on the training split; the returned object is kept because its
# `.history` is read afterwards.
EPOCHS = 20
BATCH_SIZE = 10
result = model.fit(X_train, Y_train, epochs=EPOCHS, batch_size=BATCH_SIZE)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Tabulate per-epoch training accuracy and loss (epochs numbered from 1).
history = result.history
acc = history['accuracy']
loss = history['loss']
epoch = list(range(1, len(acc) + 1))

acc_loss_df = pd.DataFrame({
    "Accuracy": acc,
    "Loss": loss,
    "Epoch": epoch,
})

acc_loss_df.style.bar(subset=['Accuracy', 'Loss'], color='#84A9AC')

Unnamed: 0,Accuracy,Loss,Epoch
0,0.736243,0.651891,1
1,0.736243,0.596549,2
2,0.736243,0.550868,3
3,0.768501,0.487653,4
4,0.836812,0.422059,5
5,0.891841,0.358073,6
6,0.903226,0.309421,7
7,0.929791,0.266719,8
8,0.937381,0.234079,9
9,0.946869,0.21007,10


## **TEST RESULT AND EVALUATION**

In [None]:
# Evaluate on the held-out split; element 1 of the returned list is the accuracy metric.
test_results = model.evaluate(X_test, Y_test)
test_accuracy = test_results[1]
print(f"The model test accuracy is {test_accuracy}.")

The model test accuracy is 0.9147727489471436.


In [None]:
# Decode each prediction to the index of its highest-scoring output unit and
# compare it with the 'YES' indicator column of the test target.
# NOTE(review): this assumes output index 1 corresponds to the 'YES' column; confirm the column order of Y.
class_scores = model.predict(X_test)
prediction = class_scores.argmax(axis=1)
print(classification_report(Y_test[['YES']], prediction))

              precision    recall  f1-score   support

           0       0.92      0.96      0.94       126
           1       0.89      0.80      0.84        50

    accuracy                           0.91       176
   macro avg       0.91      0.88      0.89       176
weighted avg       0.91      0.91      0.91       176

