<a href="https://colab.research.google.com/github/DMShino/DFEDATA5-Project/blob/main/DManning_DFEData5_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Autism Spectrum Disorder (ADS) is a development disorder that can greatly impact the lives of those who it affects, however it can often go undiagnosed, leading to many difficulties for those who are unaware they are affected by it.
This dataset comes from a mobile app used for screening (www.asdtests.com) which gathers data from users by having them answer 10 behavioural questions and providing responses for 10 other individual characteristics. This data is then to be analysed to help determine which traits and characteristics are most prevelant to increase the likelihood of providing accurate, and earlier diagnoses.

In [6]:
#import pandas and numpy modules
import pandas as pd
import numpy as np

#read the dataset from the csv file and assign it to a variable
data = pd.read_csv("https://raw.githubusercontent.com/DMShino/DFEDATA5-Project/main/Autism_Data.csv")
#display the dataset
data

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD
0,1,1,1,1,0,0,1,1,0,0,26,f,White-European,no,no,'United States',no,6,'18 and more',Self,NO
1,1,1,0,1,0,0,0,1,0,1,24,m,Latino,no,yes,Brazil,no,5,'18 and more',Self,NO
2,1,1,0,1,1,0,1,1,1,1,27,m,Latino,yes,yes,Spain,no,8,'18 and more',Parent,YES
3,1,1,0,1,0,0,1,1,0,1,35,f,White-European,no,yes,'United States',no,6,'18 and more',Self,NO
4,1,0,0,0,0,0,0,1,0,0,40,f,?,no,no,Egypt,no,2,'18 and more',?,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
699,0,1,0,1,1,0,1,1,1,1,25,f,White-European,no,no,Russia,no,7,'18 and more',Self,YES
700,1,0,0,0,0,0,0,1,0,1,34,m,Hispanic,no,no,Mexico,no,3,'18 and more',Parent,NO
701,1,0,1,1,1,0,1,1,0,1,24,f,?,no,no,Russia,no,7,'18 and more',?,YES
702,1,0,0,1,1,0,1,0,1,1,35,m,'South Asian',no,no,Pakistan,no,6,'18 and more',Self,NO


I will use the data from this dataset to answer the following questions using machine learning:

Does the presence of a close family member having ASD correlate to a diagnosis of ASD in the user?

Does the presence of a jaundice trait correlate to diagnosis of ASD in the user?

Do those who score higher in the behavioural questions have a higher likelihood of having been diagnosed with ASD?

In [7]:
#Cleaning of data by correcting column title spelling
data = data.rename(columns={"jundice":"jaundice","austim":"autism","contry_of_res":"country_of_res","Class/ASD":"ASD"})

In [8]:
#Dropping the following columns as they contain information that is either not usable for analysis based on the dataset (A1-10 score, as the questions themselves are not defined)
#or are not applicable to the study (has used app before, or age description, relation)
data.drop(["used_app_before", "A1_Score", "A2_Score", "A3_Score", "A4_Score", "A5_Score", "A6_Score", "A7_Score", "A8_Score", "A9_Score", "A10_Score", "age_desc", "relation"], axis=1, inplace=True)

In [9]:
data = data[['age','gender','ethnicity','country_of_res','jaundice','autism','result','ASD']]

In [10]:
#check the data after the columns have been renamed or removed
data.head()

Unnamed: 0,age,gender,ethnicity,country_of_res,jaundice,autism,result,ASD
0,26,f,White-European,'United States',no,no,6,NO
1,24,m,Latino,Brazil,no,yes,5,NO
2,27,m,Latino,Spain,yes,yes,8,YES
3,35,f,White-European,'United States',no,yes,6,NO
4,40,f,?,Egypt,no,no,2,NO


In [11]:
#As the dataset uses '?' to represent missing data, replace it with null value, and convert yes/no values to boolean values
data = data.replace({'?': np.nan, 'no': 0, 'NO': 0, 'yes': 1, 'YES': 1})
data.head()

Unnamed: 0,age,gender,ethnicity,country_of_res,jaundice,autism,result,ASD
0,26,f,White-European,'United States',0,0,6,0
1,24,m,Latino,Brazil,0,1,5,0
2,27,m,Latino,Spain,1,1,8,1
3,35,f,White-European,'United States',0,1,6,0
4,40,f,,Egypt,0,0,2,0


In [12]:
#View the types of data stored and check for null values in key columns
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 704 entries, 0 to 703
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             702 non-null    object
 1   gender          704 non-null    object
 2   ethnicity       609 non-null    object
 3   country_of_res  704 non-null    object
 4   jaundice        704 non-null    int64 
 5   autism          704 non-null    int64 
 6   result          704 non-null    int64 
 7   ASD             704 non-null    int64 
dtypes: int64(4), object(4)
memory usage: 44.1+ KB


In [13]:
#As age is a value usable in other comparisons or questions and has null values, remove the responses that don't include that data
data = data.dropna(subset=['age'])
data = data.dropna(subset=['ethnicity'])
#The age column is still trated as an object based column due to the previous non-numeric values, convert it to integer so that statistical
#information can be retrieved from the data and confirm the change
data = data.astype({"age": int})
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 609 entries, 0 to 703
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             609 non-null    int64 
 1   gender          609 non-null    object
 2   ethnicity       609 non-null    object
 3   country_of_res  609 non-null    object
 4   jaundice        609 non-null    int64 
 5   autism          609 non-null    int64 
 6   result          609 non-null    int64 
 7   ASD             609 non-null    int64 
dtypes: int64(5), object(3)
memory usage: 42.8+ KB


In [14]:
#Some age responses included data that is outside of usable range, so limit it to those aged between 1-100 to avoid negatively impacting statistics
data = data.drop(data[(data.age > 100) | (data.age < 1)].index)
data

Unnamed: 0,age,gender,ethnicity,country_of_res,jaundice,autism,result,ASD
0,26,f,White-European,'United States',0,0,6,0
1,24,m,Latino,Brazil,0,1,5,0
2,27,m,Latino,Spain,1,1,8,1
3,35,f,White-European,'United States',0,1,6,0
5,36,m,Others,'United States',1,0,9,1
...,...,...,...,...,...,...,...,...
698,27,f,Pasifika,Australia,0,0,10,1
699,25,f,White-European,Russia,0,0,7,1
700,34,m,Hispanic,Mexico,0,0,3,0
702,35,m,'South Asian',Pakistan,0,0,6,0


In [15]:
#Retrieve the statistics of the numerical data
data.describe()

Unnamed: 0,age,jaundice,autism,result,ASD
count,608.0,608.0,608.0,608.0,608.0
mean,29.634868,0.097039,0.139803,5.083882,0.296053
std,9.69408,0.296255,0.347068,2.519356,0.45689
min,17.0,0.0,0.0,0.0,0.0
25%,22.0,0.0,0.0,3.0,0.0
50%,27.0,0.0,0.0,5.0,0.0
75%,35.0,0.0,0.0,7.0,1.0
max,64.0,1.0,1.0,10.0,1.0


In [56]:
#comparing ASD diagnosis vs Jaundice trait & Family member diagnosis
data['jm'] = (data['jaundice'].ne(data['ASD'])).ne(1)
data['fm'] = (data['autism'].ne(data['ASD'])).ne(1)
data = data.astype({"jm": int, "fm":int})

In [57]:
data.ASD!=0
datacheck = data.drop(data[data.ASD!=1].index)
datacheck.insert(0, 'index', range(0, 0 + len(datacheck)))
datacheck.reset_index(drop=True)
datacheck.drop(["age", "gender", "ethnicity","country_of_res"], axis=1, inplace=True)


In [61]:
from sklearn.model_selection import train_test_split
final_data = datacheck.set_index('index')
y = final_data.loc[:,['fm']]
x = final_data.iloc[:, 1:12]
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.2,random_state=42)

zmean=xtrain.describe().T['mean']
zstd=xtrain.describe().T['std']
xtrain_norm=(xtrain-zmean)/zstd
xtest_norm=(xtest-zmean)/zstd

xtrain

Unnamed: 0_level_0,autism,result,ASD,jm,fm
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
117,1,10,1,1,1
65,0,8,1,0,0
67,0,9,1,1,0
31,0,7,1,0,0
12,0,10,1,0,0
...,...,...,...,...,...
71,1,7,1,0,1
106,0,7,1,0,0
14,0,10,1,0,0
92,0,7,1,0,0


In [None]:
from sklearn.ensemble import RandomForestClassifier
model=RandomForestClassifier()