In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.io import arff
from scipy.stats import norm

In [None]:
# Load the ARFF file; loadarff returns a (records, metadata) pair.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
arff_path = 'C://Users//Athira Shankar//Downloads//Autism//Autism-Adult-Data.arff'
data = arff.loadarff(arff_path)

In [None]:
# First element of the loadarff result holds the records; wrap them in a DataFrame.
df = pd.DataFrame(data=data[0])

In [None]:
# Show the frame built from the ARFF records.
df

In [None]:
# Summarise the dataset: column names, non-null counts and dtypes.
df.info()

In [None]:
# List the column labels exactly as they are spelled in the file.
df.columns

# Data Cleaning

In [None]:
#Function to convert bytes into int
def convert_to_int(name):
    """Decode the byte strings in column ``name`` of ``df`` and cast it to int.

    Only ``bytes`` values are decoded, so re-running the cell after the
    column has already been converted is a harmless no-op instead of an
    ``AttributeError`` raised by the ``.str`` accessor on an int column.

    Parameters
    ----------
    name : str
        Label of the column in the module-level ``df``; it is overwritten
        with the converted values.

    Raises
    ------
    ValueError
        If a value in the column cannot be cast to an integer.
    """
    decoded = df[name].map(lambda v: v.decode("utf-8") if isinstance(v, bytes) else v)
    df[name] = decoded.astype('int')
    

In [None]:
# Names of the ten screening-question score columns: A1_Score ... A10_Score.
column_names = ['A%d_Score' % question for question in range(1, 11)]

In [None]:
# Convert every score column from byte strings to integers.
for score_col in column_names:
    convert_to_int(score_col)

In [None]:
# Preview the first rows after the score columns were converted.
df.head()

In [None]:
#Function to convert string and boolean
def convert_to_string(name):
    """Decode the byte strings in column ``name`` of ``df`` to ``str``.

    Only ``bytes`` values are decoded; anything else (already-decoded text,
    missing values) is left untouched. This keeps the cell safe to re-run:
    calling ``.str.decode`` on values that are already ``str`` does not
    raise, it replaces them with NaN and silently wipes the column.

    Parameters
    ----------
    name : str
        Label of the column in the module-level ``df``; it is overwritten
        with the decoded values.
    """
    df[name] = df[name].map(lambda v: v.decode("utf-8") if isinstance(v, bytes) else v)

In [None]:
# Text columns to decode. Spellings such as 'jundice' and 'austim' are kept
# as-is here; those two are renamed later in the notebook.
cols = [
    'gender',
    'ethnicity',
    'jundice',
    'austim',
    'contry_of_res',
    'used_app_before',
    'age_desc',
    'relation',
    'Class/ASD',
]

In [None]:
# Decode every text column from byte strings to str.
for text_col in cols:
    convert_to_string(text_col)

In [None]:
# Preview the first ten rows after the text columns were decoded.
df.head(10)

In [None]:
# Check the Python type of a single 'result' value (the row labelled 1).
type(df.loc[1, 'result'])

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Frequency of each distinct value in 'age_desc'.
df['age_desc'].value_counts()

In [None]:
# Mean age before the rows with age > 300 are dropped further down.
df['age'].mean()

In [None]:
# Box plot of age to spot outliers. The Series is passed as x= explicitly:
# a bare positional argument is read as `x` by older seaborn releases and as
# `data` by newer ones, so the keyword keeps the plot the same across versions.
sns.boxplot(x=df['age'])

In [None]:
# Row labels of the implausible ages (greater than 300).
index_name = df.index[df['age'] > 300]

In [None]:
# Inspect the rows that are about to be removed.
df.loc[df['age'] > 300]

In [None]:
# Remove the outlier rows from df in place, addressed by their row labels.
df.drop(index=index_name, inplace=True)

In [None]:
# Box plot of age after the age > 300 rows were dropped. Passed as x= so the
# plot is the same whichever seaborn version interprets the call.
sns.boxplot(x=df['age'])

In [None]:
# Median age after the age > 300 rows were dropped.
df['age'].median()

In [None]:
# Most frequent age value(s).
df['age'].mode()

In [None]:
# Mean age after the age > 300 rows were dropped; this is the fill value used below.
df['age'].mean()

In [None]:
# Median age again (same computation as a few cells above).
df['age'].median()

In [None]:
#Filling the missing values
# Assign the filled Series back to the column. Calling fillna(inplace=True) on
# df['age'] is chained assignment: recent pandas warns about it and, with
# copy-on-write enabled, the original frame is left unchanged.
df['age'] = df['age'].fillna(df['age'].mean())

In [None]:
# Number of missing ages left after the fill.
df['age'].isna().sum()

In [None]:
# Frequency of each distinct value in 'relation'.
df['relation'].value_counts()

In [None]:
# Frequency of each distinct 'result' score.
df['result'].value_counts()

# Data Exploration

In [None]:
#Code to plot histogram
# One unit-wide bin centred on each integer 0..10. With edges 0..10 the last
# bin is closed on both sides, so a score of 10 (if present) would be stacked
# onto the bar for 9; centring also lines the bars up with the x ticks.
n, bins, patches = plt.hist(df['result'], bins=np.arange(12) - 0.5, color='#0504aa',
                            alpha=0.7, rwidth=0.85)
plt.grid(axis='y', alpha=0.75)
plt.xlabel('Resultant screening test scores')
plt.xticks(np.arange(0,11,1))
plt.ylabel('Count')
plt.title('Histogram of attribute result')
# (The tutorial annotation "mu=15, b=3" at (23, 45) was removed: it does not
# describe this data and sat far outside the plotted range.)
maxfreq = n.max()
# Set a clean upper y-axis limit.
plt.ylim(ymax=np.ceil(maxfreq / 10) * 10 if maxfreq % 10 else maxfreq + 10)


In [None]:
# Histogram of A1_Score. The data column now matches the axis label and title
# (the cell previously plotted A2_Score under an A1_Score heading).
n, bins, patches = plt.hist(df['A1_Score'], bins='auto', color='#0504aa')
plt.grid(axis='y', alpha=0.75)
plt.xlabel(' A1_Score')
plt.xticks(np.arange(0,2,1))
plt.ylabel('Count')
plt.title('Histogram of A1_Score')


In [None]:
#Pmf
# Relative frequency of each 'result' score.
probabilities=df['result'].value_counts(normalize=True)
# x/y are passed by keyword: current seaborn accepts only `data` positionally.
ax=sns.barplot(x=probabilities.index,y=probabilities.values)
ax.set(xlabel='result', ylabel='Probability')
ax.set_title("Probability Mass Function")

In [None]:
# Relative frequency of each A1_Score value.
probabilities=df['A1_Score'].value_counts(normalize=True)
# x/y are passed by keyword: current seaborn accepts only `data` positionally.
ax=sns.barplot(x=probabilities.index,y=probabilities.values)
ax.set(xlabel='A1_score', ylabel='Probability')
ax.set_title("Probability Mass Function")
# Narrow the bars to the widths listed here, keeping each one centred where it was drawn.
widthbars = [0.2, 0.2]
for bar, newwidth in zip(ax.patches, widthbars):
    x = bar.get_x()
    width = bar.get_width()
    centre = x + width/2.
    bar.set_x(centre - newwidth/2.)
    bar.set_width(newwidth)

In [None]:
# Mean of the 'result' score.
df['result'].mean()

In [None]:
# Most frequent 'result' score(s).
df['result'].mode()

In [None]:
# Standard deviation of the 'result' score.
df['result'].std()

In [None]:
# Skewness of the 'result' distribution.
df['result'].skew()

In [None]:
# Kurtosis of the 'result' distribution.
df['result'].kurtosis()

In [None]:
# Fit a normal distribution to the 'result' scores and overlay its PDF on a
# density histogram. numpy, pyplot and scipy.stats.norm come from the
# notebook's import cell.
scores = df['result']
mu, std = norm.fit(scores)

# Plot the histogram; density=True puts it on the same scale as the PDF.
# (The call was previously written `plt.(`, which is a syntax error.)
plt.hist(scores, bins=25, density=True, alpha=0.6, color='g')

# Plot the PDF across the x-range chosen for the histogram.
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mu, std)
plt.plot(x, p, 'k', linewidth=2)
title = "Fit results: mu = %.2f,  std = %.2f" % (mu, std)
plt.title(title)

plt.show()

In [None]:
# Gather the ten question scores plus the total 'result' into one frame for
# the scatter matrix. A single column selection replaces the twelve
# copy-pasted one-column cells; .copy() keeps temp independent of df.
score_columns = ['A%d_Score' % question for question in range(1, 11)]
temp = df[score_columns + ['result']].copy()

In [None]:
#Code for scatter plot matrix
# scatter_matrix is reached through pd.plotting because the bare name was
# never imported; np is used directly rather than through pyplot's namespace.
axes = pd.plotting.scatter_matrix(temp, alpha=0.2, figsize=(20, 20), diagonal='kde')
corr = temp.corr().values
# Write the pairwise correlation into each panel above the diagonal.
for i, j in zip(*np.triu_indices_from(axes, k=1)):
    axes[i, j].annotate("%.3f" % corr[i, j], (0.8, 0.8), xycoords='axes fraction', ha='center', va='center')
plt.show()

In [None]:
# Count of each 'result' score split by gender. x is passed by keyword:
# current seaborn accepts only `data` positionally, so the positional form
# clashes with data=df.
sns.countplot(x='result', hue='gender', data=df)

In [None]:
# Count of each 'result' score split by the 'Autism' column. That column is
# called 'austim' until the rename cell further down, so the rename is applied
# on the fly here (a no-op once the column is already renamed); this lets the
# cell run in notebook order. x is passed by keyword for current seaborn.
ax = sns.countplot(x='result', hue='Autism', data=df.rename(columns={'austim': 'Autism'}))
ax.set_title("Result distribution with hue of attribute Autism")

In [None]:
# Count of each 'result' score split by the 'Jaundice' column. That column is
# called 'jundice' until the rename cell further down, so the rename is applied
# on the fly here (a no-op once the column is already renamed); this lets the
# cell run in notebook order. x is passed by keyword for current seaborn.
ax = sns.countplot(x='result', hue='Jaundice', data=df.rename(columns={'jundice': 'Jaundice'}))
ax.set_title("The result distribution with hue of Jaundice")

In [None]:
# Count of each 'result' score split by ethnicity. x is passed by keyword:
# current seaborn accepts only `data` positionally, so the positional form
# clashes with data=df.
sns.countplot(x='result', hue='ethnicity', data=df)

In [None]:
# Give the two misspelled columns readable names (modifies df in place).
readable_names = {"jundice": "Jaundice", "austim": "Autism"}
df.rename(columns=readable_names, inplace=True)

In [None]:
# Mean 'result' score for each 'Jaundice' value, as a two-column frame.
df1 = df.groupby('Jaundice')['result'].mean().reset_index()

In [None]:
# Show the per-'Jaundice' mean scores.
df1

In [None]:
# Bar chart of the mean 'result' score per 'Jaundice' value.
sns.barplot(data=df1, x='Jaundice', y='result')

In [None]:
# Share of rows for each 'Jaundice' value (proportions, not counts).
df['Jaundice'].value_counts(normalize=True)

In [None]:
#Code for heatmap
# Correlate the numeric columns only: recent pandas raises when corr() meets
# the decoded text columns instead of silently skipping them.
corr_matrix = df.select_dtypes(include='number').corr()

# Mask the upper triangle including the diagonal, since the matrix is
# symmetric. The builtin bool replaces np.bool, which NumPy has removed.
mask=np.triu(np.ones_like(corr_matrix,dtype=bool))

plt.figure(figsize=(20,10))
sns.heatmap(corr_matrix,annot=True, fmt=".3f",
            vmin=-1, vmax=1, linewidth = 1,
            center=0, mask=mask,cmap="RdBu_r");