# 📊 Exploratory Data Analysis Report

This notebook summarizes key data insights extracted from various exploratory notebooks that support the ASD classification and risk prediction pipeline.

## 🎂 Age Distribution
The dataset is heavily skewed toward younger individuals (ages 2–10), which is consistent with early autism screening practices.

In [None]:

import os
original_dir = os.getcwd()
if os.path.basename(original_dir) == "DataCharts":
    os.chdir(os.path.dirname(original_dir))
%run dataInfo.ipynb
cleanTrain = train_df


# Create a histogram using the age column and the class/asd column
import matplotlib.pyplot as plt
import seaborn as sns


cleanTrain.head()
plt.figure(figsize=(10, 6))
sns.histplot(data=cleanTrain, x='age', hue='Class/ASD', multiple='stack', kde=False, legend=True, bins=30)
plt.title('Age Distribution by Class/ASD')
plt.xlabel('Age')
plt.ylabel('Count')
plt.legend(title='Class/ASD', labels=['yes', 'no'])
plt.show()




## 🧠 Age vs ASD Classification
We examined how classification rates vary with age, restricting the chart below to respondents older than 8 and younger than 18, and found no strong correlation between age and ASD classification.

In [None]:

import os
original_dir = os.getcwd()
if os.path.basename(original_dir) == "DataCharts":
    os.chdir(os.path.dirname(original_dir))
%run dataInfo.ipynb

import matplotlib.pyplot as plt
cleanTrain = train_df

cleanTrain = cleanTrain[(cleanTrain['age'] > 8) & (cleanTrain['age'] < 18)]
cleanTrain['age'] = cleanTrain['age'].round(0).astype(int)

age_class_counts = cleanTrain.groupby(['age', 'Class/ASD']).size().unstack(fill_value=0)
age_class_percentages = age_class_counts.div(age_class_counts.sum(axis=1), axis=0) * 100

plt.figure(figsize=(10, 6))

ax = age_class_percentages.plot(kind='bar', stacked=True, figsize=(12, 6), color=['blue', 'green'], alpha=0.7)
plt.legend(['Class/ASD = yes', 'Class/ASD = no'], title='Class/ASD')

for p in ax.patches:
    width, height = p.get_width(), p.get_height()
    x, y = p.get_xy()
    if height > 0:  
        ax.text(x + width / 2, y + height / 2, f'{height:.1f}%', ha='center', va='center', fontsize=8)

plt.xlabel('Age')
plt.ylabel('Frequency')
plt.title('Age Distribution by Class/ASD')
plt.legend()
plt.tight_layout()
plt.show()



## 🚻 Gender and ASD
Slightly more males than females are classified as ASD in this dataset, which is consistent with known prevalence trends.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Start from the full training frame, as the other sections do. Without this
# the cell silently reuses whatever `cleanTrain` the previously executed cell
# left behind (for example the age-filtered subset from the section above).
# `train_df` is expected in the notebook namespace from an earlier
# `%run dataInfo.ipynb`.
cleanTrain = train_df

# Map gender labels. Assumes gender is encoded 1 = Male / 0 = Female - TODO confirm.
gender_labels = {1: 'Male', 0: 'Female'}
# Colours are looked up by label, so they stay correct whatever order the
# series happens to be in (value_counts orders by count, groupby by key).
gender_colors = {'Male': 'blue', 'Female': 'pink'}


def _plot_gender_bars(values, title, ylabel, format_value):
    """Draw one bar chart of `values` (indexed by gender label) and show it.

    values: Series whose index holds the gender labels.
    title, ylabel: text for the chart title and the y axis.
    format_value: callable turning a bar height into the text drawn above the bar.
    """
    plt.figure(figsize=(8, 6))
    bar_colors = [gender_colors.get(label, 'gray') for label in values.index]
    ax = values.plot(kind='bar', color=bar_colors, alpha=0.7)
    plt.title(title)
    plt.xlabel('Gender')
    plt.ylabel(ylabel)
    plt.xticks(rotation=0)

    # Add values on top of bars
    for bar in ax.patches:
        plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.5,
                 format_value(bar.get_height()), ha='center', va='bottom')
    plt.show()


# Calculate counts of people with Class/ASD by gender
asd_rows = cleanTrain[cleanTrain['Class/ASD'] == 1]
gender_asd = asd_rows.groupby('gender').size()
gender_asd.index = gender_asd.index.map(gender_labels)

# Plot the first graph
_plot_gender_bars(gender_asd, 'Count of People with Class/ASD by Gender', 'Count',
                  lambda height: str(int(height)))

# Calculate total counts of people with Class/ASD
total_asd = asd_rows.shape[0]
# Calculate percentage of people with Class/ASD by gender
gender_asd_percentage = (gender_asd / total_asd) * 100

# Plot the second graph
_plot_gender_bars(gender_asd_percentage, 'Percentage of People with Class/ASD by Gender', 'Percentage',
                  lambda height: f'{height:.1f}%')

# Calculate counts of male and female in the data
gender_total = cleanTrain['gender'].value_counts()
gender_total.index = gender_total.index.map(gender_labels)

# Plot the third graph
_plot_gender_bars(gender_total, 'Count of Male and Female in the Data', 'Count',
                  lambda height: str(int(height)))


## 🩺 Jaundice History in ASD
An exploration of the `jaundice` feature among ASD-positive individuals shows a small proportion with reported neonatal jaundice.

In [None]:
import matplotlib.pyplot as plt

import os
original_dir = os.getcwd()
if os.path.basename(original_dir) == "DataCharts":
    os.chdir(os.path.dirname(original_dir))
%run dataInfo.ipynb

cleanTest = test_df
cleanTrain = train_df

# Filter only rows where person has ASD
asd_only = cleanTrain[cleanTrain['Class/ASD'] == 1]

# Count people with and without jaundice among ASD cases
jaundice_counts = asd_only['jaundice'].value_counts().sort_index()

# Map index to labels: 0 = No Jaundice, 1 = Jaundice
jaundice_counts.index = ['No Jaundice', 'Jaundice']

# Plot the chart
jaundice_counts.plot(kind='bar', color=['green', 'red'])
plt.title('Jaundice History in People with ASD')
plt.xlabel('Jaundice')
plt.ylabel('Number of People with ASD')
plt.grid(True, axis='y')
plt.show()

## 🌍 Ethnicity Distribution
The dataset spans multiple ethnic backgrounds with no single dominant group, helping ensure model generalizability.

In [None]:

import matplotlib.pyplot as plt

import os
original_dir = os.getcwd()
if os.path.basename(original_dir) == "DataCharts":
    os.chdir(os.path.dirname(original_dir))
%run dataInfo.ipynb

train_df = train_df[train_df['ethnicity'] != 0]

cleanTest = test_df
cleanTrain = train_df

# Amount of each ethnicity in the training data (using category labels)
ethnicity_counts = cleanTrain['ethnicity'].value_counts()

# Plot bar chart for ethnicity distribution
ethnicity_counts.plot(kind='bar', color=['skyblue', 'lightgreen', 'lightcoral', 'lightyellow', 'lightskyblue'])
plt.title('Ethnicity Distribution in Training Data')
plt.xlabel('Ethnicity')
plt.ylabel('Count')
plt.grid(True, axis='y')
plt.xticks(rotation=45)  # Rotate x-axis labels for better visibility
plt.show()

#Print the mapping
for i, eth in enumerate(ethnicity_categories):
    print(f"{eth} → {i}")

# Filter for individuals with ASD by ethnicity (using category labels)
ethnicities = cleanTrain['ethnicity'].unique()  # Get all unique ethnicity categories
asd_counts_by_ethnicity = {}

# Loop through each ethnicity and count the number of people with ASD
for ethnicity in ethnicities:
    ethnicity_asd = cleanTrain[(cleanTrain['ethnicity'] == ethnicity) & (cleanTrain['Class/ASD'] == 1)]
    asd_counts_by_ethnicity[ethnicity] = len(ethnicity_asd)

# Plot the bar chart for ASD by ethnicity (with category labels)
plt.bar(asd_counts_by_ethnicity.keys(), asd_counts_by_ethnicity.values(), color=['skyblue', 'lightgreen', 'lightcoral', 'lightyellow', 'lightskyblue'])
plt.title('ASD Distribution by Ethnicity in Training Data')
plt.ylabel('Count')
plt.xticks(rotation=45)  # Rotate x-axis labels for better visibility
plt.grid(True, axis='y')
plt.show()

# Calculate and print the percentage of ASD in each ethnicity (using category labels)
ethnicity_asd_percentage = {}
for ethnicity in ethnicities:
    total_ethnicity_count = len(cleanTrain[cleanTrain['ethnicity'] == ethnicity])
    asd_count = asd_counts_by_ethnicity.get(ethnicity, 0)
    ethnicity_asd_percentage[ethnicity] = (asd_count / total_ethnicity_count) * 100 if total_ethnicity_count != 0 else 0

# Print the percentages of ASD for each ethnicity
for ethnicity, percentage in ethnicity_asd_percentage.items():
    print(f"Percentage of {ethnicity} with ASD: {percentage:.2f}%")

for i, eth in enumerate(ethnicity_categories):
    print(f"{eth} → {i}")


## 📈 Questionnaire Score Correlations
The 10 behavioral screening questions (A1–A10) exhibit strong correlations with one another.

In [None]:


import os
original_dir = os.getcwd()
if os.path.basename(original_dir) == "DataCharts":
    os.chdir(os.path.dirname(original_dir))
%run dataInfo.ipynb
cleanTrain = train_df

import seaborn as sns
from scipy.stats import spearmanr
import matplotlib.pyplot as plt

# Select columns A1_Score through A10_Score
score_columns = ['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 
                 'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score']
scores = cleanTrain[score_columns]

# Compute Spearman correlation matrix
correlation_matrix = scores.corr(method='spearman')

# Plot heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", cbar=True)
plt.title("Correlation Heatmap (Spearman) for A1_Score through A10_Score")
plt.show()


## 🔍 Questionnaire Impact on ASD Classification
The individual scores show varying degrees of correlation with the final ASD label, with certain questions having a disproportionately high influence.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

import os
original_dir = os.getcwd()
if os.path.basename(original_dir) == "DataCharts":
    os.chdir(os.path.dirname(original_dir))
%run dataInfo.ipynb
cleanTrain = train_df

score_columns = ['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 
                 'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score']
scores = cleanTrain[score_columns]


correlation = cleanTrain[score_columns + ['Class/ASD']].corr()['Class/ASD'].drop('Class/ASD')
plt.figure(figsize=(10, 6))
ax = sns.barplot(x=correlation.index, y=correlation.values, hue=correlation.index, legend=False, palette='viridis')
plt.title('Impact of Questionnaires on Class/ASD')
plt.ylabel('Correlation with Class/ASD')
plt.xlabel('Questionnaire Score')
plt.xticks(rotation=45)

# Add percentage labels on top of the bars
for p in ax.patches:
    percentage = f'{p.get_height() * 100:.1f}%'
    ax.annotate(percentage, (p.get_x() + p.get_width() / 2., p.get_height()), 
                ha='center', va='bottom', fontsize=10, color='black')

plt.show()