In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the dataset's input folder and echo the full path of every file found.
input_root = '/kaggle/input/autism-screening-for-toddlers'
for folder, _subdirs, file_names in os.walk(input_root):
    for full_path in (os.path.join(folder, name) for name in file_names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

COLLECTING DATA

In [None]:
import pandas as pd

# Read the combined screening CSV into `df` and preview its first rows.
combined_csv_path = '/kaggle/input/autism-screening-for-toddlers/Autism_Screening_Data_Combined.csv'
df = pd.read_csv(combined_csv_path)
df.head()


In [None]:
import pandas as pd

# Read the toddler screening CSV into `df` (replacing whatever `df` held before)
# and preview its first rows.
toddler_csv_path = '/kaggle/input/autism-screening-for-toddlers/Toddler Autism dataset July 2018.csv'
df = pd.read_csv(toddler_csv_path)
df.head()

CLEANING DATA

In [None]:
# Per-column tally of missing entries (isna is the canonical name for isnull).
df.isna().sum()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Draw the null mask as an image: one row per record, one column per feature.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(df.isnull(), cbar=False, cmap='viridis', ax=ax)
ax.set_title("Missing values heatmap")
plt.show()


DUPLICATE

In [None]:
# Number of rows that repeat an earlier row across every column.
df.duplicated().sum()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Box plot of the age column to eyeball its spread and any extreme values.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x=df['Age_Mons'], ax=ax)  # swap in any other numeric column here
ax.set_title("Boxplot for Age_Mons")
plt.show()


OUTLIERS

In [None]:
# Column inspected for outliers
col = 'Age_Mons'

# First and third quartile, and the inter-quartile range between them
Q1, Q3 = (df[col].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# Fences placed 1.5 * IQR outside the quartiles
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Select (not remove) the rows that fall outside either fence
below_fence = df[col].lt(lower_bound)
above_fence = df[col].gt(upper_bound)
outliers = df[below_fence | above_fence]
print(f"Outliers in '{col}': {len(outliers)} rows")


DATA ENCODING

In [None]:
def _encode_binary(series, mapping):
    """Map a two-valued text column to integer codes, ignoring case and padding.

    Parameters
    ----------
    series : pandas.Series
        Column to encode.
    mapping : dict
        Lower-case label -> integer code.

    Returns
    -------
    pandas.Series
        The encoded column. Labels that are not keys of ``mapping`` become NaN.
        A column that is already numeric is returned untouched, so re-running
        this cell does not wipe a previously encoded column.
    """
    if pd.api.types.is_numeric_dtype(series):
        return series
    normalized = series.astype(str).str.strip().str.lower()
    return normalized.map(mapping)


YES_NO_CODES = {'yes': 1, 'no': 0}
# Accept the spelled-out and the single-letter labels for sex.
# NOTE(review): the raw CSV appears to use 'm'/'f' rather than 'Male'/'Female';
# confirm with df['Sex'].unique() before relying on this column.
SEX_CODES = {'male': 1, 'm': 1, 'female': 0, 'f': 0}

df['Jaundice'] = _encode_binary(df['Jaundice'], YES_NO_CODES)
df['Family_mem_with_ASD'] = _encode_binary(df['Family_mem_with_ASD'], YES_NO_CODES)
df['Sex'] = _encode_binary(df['Sex'], SEX_CODES)


In [None]:
# Show the column headers as a list of quoted strings (makes stray whitespace visible).
print(df.columns.tolist())


In [None]:
# Trim surrounding whitespace from every header, then delete embedded newlines.
trimmed_headers = df.columns.str.strip()
df.columns = trimmed_headers.str.replace('\n', '')


In [None]:
# Print the column headers again to check the result of the cleanup.
print(df.columns.tolist())


In [None]:
# Normalise the label text (string, trimmed, lower-case) and then code it as
# yes -> 1, no -> 0; any other value ends up as NaN.
target_col = 'Class/ASD Traits'
normalized_target = df[target_col].astype(str).str.strip().str.lower()
df[target_col] = normalized_target.map({'yes': 1, 'no': 0})


In [None]:
# Distinct values left in the target column after encoding.
print(df['Class/ASD Traits'].unique())


FEATURE SCALING

In [None]:
from sklearn.preprocessing import StandardScaler


In [None]:
# Columns that the scaling cell below standardises.
features_to_scale = ['Age_Mons', 'Qchat-10-Score']


In [None]:
# Standardise the selected columns and write the result back into `df`, so every
# later cell that reads these columns sees the scaled values, not the raw ones.
scaler = StandardScaler().fit(df[features_to_scale])
df[features_to_scale] = scaler.transform(df[features_to_scale])


In [None]:
# Summary statistics of the columns that were just scaled.
print(df[features_to_scale].describe())


Exploratory Data Analysis (EDA)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
print(df.shape)         # (rows, columns)
df.info()               # dtypes and non-null counts; info() prints by itself and
                        # returns None, so wrapping it in print() adds a stray "None"
print(df.describe())    # statistical summary for numerical columns
print(df.head())        # preview of the first few rows


In [None]:
# Bar chart of how many rows fall into each target class.
fig, ax = plt.subplots()
sns.countplot(x='Class/ASD Traits', data=df, ax=ax)
ax.set_title("Target Class Distribution")
plt.show()


In [None]:
categorical_cols = ['Sex', 'Ethnicity', 'Jaundice', 'Family_mem_with_ASD']

# Print a header followed by the distinct values of each categorical column.
for column_name in categorical_cols:
    print(f"\nUnique values in '{column_name}':")
    distinct_values = df[column_name].unique()
    print(distinct_values)


In [None]:
numeric_cols = ['Age_Mons', 'Qchat-10-Score']

# One histogram with a KDE overlay per numeric column, each on its own figure.
for feature in numeric_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(df[feature], kde=True, ax=ax)
    ax.set_title(f'Distribution of {feature}')
    plt.show()


In [None]:
# One box plot per numeric column, each on its own figure.
for feature in numeric_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x=df[feature], ax=ax)
    ax.set_title(f'Boxplot of {feature}')
    plt.show()


In [None]:
# Pairwise correlations between every numeric-dtype column, shown as an
# annotated heatmap.
numeric_df = df.select_dtypes(include=['number'])
correlations = numeric_df.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlations, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix (Numeric Features Only)')
plt.show()


In [None]:
# Missing-value tally restricted to the numeric-dtype columns.
numeric_df = df.select_dtypes(include='number')
print(numeric_df.isna().sum())


In [None]:
# Keep only the ten item answers plus age and total score, leaving out encoded
# columns such as 'Sex'.
heatmap_columns = [f'A{item}' for item in range(1, 11)] + ['Age_Mons', 'Qchat-10-Score']
numeric_df = df[heatmap_columns]

# Annotated correlation heatmap of the selected columns
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix (Cleaned Numeric Features)')
plt.show()


Draw conclusions from visualization

In [None]:
# Row count per target class; the returned Axes is the cell's displayed value.
sns.countplot(x='Class/ASD Traits', data=df)


In [None]:
# Distribution of the Q-CHAT-10 score within each target class.
sns.boxplot(x='Class/ASD Traits', y='Qchat-10-Score', data=df)


Feature Engineering:

In [None]:
import pandas as pd

# Rebind `df` to the combined screening CSV; the frame used by the cells above is
# discarded, so none of the earlier encoding/scaling carries over.
df = pd.read_csv('/kaggle/input/autism-screening-for-toddlers/Autism_Screening_Data_Combined.csv')  # replace with your actual file name


In [None]:
# Column headers of the frame that was just loaded.
print(df.columns.tolist())


In [None]:
# Features: every column except the label.
X = df.drop(columns='Class')
# Label: 1 where Class is exactly 'YES', 0 for anything else.
y = (df['Class'] == 'YES').astype('int64')


DATA SPLITTING

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Works on the DataFrame `df` loaded in the cells above.

# Step 1: Normalise column names (trim; spaces and slashes become underscores)
df.columns = df.columns.str.strip().str.replace(" ", "_").str.replace("/", "_")

# Step 2: Rename the target column for consistency.
# Step 1 has already turned "/" into "_", so a 'Class/ASD Traits' header arrives
# here as 'Class_ASD_Traits'. The slash spelling is listed too so that a frame
# which still carries it is handled the same way.
df = df.rename(columns={'Class_ASD_Traits': 'Class', 'Class/ASD_Traits': 'Class'})

# Step 3: Drop rows with missing values (or you could impute them)
df = df.dropna()

# Step 4: Define features (X) and label (y).
# The label comparison ignores case and padding, so 'Yes' / 'YES ' also count as 1.
X = df.drop('Class', axis=1)
y = df['Class'].astype(str).str.strip().str.upper().eq('YES').astype('int64')

# Step 5: One-hot encode categorical features
X_encoded = pd.get_dummies(X, drop_first=True)

# Step 6: Split BEFORE scaling so the test rows cannot influence the scaler.
# Same test_size/random_state as before, hence the same row partition.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)

# Step 7: Fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full scaled matrix, kept under its old name for any code that still reads it
X_preprocessed = scaler.transform(X_encoded)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report


In [None]:
# Fit three baseline classifiers on the training split.

# Logistic Regression
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

# Random Forest: seeded so that repeated runs build the same forest
# (same seed as the model-comparison cell further down)
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Support Vector Machine
svm_model = SVC()
svm_model.fit(X_train, y_train)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Fit three baseline classifiers and keep each one's test-split predictions.

# Logistic Regression
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

# Random Forest: seeded so that repeated runs build the same forest
# (same seed as the model-comparison cell further down)
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Support Vector Machine
svm_model = SVC()
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)


In [None]:
# Print the per-class precision/recall/F1 table for each baseline's predictions.
for heading, predictions in (
    ("Logistic Regression:\n", y_pred_lr),
    ("Random Forest:\n", y_pred_rf),
    ("SVM:\n", y_pred_svm),
):
    print(heading, classification_report(y_test, predictions))


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix of the Random Forest predictions, drawn as an annotated heatmap
cm = confusion_matrix(y_test, y_pred_rf)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Random Forest Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()


XAI

In [None]:
import shap

# Build a SHAP explainer for the logistic-regression model, with the training
# split as background data
explainer = shap.Explainer(lr_model, X_train)

# Compute SHAP values for the testing set
shap_values = explainer.shap_values(X_test)

# X_test is the scaler's output (a bare array without column labels), so the
# one-hot column names are passed explicitly; otherwise the plot can only label
# the rows "Feature 0", "Feature 1", ...
feature_names = list(X_encoded.columns)
shap.summary_plot(shap_values, X_test, feature_names=feature_names)


MODEL SELECTION AND TRAINING

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

# Candidate models
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42)
}

# Per-model metrics and test-split predictions, keyed by model name
results = {}
predictions_by_model = {}

# Train each candidate and evaluate it on the test split
for name, candidate in models.items():
    candidate.fit(X_train, y_train)
    candidate_pred = candidate.predict(X_test)

    predictions_by_model[name] = candidate_pred
    results[name] = {
        'accuracy': accuracy_score(y_test, candidate_pred),
        'classification_report': classification_report(y_test, candidate_pred)
    }

# Print the results for each model
for name, result in results.items():
    print(f"Model: {name}")
    print(f"Accuracy: {result['accuracy']}")
    print(f"Classification Report:\n{result['classification_report']}")
    print("="*50)

# Later cells read `model` and `y_pred`. Bind them to an explicitly chosen
# candidate instead of whatever the training loop happened to visit last.
# Ties go to the candidate listed first in `models`.
# NOTE(review): the ranking uses test-split accuracy, so metrics reported for the
# winner on that same split are optimistic; rank with cross-validation on the
# training split if an unbiased estimate is needed.
best_model_name = max(results, key=lambda model_name: results[model_name]['accuracy'])
model = models[best_model_name]
y_pred = predictions_by_model[best_model_name]
accuracy = results[best_model_name]['accuracy']
print(f"Selected model: {best_model_name}")


MODEL EVALUATION

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Score the current `model` / `y_pred` pair against the held-out labels.
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# ROC AUC is computed from the positive-class probability, not the hard labels
positive_class_scores = model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, positive_class_scores)

# Confusion matrix and text report
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Scalar metrics first, then the two multi-line summaries
for label, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
    ("ROC AUC score:", roc_auc),
):
    print(label, value)
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

# Confusion matrix as a figure
disp = ConfusionMatrixDisplay(conf_matrix)
disp.plot(cmap=plt.cm.Blues)
plt.show()


In [None]:
import joblib
# Persist whatever estimator `model` currently refers to, under a relative path
# (i.e. in the current working directory).
joblib.dump(model, 'autism_model.pkl')  # Save model to file


In [None]:
import os
import shutil

# Make sure the target directory exists, then place a copy of the pickle in it.
destination_folder = '/kaggle/working/another_folder'
os.makedirs(destination_folder, exist_ok=True)

model_filename = 'autism_model.pkl'
shutil.copy(model_filename, os.path.join(destination_folder, model_filename))


In [None]:
from flask import Flask, request, jsonify
import joblib
import numpy as np

import pandas as pd

app = Flask(__name__)
# NOTE(review): joblib files are pickles and run code when loaded; only load
# artefacts produced by this notebook.
model = joblib.load('autism_model.pkl')

# Payload keys used when the loaded estimator does not record its training columns.
FALLBACK_FEATURE_KEYS = ['Age', 'Sex', 'Jaundice', 'Family_ASD']

# app.run() blocks until the server is stopped, which keeps every later cell from
# running. Set to True only when you actually want to serve from this kernel.
START_SERVER = False


def _feature_keys(estimator):
    """Return the JSON keys, in model column order, that a request must supply.

    Estimators fitted on a DataFrame expose ``feature_names_in_``; those names are
    used when present, otherwise the four fallback keys are returned.
    """
    recorded = getattr(estimator, 'feature_names_in_', None)
    if recorded is None:
        return list(FALLBACK_FEATURE_KEYS)
    return [str(column) for column in recorded]


@app.route('/predict', methods=['POST'])
def predict():
    """Score one JSON record with the loaded model.

    Request body: a JSON object holding one value per expected feature key.
    Response: ``{"prediction": <int>}`` on success; ``{"error": ...}`` with
    status 400 for an unusable payload, or 500 when the loaded model's input
    width does not match the feature keys this endpoint knows about.

    NOTE(review): values are passed to the model as given. If the model was fit
    on encoded/scaled inputs, the caller must send them in that same form.
    """
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object.'}), 400

    keys = _feature_keys(model)
    missing = [key for key in keys if key not in data]
    if missing:
        return jsonify({'error': 'Missing feature(s).', 'missing': missing}), 400

    # Fail with a readable message instead of a stack trace when the pickle was
    # trained on a different number of columns than this endpoint can provide.
    expected_width = getattr(model, 'n_features_in_', None)
    if expected_width is not None and expected_width != len(keys):
        return jsonify({
            'error': f'The loaded model expects {expected_width} features but '
                     f'this endpoint supplies {len(keys)}.'
        }), 500

    values = [data[key] for key in keys]
    if hasattr(model, 'feature_names_in_'):
        # Keep the column labels the estimator was fitted with
        features = pd.DataFrame([values], columns=keys)
    else:
        features = [values]

    try:
        prediction = model.predict(features)
    except (TypeError, ValueError) as exc:
        return jsonify({'error': f'Could not score the supplied values: {exc}'}), 400
    return jsonify({'prediction': int(prediction[0])})


if START_SERVER:
    app.run()


In [None]:
from sklearn.metrics import accuracy_score

# Step 1: Predict labels for the test split with the estimator currently bound to
# `model` (by this point that is the object reloaded from 'autism_model.pkl').
y_pred = model.predict(X_test)

# Step 2: Fraction of test rows whose predicted label equals the true label
accuracy = accuracy_score(y_test, y_pred)

# Step 3: Print the accuracy as a percentage with two decimals
print(f"✅ Model Accuracy on Test Data: {accuracy * 100:.2f}%")


In [None]:
# Remove bookkeeping columns when present; names that are absent are skipped.
df = df.drop(columns=['Who completed the test', 'Case_No'], errors='ignore')

# Correct the misspelled header
df = df.rename(columns={'Jauundice': 'Jaundice'})

# Numeric gaps: fill age with the column mean
df['Age'] = df['Age'].fillna(df['Age'].mean())

# Categorical gaps: fill with the most frequent value, when there is one
for column_name in ['Sex', 'Jaundice', 'Family_ASD']:
    if not df[column_name].isnull().any():
        continue
    most_common = df[column_name].mode().dropna()
    if most_common.empty:
        continue
    df[column_name] = df[column_name].fillna(most_common[0])


In [None]:
# Column headers after the cleanup cell above.
print(df.columns.tolist())


In [None]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import joblib
import shutil

# Step 2: Load the dataset
df = pd.read_csv('/kaggle/input/autism-screening-for-toddlers/Autism_Screening_Data_Combined.csv')

# Step 3: Drop irrelevant columns (names that are absent are skipped)
df = df.drop(columns=['Who completed the test', 'Case_No'], errors='ignore')

# Step 4: Fill missing values (mean for age, most frequent value for categoricals)
# 'Jauundice' is the header's spelling in this CSV and is used as-is.
df['Age'] = df['Age'].fillna(df['Age'].mean())
for col in ['Sex', 'Jauundice', 'Family_ASD']:
    if df[col].isnull().any():
        mode_value = df[col].mode().dropna()
        if not mode_value.empty:
            df[col] = df[col].fillna(mode_value.iloc[0])

# Step 5: Encode categorical columns.
# One encoder per column, kept in `label_encoders`, so every column's
# label <-> code mapping stays available (a single shared encoder only remembers
# the last column it was fitted on).
label_encoders = {}
for col in ['Sex', 'Jauundice', 'Family_ASD']:
    label_enc = LabelEncoder()
    df[col] = label_enc.fit_transform(df[col])
    label_encoders[col] = label_enc

# Step 6: Prepare features and labels
X = df.drop(columns=['Class'])
y = df['Class'].apply(lambda x: 1 if x == 'YES' else 0)  # Binary encoding

# Step 7: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 8: Train model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Step 9: Check the model on the held-out rows before persisting it
# (for a classifier, score() is the mean accuracy)
test_accuracy = model.score(X_test, y_test)
print(f"Hold-out accuracy: {test_accuracy * 100:.2f}%")

# Step 10: Save the model (written once)
joblib.dump(model, 'autism_model.pkl')


print("✅ Model trained and saved as 'autism_model.pkl'. You can now download it from the right side panel.")
