In [None]:
# Mount Google Drive at /content/drive; the dataset is later read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from tabulate import tabulate
from google.colab import files
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn import metrics
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression,LinearRegression
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_score,recall_score,f1_score
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Location of the raw dataset on the mounted Google Drive (single place to edit).
DATA_PATH = '/content/drive/MyDrive/Autism_Prediction/dataset2.csv'
df = pd.read_csv(DATA_PATH)

# Preview the first rows; as the cell's last expression it is shown with the
# notebook's rich table rendering instead of plain printed text.
df.head()

In [None]:
# (rows, columns) of the loaded dataset
df.shape

In [None]:
# Per-column dtype and non-null count overview
df.info()

In [None]:
# Summary statistics, transposed so that each dataset column becomes one row
df.describe().T

In [None]:
# Share of each target class in the full dataset.
# Wedges are labelled with the class value so the chart can be read on its own.
class_counts = df['Class'].value_counts()
plt.pie(class_counts.values, labels=class_counts.index, autopct='%1.1f%%')
plt.title('Class distribution')
plt.show()

In [None]:
# Categorizing the columns in the dataset based on their datatype.
# pandas' dtype predicates are used instead of `dtype == int`, because that
# comparison only matches the platform's default integer width and would
# misfile integer columns of any other width.
ints = []
objects = []
floats = []

for col in df.columns:
    if pd.api.types.is_integer_dtype(df[col]):
        ints.append(col)
    elif pd.api.types.is_object_dtype(df[col]):
        objects.append(col)
    else:
        # Catch-all: everything that is neither integer nor object ends up here
        floats.append(col)

In [None]:
# Keep the target out of the list of columns to plot against it.
# Filtering (instead of list.remove) makes the cell safe to re-run and does not
# raise ValueError when 'Class' is not in the list.
ints = [col for col in ints if col != 'Class']

In [None]:
# Visualizing how each categorical variable in `ints` relates to the target
cols = max(1, min(len(ints), 3))  # at most 3 panels per row
rows = max(1, int(np.ceil(len(ints) / cols)))  # exactly enough rows, no empty trailing row

# plt.figure (not plt.subplots): plt.subplots would also create one full-size
# Axes that the panels below are then drawn on top of.
plt.figure(figsize=(15, 15))
for i, col in enumerate(ints):
    plt.subplot(rows, cols, i+1)
    sb.countplot(data=df, x=col, hue='Class')
plt.tight_layout()
plt.show()

In [None]:


# Get the numerical columns and check for any skewness in the data.
# include='number' keeps every numeric dtype, not only int64.
numerical_cols = df.select_dtypes(include='number').columns

# plt.figure (not plt.subplots): plt.subplots would also create one full-size
# Axes that the panels below are then drawn on top of.
plt.figure(figsize=(15, 5))

for i, col in enumerate(numerical_cols):
    plt.subplot(1, len(numerical_cols), i+1)
    # histplot with a KDE overlay on a density scale replaces the deprecated distplot
    sb.histplot(df[col], kde=True, stat='density')
plt.tight_layout()
plt.show()

In [None]:
# Replace Age by its natural logarithm, computed on the whole column at once.
# NOTE(review): this overwrites the column, so running the cell twice applies
# the log twice; an Age of 0 would turn into -inf -- confirm the data has none.
df['Age'] = np.log(df['Age'])

In [None]:

# Get the numerical columns. include='number' keeps every numeric dtype, so the
# Age column (float after the log transform) is still part of the outlier check.
numerical_cols = df.select_dtypes(include='number').columns

# plt.figure (not plt.subplots): plt.subplots would also create one full-size
# Axes that the panels below are then drawn on top of.
plt.figure(figsize=(15, 5))

for i, col in enumerate(numerical_cols):
    plt.subplot(1, len(numerical_cols), i+1)
    # Pass the values by keyword so the orientation does not depend on how the
    # installed seaborn version interprets a positional Series.
    sb.boxplot(y=df[col])
plt.tight_layout()
plt.show()

In [None]:
#Checking for highly correlated features to remove as they don't help in detecting useful patterns in the data:
def encode_labels(data):
    """Return a copy of ``data`` with every object-dtype column label-encoded.

    Each object column is replaced by the integer codes produced by a fresh
    ``LabelEncoder`` fitted on that column; all other columns are left as-is.
    The input frame itself is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose object-dtype columns should be encoded.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns, object columns replaced by codes.
    """
    # Work on a copy so the caller's frame is never changed behind its back
    encoded = data.copy()
    for col in encoded.columns:
        # Only object-dtype columns are encoded
        if encoded[col].dtype == 'object':
            # The fitted encoder is not kept, so the codes cannot be mapped
            # back to the original labels later.
            encoded[col] = LabelEncoder().fit_transform(encoded[col])

    return encoded

df = encode_labels(df)

# Making a heatmap to flag strongly correlated column pairs.
# The absolute value is thresholded so that strong negative correlations
# (below -0.8) are flagged as well as strong positive ones.
plt.figure(figsize=(10,10))
sb.heatmap(df.corr().abs() > 0.8, annot=True, cbar=False)
plt.show()

In [None]:
import numpy as np
from scipy.stats import pearsonr

# Pearson correlation (and its p-value) between the 'Screening Score' column
# and the 'Class' target column of the DataFrame.
correlation_coefficient, p_value = pearsonr(df['Screening Score'], df['Class'])

for caption, value in (
    ("Pearson correlation coefficient:", correlation_coefficient),
    ("P-value:", p_value),
):
    print(caption, value)
In [None]:
# Columns left out of the model inputs; the target is split off separately.
removal = [
    'Screening Score',
    'Who is completing the test',
    'Region',
    'Family member with ASD history',
]
target = df['Class']
features = df.drop(columns=removal + ['Class'])

In [None]:
# Hold out 30% of the rows for testing. The split is stratified on the target so
# that both parts keep the same class ratio, which matters for imbalanced data.
X_train, X_test, Y_train, Y_test = train_test_split(
    features, target, test_size=0.3, random_state=10, stratify=target)

# The classes are imbalanced, so the minority class is oversampled with SMOTE,
# which creates synthetic minority samples rather than repeating existing rows.
# Only the training split is resampled; the test split is left as it is.
# NOTE(review): categorical columns were label-encoded earlier, so synthetic rows
# may carry in-between category codes -- consider SMOTENC; confirm.
smote = SMOTE(random_state=42)
X_train_resampled, Y_train_resampled = smote.fit_resample(X_train, Y_train)


In [None]:
# Standardise the features for stable and fast training.
# The scaler learns its statistics from the (resampled) training data only and
# the same fitted scaler is then applied to the test data.
scaler = StandardScaler()
X_train_resampled = scaler.fit_transform(X_train_resampled)
X_test = scaler.transform(X_test)

In [None]:
# Tree-based estimators shared by the training cells below.
# Every estimator gets a fixed random_state so repeated runs give the same scores.
dt=DecisionTreeClassifier(max_depth=3,min_samples_leaf=10,random_state=1, criterion='entropy')
rf=RandomForestClassifier(n_estimators=100,max_depth=10,max_features='sqrt',random_state=42)
gb=GradientBoostingClassifier(n_estimators=100,learning_rate=0.1,max_depth=3,random_state=42)

In [None]:
# Baseline models, each trained on the resampled and scaled training data
models = {"SVM ":SVC(kernel='rbf'),"Gradient Boosting" :gb,"KNN": KNeighborsClassifier(),"Random Forest": rf, "Decision Tree": dt, "Naive Bayes": GaussianNB()}

# One row per model: [name, accuracy, precision, recall, f1]
results_table = []
for model_name, model in models.items():
    model.fit(X_train_resampled, Y_train_resampled)  # Training the model

    # Making predictions on the test split
    Y_test_pred = model.predict(X_test)

    # Testing set performance. Precision, recall and F1 all use the default
    # (binary) averaging so the three numbers describe the same thing.
    model_test_accuracy = accuracy_score(Y_test, Y_test_pred)
    model_test_f1 = f1_score(Y_test, Y_test_pred)
    model_test_precision = precision_score(Y_test, Y_test_pred)
    model_test_recall = recall_score(Y_test, Y_test_pred)

    print(model_name)

    # Testing set results
    print('Model performance for testing set')
    print('-Accuracy: {:.4f}'.format(model_test_accuracy))
    print('-Precision: {:.4f}'.format(model_test_precision))
    print('-recall: {:.4f}'.format(model_test_recall))
    print('-f1_score: {:.4f}'.format(model_test_f1))
    print("$$$$$$$$$$$$$$$$$$$")
    results_table.append([model_name, model_test_accuracy, model_test_precision,
                          model_test_recall, model_test_f1])

    print('\n ')


In [None]:

# Split the rows of results_table into the model names and one series per metric
model_names = [row[0] for row in results_table]
metric_series = {
    'Accuracy': [row[1] for row in results_table],
    'Precision': [row[2] for row in results_table],
    'Recall': [row[3] for row in results_table],
    'F1 Score': [row[4] for row in results_table],
}

# Grouped bar chart: one group per model, one bar per metric
fig, ax = plt.subplots(figsize=(10, 6))

bar_width = 0.2
index = np.arange(len(model_names))

for slot, (label, values) in enumerate(metric_series.items()):
    # Each metric is shifted right by one bar width within its group
    ax.bar(index + slot * bar_width, values, bar_width, label=label)

# Axis labels, title and one tick (centred under the four bars) per model
ax.set_xlabel('Models')
ax.set_ylabel('Scores')
ax.set_title('Performance Comparison of Models')
ax.set_xticks(index + 1.5 * bar_width)
ax.set_xticklabels(model_names)
ax.legend()

# Save the figure, trigger a Colab download of the file, then display it
plt.savefig('performance_chart2.png')
files.download('performance_chart2.png')
plt.show()








In [None]:
# Define the parameter grid for each model
param_grid_dt = {'max_depth': [None, 10, 20, 30, 40, 50]}
param_grid_rf = {'n_estimators': [50, 100, 200], 'max_depth': [10, 20, 30]}
param_grid_svm = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma': ['scale', 'auto']}
param_grid_nb = {}  # No hyperparameters for Naive Bayes
param_grid_knn = {'n_neighbors': [3, 5, 7, 9]}
param_grid_gb={'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5]}

# Create a dictionary of models and their respective parameter grids
models_grid = {
    "SVM": (SVC(kernel='rbf'), param_grid_svm),
    "Gradient Boosting": (gb,param_grid_gb),
    "KNN": (KNeighborsClassifier(), param_grid_knn),
    "Random Forest": (rf, param_grid_rf),
    "Decision Tree": (dt, param_grid_dt),
    "Naive Bayes": (GaussianNB(), param_grid_nb)
}

# Scorers used for the per-metric cross-validation below
metrics = [
    {'name': 'accuracy', 'metric': make_scorer(accuracy_score)},
    {'name': 'precision', 'metric': make_scorer(precision_score)},
    {'name': 'recall', 'metric': make_scorer(recall_score)},
    {'name': 'f1_score', 'metric': make_scorer(f1_score)}
]

# Lists that collect the results so they can be represented in a chart:
# cv_* hold the mean cross-validation score per model, and `results` holds one
# test-set row per model in the same layout as results_table:
# [name, accuracy, precision, recall, f1].
cv_model_names = []
cv_accuracy_scores = []
cv_precision_scores = []
cv_recall_scores = []
cv_f1_scores = []
cv_scores_by_metric = {
    'accuracy': cv_accuracy_scores,
    'precision': cv_precision_scores,
    'recall': cv_recall_scores,
    'f1_score': cv_f1_scores,
}
results=[]

# Perform Grid Search with Cross-Validation
# NOTE(review): the folds are cut from data that was already oversampled and
# scaled as a whole, so the cross-validation scores are likely optimistic --
# consider resampling/scaling inside each fold; confirm.
for model_name, (model, param_grid) in models_grid.items():
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train_resampled, Y_train_resampled)

    # Get the best parameters and the best model
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_

    print(f"Best parameters for {model_name}: {best_params}")

    # Evaluate the best model on the test set
    Y_test_pred = best_model.predict(X_test)
    test_accuracy = accuracy_score(Y_test, Y_test_pred)
    test_precision = precision_score(Y_test, Y_test_pred)
    test_f1 = f1_score(Y_test, Y_test_pred)
    test_recall = recall_score(Y_test, Y_test_pred)

    print("Performance on the test set:")
    print('- Accuracy: {:.4f}'.format(test_accuracy))
    print('- Precision: {:.4f}'.format(test_precision))
    print('- Recall: {:.4f}'.format(test_recall))
    print('- F1 Score: {:.4f}'.format(test_f1))
    print("-------------------------------")
    print('\n')

    # Keep the tuned model's test-set scores for charting
    results.append([model_name, test_accuracy, test_precision, test_recall, test_f1])
    cv_model_names.append(model_name)

    for metric in metrics:
        # Cross-validate the tuned estimator (not the untuned base model), so the
        # reported averages belong to the hyperparameters selected above.
        scores = cross_val_score(best_model, X_train_resampled, Y_train_resampled, cv=10, scoring=metric['metric'])
        cv_scores_by_metric[metric['name']].append(scores.mean())
        print('Cross-validation ' + metric['name'] + ' average scores for ' + best_model.__class__.__name__ + ':', scores.mean())
        print('----------------------------------------')

    print('\n ')

In [None]:
# Extracting data for plotting.
# This chart is meant to show the grid-search models, so it reads `results`
# (the per-model list set up by the grid-search cell; rows are expected in the
# same [name, accuracy, precision, recall, f1] layout as results_table).
# If that list holds no rows, the baseline results_table is plotted instead so
# the chart is never empty.
chart_rows = results if results else results_table
model_names = [row[0] for row in chart_rows]
accuracy_scores = [row[1] for row in chart_rows]
precision_scores = [row[2] for row in chart_rows]
recall_scores = [row[3] for row in chart_rows]
f1_scores = [row[4] for row in chart_rows]

# Creating a bar chart: one group per model, one bar per metric
fig, ax = plt.subplots(figsize=(10, 6))

bar_width = 0.2
index = np.arange(len(model_names))

bar1 = ax.bar(index, accuracy_scores, bar_width, label='Accuracy')
bar2 = ax.bar(index + bar_width, precision_scores, bar_width, label='Precision')
bar3 = ax.bar(index + 2 * bar_width, recall_scores, bar_width, label='Recall')
bar4 = ax.bar(index + 3 * bar_width, f1_scores, bar_width, label='F1 Score')

# Adding labels; ticks are centred under each group of four bars
ax.set_xlabel('Models')
ax.set_ylabel('Scores')
ax.set_title('Performance Comparison of Models')
ax.set_xticks(index + 1.5 * bar_width)
ax.set_xticklabels(model_names)
ax.legend()

# Save the figure, trigger a Colab download of the file, then display it
plt.savefig('performance_chart_grid_search2.png')
files.download('performance_chart_grid_search2.png')
plt.show()


In [None]:
# Class balance of the training labels before and after SMOTE, one pie each
for label_values, chart_title in (
    (Y_train, 'Class Distribution Before SMOTE'),
    (pd.Series(Y_train_resampled), 'Class Distribution After SMOTE'),
):
    class_sizes = label_values.value_counts()
    plt.figure(figsize=(8, 8))
    plt.pie(class_sizes, labels=class_sizes.index, autopct='%1.1f%%', startangle=90)
    plt.title(chart_title)
    plt.show()