In [2]:
!pip install -q nltk spacy scikit-learn pandas numpy matplotlib

In [3]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m75.9 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [4]:
import pandas as pd
import numpy as np
import re
import nltk
import spacy

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
aq_text_map = {
    "A1_Score": "my child avoids eye contact",
    "A2_Score": "my child shows repetitive behavior",
    "A3_Score": "my child has difficulty in social interaction",
    "A4_Score": "my child struggles to understand social cues",
    "A5_Score": "my child prefers to be alone",
    "A6_Score": "my child has delayed communication skills",
    "A7_Score": "my child repeats words or phrases",
    "A8_Score": "my child finds it hard to adapt to changes",
    "A9_Score": "my child has limited imaginative play",
    "A10_Score": "my child does not respond when called"
}

In [6]:
!ls

sample_data


In [8]:
from scipy.io import arff

# Load the .arff file
data_arff, meta = arff.loadarff('Autism-Child-Data.arff')

# Convert to pandas DataFrame
df = pd.DataFrame(data_arff)

# Decode byte string columns to UTF-8 strings
for column in df.columns:
    if df[column].dtype == 'object':
        df[column] = df[column].apply(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)

print("DataFrame loaded and byte strings decoded successfully. First 5 rows:")
print(df.head())

DataFrame loaded and byte strings decoded successfully. First 5 rows:
  A1_Score A2_Score A3_Score A4_Score A5_Score A6_Score A7_Score A8_Score  \
0        1        1        0        0        1        1        0        1   
1        1        1        0        0        1        1        0        1   
2        1        1        0        0        0        1        1        1   
3        0        1        0        0        1        1        0        0   
4        1        1        1        1        1        1        1        1   

  A9_Score A10_Score  ...  gender        ethnicity jundice austim  \
0        0         0  ...       m           Others      no     no   
1        0         0  ...       m  Middle Eastern       no     no   
2        0         0  ...       m                ?      no     no   
3        0         1  ...       f                ?     yes     no   
4        1         1  ...       m           Others     yes     no   

   contry_of_res used_app_before result    age_desc 

In [9]:
aq_score_cols = [f'A{i}_Score' for i in range(1, 11)]
df[aq_score_cols] = df[aq_score_cols].astype(int)

print("Data types of A_Score columns after conversion:")
print(df[aq_score_cols].dtypes)

Data types of A_Score columns after conversion:
A1_Score     int64
A2_Score     int64
A3_Score     int64
A4_Score     int64
A5_Score     int64
A6_Score     int64
A7_Score     int64
A8_Score     int64
A9_Score     int64
A10_Score    int64
dtype: object


In [10]:
parent_dialogues = []

for index, row in df.iterrows():
    current_dialogue_parts = []
    for col_name in aq_score_cols:
        if row[col_name] == 1:
            current_dialogue_parts.append(aq_text_map[col_name])
    parent_dialogues.append(". ".join(current_dialogue_parts))

df['parent_dialogue'] = parent_dialogues

print("DataFrame with 'parent_dialogue' column added. First 5 rows:")
print(df[['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'parent_dialogue']].head())

DataFrame with 'parent_dialogue' column added. First 5 rows:
   A1_Score  A2_Score  A3_Score  A4_Score  A5_Score  \
0         1         1         0         0         1   
1         1         1         0         0         1   
2         1         1         0         0         0   
3         0         1         0         0         1   
4         1         1         1         1         1   

                                     parent_dialogue  
0  my child avoids eye contact. my child shows re...  
1  my child avoids eye contact. my child shows re...  
2  my child avoids eye contact. my child shows re...  
3  my child shows repetitive behavior. my child p...  
4  my child avoids eye contact. my child shows re...  


In [11]:
df_selected = df[['parent_dialogue', 'Class/ASD']].copy()

print("DataFrame with only 'parent_dialogue' and 'Class/ASD' columns. First 5 rows:")
print(df_selected.head())

DataFrame with only 'parent_dialogue' and 'Class/ASD' columns. First 5 rows:
                                     parent_dialogue Class/ASD
0  my child avoids eye contact. my child shows re...        NO
1  my child avoids eye contact. my child shows re...        NO
2  my child avoids eye contact. my child shows re...        NO
3  my child shows repetitive behavior. my child p...        NO
4  my child avoids eye contact. my child shows re...       YES


In [12]:
df_selected['Class/ASD'] = df_selected['Class/ASD'].map({'YES': 1, 'NO': 0})

print("DataFrame with 'Class/ASD' converted to numerical (1 for YES, 0 for NO). First 5 rows:")
print(df_selected.head())

DataFrame with 'Class/ASD' converted to numerical (1 for YES, 0 for NO). First 5 rows:
                                     parent_dialogue  Class/ASD
0  my child avoids eye contact. my child shows re...          0
1  my child avoids eye contact. my child shows re...          0
2  my child avoids eye contact. my child shows re...          0
3  my child shows repetitive behavior. my child p...          0
4  my child avoids eye contact. my child shows re...          1


In [13]:
df_ml = df.rename(columns={"Class/ASD": "label"})

In [14]:
df_ml = df_ml[df_ml["parent_dialogue"].str.strip() != ""]

In [15]:
df_ml.shape

(291, 22)

In [16]:
X_text = df_ml["parent_dialogue"]
y = df_ml["label"]

In [17]:
y.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
NO,150
YES,141


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    max_features=3000,
    ngram_range=(1, 2)
)

X = vectorizer.fit_transform(X_text)

In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [21]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

svm = LinearSVC()
svm.fit(X_train, y_train)

y_pred = svm.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred)*100)
print(classification_report(y_test, y_pred))

Accuracy: 89.83050847457628
              precision    recall  f1-score   support

          NO       0.93      0.87      0.90        30
         YES       0.87      0.93      0.90        29

    accuracy                           0.90        59
   macro avg       0.90      0.90      0.90        59
weighted avg       0.90      0.90      0.90        59



In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

pred_lr = lr.predict(X_test)

print("Logistic Regression Accuracy:", accuracy_score(y_test, pred_lr)*100)
print(classification_report(y_test, pred_lr))

Logistic Regression Accuracy: 84.7457627118644
              precision    recall  f1-score   support

          NO       0.89      0.80      0.84        30
         YES       0.81      0.90      0.85        29

    accuracy                           0.85        59
   macro avg       0.85      0.85      0.85        59
weighted avg       0.85      0.85      0.85        59



In [24]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5, metric="cosine")
knn.fit(X_train, y_train)

pred_knn = knn.predict(X_test)

print("KNN Accuracy:", accuracy_score(y_test, pred_knn)*100)
print(classification_report(y_test, pred_knn))

KNN Accuracy: 83.05084745762711
              precision    recall  f1-score   support

          NO       1.00      0.67      0.80        30
         YES       0.74      1.00      0.85        29

    accuracy                           0.83        59
   macro avg       0.87      0.83      0.83        59
weighted avg       0.87      0.83      0.83        59



In [25]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    class_weight="balanced"
)

rf.fit(X_train, y_train)

pred_rf = rf.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, pred_rf)*100)
print(classification_report(y_test, pred_rf))

Random Forest Accuracy: 100.0
              precision    recall  f1-score   support

          NO       1.00      1.00      1.00        30
         YES       1.00      1.00      1.00        29

    accuracy                           1.00        59
   macro avg       1.00      1.00      1.00        59
weighted avg       1.00      1.00      1.00        59



In [27]:
from scipy.io import arff

# Load the .arff file
data_adolescent_arff, meta_adolescent = arff.loadarff('Autism-Adolescent-Data.arff')

# Convert to pandas DataFrame
df_adolescent = pd.DataFrame(data_adolescent_arff)

# Decode byte string columns to UTF-8 strings
for column in df_adolescent.columns:
    if df_adolescent[column].dtype == 'object':
        df_adolescent[column] = df_adolescent[column].apply(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)

print("DataFrame loaded and byte strings decoded successfully. First 5 rows:")
print(df_adolescent.head())

DataFrame loaded and byte strings decoded successfully. First 5 rows:
  A1_Score A2_Score A3_Score A4_Score A5_Score A6_Score A7_Score A8_Score  \
0        0        0        0        1        1        1        1        1   
1        0        0        0        0        0        0        0        0   
2        0        0        0        0        0        0        0        0   
3        0        1        1        1        1        1        0        1   
4        1        1        1        1        1        1        1        0   

  A9_Score A10_Score  ...  gender       ethnicity jundice austim  \
0        1         0  ...       m        Hispanic     yes    yes   
1        1         1  ...       m           Black      no     no   
2        1         1  ...       f               ?      no     no   
3        1         0  ...       f  White-European      no     no   
4        0         0  ...       f               ?      no     no   

    contry_of_res used_app_before result     age_desc  rel

In [28]:
aq_score_cols = [f'A{i}_Score' for i in range(1, 11)]
df_adolescent[aq_score_cols] = df_adolescent[aq_score_cols].astype(int)

adolescent_dialogues = []

for index, row in df_adolescent.iterrows():
    current_dialogue_parts = []
    for col_name in aq_score_cols:
        if row[col_name] == 1:
            current_dialogue_parts.append(aq_text_map[col_name])
    adolescent_dialogues.append(". ".join(current_dialogue_parts))

df_adolescent['parent_dialogue'] = adolescent_dialogues

df_adolescent = df_adolescent.rename(columns={"Class/ASD": "label"})
df_adolescent['label'] = df_adolescent['label'].map({'YES': 1, 'NO': 0})

df_adolescent = df_adolescent[df_adolescent["parent_dialogue"].str.strip() != ""]

print("Processed df_adolescent DataFrame. First 5 rows:")
print(df_adolescent[['parent_dialogue', 'label']].head())

Processed df_adolescent DataFrame. First 5 rows:
                                     parent_dialogue  label
0  my child struggles to understand social cues. ...      0
1  my child has limited imaginative play. my chil...      0
2  my child has limited imaginative play. my chil...      0
3  my child shows repetitive behavior. my child h...      1
4  my child avoids eye contact. my child shows re...      1


## Combine Datasets

### Subtask:
Concatenate the processed child DataFrame (`df_ml`) and the newly processed adolescent DataFrame into a single combined DataFrame.


**Reasoning**:
To combine the processed child and adolescent dataframes, I will use `pd.concat` to vertically stack `df_ml` and `df_adolescent` and then reset the index of the resulting combined dataframe.



In [29]:
df_combined = pd.concat([df_ml, df_adolescent], ignore_index=True)

print("Combined DataFrame created. First 5 rows:")
print(df_combined.head())
print("\nShape of combined DataFrame:", df_combined.shape)

Combined DataFrame created. First 5 rows:
   A1_Score  A2_Score  A3_Score  A4_Score  A5_Score  A6_Score  A7_Score  \
0         1         1         0         0         1         1         0   
1         1         1         0         0         1         1         0   
2         1         1         0         0         0         1         1   
3         0         1         0         0         1         1         0   
4         1         1         1         1         1         1         1   

   A8_Score  A9_Score  A10_Score  ...        ethnicity jundice austim  \
0         1         0          0  ...           Others      no     no   
1         1         0          0  ...  Middle Eastern       no     no   
2         1         0          0  ...                ?      no     no   
3         0         0          1  ...                ?     yes     no   
4         1         1          1  ...           Others     yes     no   

   contry_of_res used_app_before result    age_desc  relation label 

## Prepare Combined Data for ML

### Subtask:
Extract the 'parent_dialogue' as X_text and 'label' as y from the combined DataFrame, then perform TF-IDF vectorization on X_text, and finally split the data into training and testing sets (X_train, X_test, y_train, y_test).


**Reasoning**:
I will extract the 'parent_dialogue' and 'label' columns, perform TF-IDF vectorization on the 'parent_dialogue', and then split the data into training and testing sets as per the instructions.



In [33]:
X_text = df_combined["parent_dialogue"]
y = df_combined["label"].map({'YES': 1, 'NO': 0, 1: 1, 0: 0}).astype(int)

vectorizer = TfidfVectorizer(
    max_features=3000,
    ngram_range=(1, 2)
)
X = vectorizer.fit_transform(X_text)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Data preparation complete.")
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

Data preparation complete.
X_train shape: (316, 92), y_train shape: (316,)
X_test shape: (79, 92), y_test shape: (79,)


In [34]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# LinearSVC Model
print("Training LinearSVC Model...")
svm = LinearSVC(random_state=42)
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
print("LinearSVC Accuracy:", accuracy_score(y_test, y_pred_svm)*100)
print(classification_report(y_test, y_pred_svm))

# Logistic Regression Model
print("\nTraining Logistic Regression Model...")
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train, y_train)
pred_lr = lr.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, pred_lr)*100)
print(classification_report(y_test, pred_lr))

# KNeighborsClassifier Model
print("\nTraining KNeighborsClassifier Model...")
knn = KNeighborsClassifier(n_neighbors=5, metric="cosine")
knn.fit(X_train, y_train)
pred_knn = knn.predict(X_test)
print("KNN Accuracy:", accuracy_score(y_test, pred_knn)*100)
print(classification_report(y_test, pred_knn))

# RandomForestClassifier Model
print("\nTraining RandomForestClassifier Model...")
rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    class_weight="balanced"
)
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, pred_rf)*100)
print(classification_report(y_test, pred_rf))

Training LinearSVC Model...
LinearSVC Accuracy: 91.13924050632912
              precision    recall  f1-score   support

           0       1.00      0.82      0.90        38
           1       0.85      1.00      0.92        41

    accuracy                           0.91        79
   macro avg       0.93      0.91      0.91        79
weighted avg       0.92      0.91      0.91        79


Training Logistic Regression Model...
Logistic Regression Accuracy: 84.81012658227847
              precision    recall  f1-score   support

           0       0.93      0.74      0.82        38
           1       0.80      0.95      0.87        41

    accuracy                           0.85        79
   macro avg       0.86      0.84      0.85        79
weighted avg       0.86      0.85      0.85        79


Training KNeighborsClassifier Model...
KNN Accuracy: 81.0126582278481
              precision    recall  f1-score   support

           0       0.93      0.66      0.77        38
           1 

In [35]:
from xgboost import XGBClassifier

print("\nTraining XGBoost Classifier Model...")
xgb = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42
)
xgb.fit(X_train, y_train)

pred_xgb = xgb.predict(X_test)

print("XGBoost Accuracy:", accuracy_score(y_test, pred_xgb)*100)
print(classification_report(y_test, pred_xgb))


Training XGBoost Classifier Model...
XGBoost Accuracy: 100.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        38
           1       1.00      1.00      1.00        41

    accuracy                           1.00        79
   macro avg       1.00      1.00      1.00        79
weighted avg       1.00      1.00      1.00        79



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
