In [15]:
import pandas as pd 
import numpy as np
import xgboost as xgb

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE


In [16]:
# Importing the dataset 
data = pd.read_csv("C:/Users/Admin/Desktop/Neruodivergent project/Final_data.csv")

In [17]:
data

Unnamed: 0,How old are you?,How likely are you to use the presented functionality?,What changes do you think that the functionality would bring in your video calling experience? (Select many if apply),Do you think that the functionality would be socially accepted if used by neurodiverse individuals?,Do you think that the functionality would be socially acceptable if used by people who aren't neurodiverse?,What assistive technologies do you use?,How knowledgeable are you about artificial intelligence?,How knowledgeable are you about deepfakes?,How knowledgeable are you with generative artificial intelligence?,What kind of neurodevelopmental disorder/s do you have?
0,21,No,My desire for looking at the camera would decr...,"Yes, if video call attendees were aware of fun...","Yes, if video call attendees were aware of fun...","Planning tools (such as written planners, colo...",8,6,10,ND
1,28,No,My desire for looking at the camera would decr...,"Yes, anytime","Yes, anytime",Sensory regulation tools (such as stress balls...,8,7,9,ND
2,Unknown,No,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,ND
3,Unknown,Yes,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,ND
4,29,Yes,Unknown,"Yes, if video call attendees were aware of fun...","Yes, if video call attendees were aware of fun...",Speech to text applications (such as Google do...,5,5,5,ND
...,...,...,...,...,...,...,...,...,...,...
188,39,No,My desire for looking at the camera would decr...,"Yes, anytime","Yes, anytime",Don't use any,6,6,7,ND
189,22,No,I would be staring at myself in the video call...,"Yes, anytime","Yes, anytime",Unknown,9,9,9,NT
190,Unknown,No,My desire for looking at the camera would decr...,"Yes, anytime","Yes, if video call attendees were aware of fun...",Unknown,7,8,6,NT
191,Unknown,No,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,ND


In [18]:
# Encoding categorical values to numercial values
label_encoder = LabelEncoder()

In [19]:
# Using the questions as inputs
questions = ['How old are you?',
       'How likely are you to use the presented functionality?',
       'What changes do you think that the functionality would bring in your video calling experience? (Select many if apply)',
       'Do you think that the functionality would be socially accepted if used by neurodiverse individuals?',
       "Do you think that the functionality would be socially acceptable if used by people who aren't neurodiverse?",
       'What assistive technologies do you use? ',
       'How knowledgeable are you about artificial intelligence?',
       'How knowledgeable are you about deepfakes?',
       'How knowledgeable are you with generative artificial intelligence?',
       'What kind of neurodevelopmental disorder/s do you have? ']

In [20]:
for col in questions:
    data[col] = label_encoder.fit_transform(data[col])

In [21]:
# splitting the neurodivergent data into features (X) and target (y) sets
X = data.drop('How likely are you to use the presented functionality?', axis = 1)
y = data['How likely are you to use the presented functionality?']

In [22]:
# splitting the data into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [30]:
# Create an XGBoost classifier
clf = xgb.XGBClassifier(
    objective='binary:logistic',  # Binary classification task
    eval_metric='logloss',       # Evaluation metric (logarithmic loss)
    max_depth=3,                 # Maximum depth of trees (adjust as needed)
    n_estimators=100,            # Number of boosting rounds
    learning_rate=0.1            # Learning rate (adjust as needed)
)

In [31]:
# Define a grid of hyperparameters for GridSearchCV
param_grid = {
    'max_depth': [3, 4, 5],        # Maximum depth of trees
    'n_estimators': [100, 200, 300],  # Number of boosting rounds
    'learning_rate': [0.1, 0.2, 0.3]  # Learning rate
}


In [32]:
# Perform hyperparameter tuning using GridSearchCV
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=3, verbose=2)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=100; total time=   0.0s
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=100; total time=   0.0s
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=100; total time=   0.0s
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=200; total time=   0.0s
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=200; total time=   0.0s
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=200; total time=   0.0s
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=300; total time=   0.0s
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=300; total time=   0.0s
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=300; total time=   0.1s
[CV] END ...learning_rate=0.1, max_depth=4, n_estimators=100; total time=   0.0s
[CV] END ...learning_rate=0.1, max_depth=4, n_estimators=100; total time=   0.0s
[CV] END ...learning_rate=0.1, max_depth=4, n_es

GridSearchCV(cv=3,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, device=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False,
                                     eval_metric='logloss', feature_types=None,
                                     gamma=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=...
                                     max_cat_threshold=None,
                                     max_cat_to_onehot=None,
                                     max_delta_step=None, max_depth=3,
                                     max_leaves=None, 

In [26]:
# Get the best model from hyperparameter tuning
best_clf = grid_search.best_estimator_

In [27]:
# Make predictions on the test data
y_pred = best_clf.predict(X_test)

# Calculate accuracy on the test data
accuracy = accuracy_score(y_test, y_pred)

# Print the best hyperparameters and accuracy
print(f'Best Hyperparameters: {grid_search.best_params_}')
print(f'Accuracy: {accuracy * 100:.2f}%')

Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
Accuracy: 94.87%


In [33]:
# Get feature importances
feature_importances = best_clf.feature_importances_


In [34]:
# Create a DataFrame to view feature importances
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)


In [36]:
# Print the most predictive features
print(feature_importance_df.head())

                                             Feature  Importance
5  How knowledgeable are you about artificial int...    0.663998
1  What changes do you think that the functionali...    0.241865
3  Do you think that the functionality would be s...    0.054888
2  Do you think that the functionality would be s...    0.038687
0                                   How old are you?    0.000220


In [35]:
# Print the most predictive feature
most_predictive_feature = feature_importance_df.iloc[0]
print("Most predictive feature:", most_predictive_feature['Feature'], "with importance:", most_predictive_feature['Importance'])

Most predictive feature: How knowledgeable are you about artificial intelligence? with importance: 0.6639984
