In [1]:
!pip install ucimlrepo
!pip install scikit-learn



In [2]:
import pandas as pd
import sklearn
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 
from ucimlrepo import fetch_ucirepo 
from sklearn.model_selection import train_test_split

In [4]:
autistic_spectrum_disorder = fetch_ucirepo(id=419) 
x = autistic_spectrum_disorder.data.features 
y = autistic_spectrum_disorder.data.targets
autistic_df = pd.concat([x, y], axis=1)
autistic_df.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,gender,ethnicity,jaundice,autism,country_of_res,used_app_before,result,age_desc,relation,class
0,1,1,0,0,1,1,0,1,0,0,...,m,Others,no,no,Jordan,no,5,'4-11 years',Parent,NO
1,1,1,0,0,1,1,0,1,0,0,...,m,'Middle Eastern ',no,no,Jordan,no,5,'4-11 years',Parent,NO
2,1,1,0,0,0,1,1,1,0,0,...,m,,no,no,Jordan,yes,5,'4-11 years',,NO
3,0,1,0,0,1,1,0,0,0,1,...,f,,yes,no,Jordan,no,4,'4-11 years',,NO
4,1,1,1,1,1,1,1,1,1,1,...,m,Others,yes,no,'United States',no,10,'4-11 years',Parent,YES


In [5]:
autistic_df.columns

Index(['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score',
       'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'age', 'gender',
       'ethnicity', 'jaundice', 'autism', 'country_of_res', 'used_app_before',
       'result', 'age_desc', 'relation', 'class'],
      dtype='object')

In [6]:
autistic_df['autism'].unique()

array(['no', 'yes'], dtype=object)

In [7]:
#checking for any null values
autistic_df.isna().sum()

A1_Score            0
A2_Score            0
A3_Score            0
A4_Score            0
A5_Score            0
A6_Score            0
A7_Score            0
A8_Score            0
A9_Score            0
A10_Score           0
age                 4
gender              0
ethnicity          43
jaundice            0
autism              0
country_of_res      0
used_app_before     0
result              0
age_desc            0
relation           43
class               0
dtype: int64

In [9]:
#since it is hard to predict ethnicity for any model, lets drop these values
autistic_df = autistic_df.dropna(subset=['ethnicity'])
autistic_df.isna().sum()

A1_Score           0
A2_Score           0
A3_Score           0
A4_Score           0
A5_Score           0
A6_Score           0
A7_Score           0
A8_Score           0
A9_Score           0
A10_Score          0
age                1
gender             0
ethnicity          0
jaundice           0
autism             0
country_of_res     0
used_app_before    0
result             0
age_desc           0
relation           0
class              0
dtype: int64

In [11]:
#considering there is only 1 nan value left, let's drop it because 1 entry will not significantly impact a models accuracy and effectiveness
autistic_df = autistic_df.dropna(subset=['age'])
autistic_df.isna().sum()

A1_Score           0
A2_Score           0
A3_Score           0
A4_Score           0
A5_Score           0
A6_Score           0
A7_Score           0
A8_Score           0
A9_Score           0
A10_Score          0
age                0
gender             0
ethnicity          0
jaundice           0
autism             0
country_of_res     0
used_app_before    0
result             0
age_desc           0
relation           0
class              0
dtype: int64

In [12]:
autistic_df.shape

(248, 21)

In [13]:
autistic_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 248 entries, 0 to 291
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   A1_Score         248 non-null    int64  
 1   A2_Score         248 non-null    int64  
 2   A3_Score         248 non-null    int64  
 3   A4_Score         248 non-null    int64  
 4   A5_Score         248 non-null    int64  
 5   A6_Score         248 non-null    int64  
 6   A7_Score         248 non-null    int64  
 7   A8_Score         248 non-null    int64  
 8   A9_Score         248 non-null    int64  
 9   A10_Score        248 non-null    int64  
 10  age              248 non-null    float64
 11  gender           248 non-null    object 
 12  ethnicity        248 non-null    object 
 13  jaundice         248 non-null    object 
 14  autism           248 non-null    object 
 15  country_of_res   248 non-null    object 
 16  used_app_before  248 non-null    object 
 17  result           248 

In [14]:
autistic_df['jaundice'].unique()

array(['no', 'yes'], dtype=object)

In [15]:
autistic_df['ethnicity'].unique()

array(['Others', "'Middle Eastern '", 'White-European', 'Black',
       "'South Asian'", 'Asian', 'Pasifika', 'Hispanic', 'Turkish',
       'Latino'], dtype=object)

In [16]:
my_object_df = autistic_df.select_dtypes(include='object')
my_numeric_df = autistic_df.select_dtypes(exclude='object')

In [17]:
my_object_df.shape

(248, 9)

In [18]:
autistic_df_objects_dummies = pd.get_dummies(my_object_df, drop_first=True).astype(int)
autistic_df_objects_dummies.head()

Unnamed: 0,gender_m,ethnicity_'South Asian',ethnicity_Asian,ethnicity_Black,ethnicity_Hispanic,ethnicity_Latino,ethnicity_Others,ethnicity_Pasifika,ethnicity_Turkish,ethnicity_White-European,...,country_of_res_Romania,country_of_res_Sweden,country_of_res_Syria,country_of_res_Turkey,used_app_before_yes,relation_Parent,relation_Relative,relation_Self,relation_self,class_YES
0,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
6,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,1
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1


In [19]:
autistic_df_objects_dummies.columns

Index(['gender_m', 'ethnicity_'South Asian'', 'ethnicity_Asian',
       'ethnicity_Black', 'ethnicity_Hispanic', 'ethnicity_Latino',
       'ethnicity_Others', 'ethnicity_Pasifika', 'ethnicity_Turkish',
       'ethnicity_White-European', 'jaundice_yes', 'autism_yes',
       'country_of_res_'Isle of Man'', 'country_of_res_'New Zealand'',
       'country_of_res_'Saudi Arabia'', 'country_of_res_'South Africa'',
       'country_of_res_'South Korea'',
       'country_of_res_'U.S. Outlying Islands'',
       'country_of_res_'United Arab Emirates'',
       'country_of_res_'United Kingdom'', 'country_of_res_'United States'',
       'country_of_res_Afghanistan', 'country_of_res_Argentina',
       'country_of_res_Armenia', 'country_of_res_Australia',
       'country_of_res_Austria', 'country_of_res_Bahrain',
       'country_of_res_Bangladesh', 'country_of_res_Bhutan',
       'country_of_res_Brazil', 'country_of_res_Bulgaria',
       'country_of_res_Canada', 'country_of_res_Egypt',
       'country

In [20]:
autistic_df = pd.concat([my_numeric_df,autistic_df_objects_dummies], axis=1)
autistic_df

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,country_of_res_Romania,country_of_res_Sweden,country_of_res_Syria,country_of_res_Turkey,used_app_before_yes,relation_Parent,relation_Relative,relation_Self,relation_self,class_YES
0,1,1,0,0,1,1,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,1,0,0,1,1,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,1,0,0,0,1
6,1,0,1,1,1,1,0,1,0,1,...,0,0,0,0,0,1,0,0,0,1
7,1,1,1,1,1,1,1,1,0,0,...,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
287,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,1,0,0,0,1
288,1,0,0,0,1,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
289,1,0,1,1,1,1,1,0,0,1,...,0,0,0,0,0,1,0,0,0,1
290,1,1,1,0,1,1,1,1,1,1,...,0,0,0,0,0,1,0,0,0,1


In [21]:
autistic_df.columns

Index(['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score',
       'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'age', 'result',
       'gender_m', 'ethnicity_'South Asian'', 'ethnicity_Asian',
       'ethnicity_Black', 'ethnicity_Hispanic', 'ethnicity_Latino',
       'ethnicity_Others', 'ethnicity_Pasifika', 'ethnicity_Turkish',
       'ethnicity_White-European', 'jaundice_yes', 'autism_yes',
       'country_of_res_'Isle of Man'', 'country_of_res_'New Zealand'',
       'country_of_res_'Saudi Arabia'', 'country_of_res_'South Africa'',
       'country_of_res_'South Korea'',
       'country_of_res_'U.S. Outlying Islands'',
       'country_of_res_'United Arab Emirates'',
       'country_of_res_'United Kingdom'', 'country_of_res_'United States'',
       'country_of_res_Afghanistan', 'country_of_res_Argentina',
       'country_of_res_Armenia', 'country_of_res_Australia',
       'country_of_res_Austria', 'country_of_res_Bahrain',
       'country_of_res_Bangladesh', 'co

In [22]:
autistic_df.shape

(248, 75)

In [None]:
X = autistic_df.drop('autism_yes',axis=1)
y = autistic_df['autism_yes']

# 70-15-15

## Logistic Regression

In [24]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_validation, X_holdout_test, y_validation, y_holdout_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [25]:
from sklearn.preprocessing import StandardScaler

In [26]:
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)
scaled_X_holdout_test = scaler.transform(X_holdout_test)
scaled_X_validation = scaler.fit_transform(X_validation)

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [28]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [29]:
lr_param_grid = {'C' : [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 10, 100],
                 'max_iter': [100,500,1000, 1500, 2500, 3000],
                 'solver': ['liblinear','saga']}

In [30]:
log_model = LogisticRegression(random_state=42)
lr_grid_model = GridSearchCV(log_model,param_grid=lr_param_grid)
lr_grid_model.fit(scaled_X_train,y_train)
lr_grid_model.best_params_



{'C': 0.001, 'max_iter': 100, 'solver': 'saga'}

In [31]:
best_lr_model = LogisticRegression(C=0.001, max_iter=100,solver='saga',random_state=42)

best_lr_model.fit(scaled_X_train,y_train)
lr_validation_predictions = best_lr_model.predict(scaled_X_validation)
accuracy_score(y_validation, lr_validation_predictions)

0.8378378378378378

In [32]:
lr_holdout_predictions = best_lr_model.predict(scaled_X_holdout_test)
accuracy_score(y_holdout_test, lr_holdout_predictions)

0.8947368421052632

## Support Vector Machine

In [33]:
from sklearn.svm import SVC

In [34]:
svc_param_grid = {'kernel': ['linear', 'poly','rbf','sigmoid'],
                  'C': [0.05, 0.01, 0.5, 0.1, 1]}

In [35]:
svc_model = SVC(random_state=42)
svc_grid_model = GridSearchCV(svc_model,param_grid=svc_param_grid)
svc_grid_model.fit(scaled_X_train, y_train)
best_svc_model = svc_grid_model.best_params_
best_svc_model

{'C': 0.01, 'kernel': 'linear'}

In [36]:
best_svc_model = SVC(random_state=42,C=0.01, kernel='linear')
best_svc_model.fit(scaled_X_train,y_train)
svc_validation_predictions = best_svc_model.predict(scaled_X_validation)
accuracy_score(y_validation, svc_validation_predictions)

0.8378378378378378

In [37]:
svc_holdout_predictions = best_svc_model.predict(scaled_X_holdout_test)
accuracy_score(y_holdout_test, svc_holdout_predictions)

0.868421052631579

## Random Forest

In [38]:
from sklearn.ensemble import RandomForestClassifier

In [39]:
rfc_param_grid = {'n_estimators': [10, 20, 50,100, 200, 500, 1000],
                  'max_features': ['sqrt', 'log2', None],
                  'bootstrap': [True, False],
                  'oob_score': [True, False]}

In [41]:
rfc_model = RandomForestClassifier(random_state=42)
rfc_grid_model = GridSearchCV(rfc_model, rfc_param_grid)
rfc_grid_model.fit(scaled_X_train,y_train)
best_rfc_model = rfc_grid_model.best_params_
best_rfc_model

105 fits failed out of a total of 420.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
105 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/ensemble/_forest.py", line 447, in fit
    raise ValueError("Out of bag estimation only available if bo

{'bootstrap': False,
 'max_features': 'log2',
 'n_estimators': 100,
 'oob_score': False}

In [42]:
best_rfc_model = RandomForestClassifier(random_state=42,bootstrap=False ,max_features='log2',n_estimators=100, oob_score=False)
best_rfc_model.fit(scaled_X_train,y_train)
rfc_validation_predictions = best_rfc_model.predict(scaled_X_validation)
accuracy_score(y_validation, rfc_validation_predictions)

0.8108108108108109

In [43]:
rfc_holdout_predictions = best_rfc_model.predict(scaled_X_holdout_test)
accuracy_score(y_holdout_test, rfc_holdout_predictions)

0.8157894736842105

## K-Nearest Neighbors 


In [44]:
from sklearn.neighbors import KNeighborsClassifier

In [45]:
knn_param_grid = {'n_neighbors' : [1,2,4,5,10,15],
             'weights' : ['uniform','distance'],
             'algorithm' : ['auto','ball_tree','kd_tree','brute']
             }

In [46]:
knn_model = KNeighborsClassifier()
knn_grid_model = GridSearchCV(knn_model, knn_param_grid)
knn_grid_model.fit(scaled_X_train,y_train)
best_knn_model = knn_grid_model.best_params_
best_knn_model

{'algorithm': 'auto', 'n_neighbors': 2, 'weights': 'uniform'}

In [47]:
best_knn_model = KNeighborsClassifier(algorithm='auto',n_neighbors=2,weights='uniform')
best_knn_model.fit(scaled_X_train,y_train)
knn_validation_predictions = best_knn_model.predict(scaled_X_validation)
accuracy_score(y_validation, knn_validation_predictions)

0.7837837837837838

In [48]:
knn_holdout_predictions = best_knn_model.predict(scaled_X_holdout_test)
accuracy_score(y_holdout_test, knn_holdout_predictions)

0.868421052631579

## Gradient Boosted Trees 

In [49]:
from sklearn.ensemble import GradientBoostingClassifier

In [50]:
gb_param_grid = {'n_estimators' : [50,100,500,1000,1500,2000],
             'learning_rate' : [0.01, 0.05, 0.1],
             'max_depth' : [3,4,5,10]}

In [51]:
gb_model = GradientBoostingClassifier(random_state=42)
gb_grid_model = GridSearchCV(gb_model, gb_param_grid)
gb_grid_model.fit(scaled_X_train,y_train)
best_gb_model = gb_grid_model.best_params_
best_gb_model

{'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 50}

In [52]:
best_gb_model = GradientBoostingClassifier(random_state=42, learning_rate=0.05,max_depth=3,n_estimators=50)
best_gb_model.fit(scaled_X_train,y_train)
gb_validation_predictions = best_gb_model.predict(scaled_X_validation)
accuracy_score(y_validation, gb_validation_predictions)

0.8378378378378378

In [53]:
gb_holdout_predictions = best_gb_model.predict(scaled_X_holdout_test)
accuracy_score(y_holdout_test, gb_holdout_predictions)

0.8421052631578947

# 80/20

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

## Logistic Regression

In [55]:
lr_grid_model = GridSearchCV(log_model,param_grid=lr_param_grid)
lr_grid_model.fit(scaled_X_train,y_train)
best_lr_model = lr_grid_model.best_params_
best_lr_model



{'C': 0.001, 'max_iter': 100, 'solver': 'saga'}

In [56]:
best_lr_model = LogisticRegression(random_state=42,C=0.05,max_iter=100,solver='saga')
best_lr_model.fit(scaled_X_train, y_train)
lr_test_predictions = best_lr_model.predict(scaled_X_test)
accuracy_score(y_test, lr_test_predictions)

0.82

## Support Vector Machine

In [57]:
svc_grid_model = GridSearchCV(svc_model,param_grid=svc_param_grid)
svc_grid_model.fit(scaled_X_train,y_train)
best_svc_model = svc_grid_model.best_params_
best_svc_model

{'C': 0.5, 'kernel': 'poly'}

In [58]:
best_svc_model = SVC(random_state=42,C=0.5,kernel='poly')
best_svc_model.fit(scaled_X_train, y_train)
svc_test_predictions=best_svc_model.predict(scaled_X_test)
accuracy_score(y_test, svc_test_predictions)

0.82

## Random Forest

In [59]:
rfc_grid_model = GridSearchCV(rfc_model, rfc_param_grid)
rfc_grid_model.fit(scaled_X_train,y_train)
best_rfc_model = rfc_grid_model.best_params_
best_rfc_model

105 fits failed out of a total of 420.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
105 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/ensemble/_forest.py", line 447, in fit
    raise ValueError("Out of bag estimation only available if bo

{'bootstrap': False,
 'max_features': 'log2',
 'n_estimators': 20,
 'oob_score': False}

In [60]:
best_rfc_model = RandomForestClassifier(random_state=42,bootstrap=False,max_features='log2',n_estimators=20,oob_score=False)
best_rfc_model.fit(scaled_X_train, y_train)
rfc_test_predictions=best_rfc_model.predict(scaled_X_test)
accuracy_score(y_test, rfc_test_predictions)

0.8

## KNN

In [61]:
knn_grid_model = GridSearchCV(knn_model, knn_param_grid)
knn_grid_model.fit(scaled_X_train,y_train)
best_knn_model = knn_grid_model.best_params_
best_knn_model

{'algorithm': 'auto', 'n_neighbors': 4, 'weights': 'uniform'}

In [62]:
best_knn_model = KNeighborsClassifier(algorithm='auto',n_neighbors=4,weights='uniform')
best_knn_model.fit(scaled_X_train, y_train)
knn_test_predictions=best_knn_model.predict(scaled_X_test)
accuracy_score(y_test, knn_test_predictions)

0.8

## Gradient Boosted Trees

In [63]:
gb_grid_model = GridSearchCV(gb_model, gb_param_grid)
gb_grid_model.fit(scaled_X_train,y_train)
best_gb_model = gb_grid_model.best_params_
best_gb_model

{'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 100}

In [64]:
best_gb_model = GradientBoostingClassifier(random_state=42,learning_rate=0.01,max_depth=4,n_estimators=100)
best_gb_model.fit(scaled_X_train, y_train)
gb_test_predictions=best_gb_model.predict(scaled_X_test)
accuracy_score(y_test, gb_test_predictions)

0.84

# 50/50

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

## Logistic Regression

In [66]:
lr_grid_model = GridSearchCV(log_model,param_grid=lr_param_grid)
lr_grid_model.fit(scaled_X_train,y_train)
best_lr_model = lr_grid_model.best_params_
best_lr_model



{'C': 0.001, 'max_iter': 100, 'solver': 'saga'}

In [67]:
best_lr_model = LogisticRegression(random_state=42,C=0.01,max_iter=100,solver='saga')
best_lr_model.fit(scaled_X_train, y_train)
lr_test_predictions = best_lr_model.predict(scaled_X_test)
accuracy_score(y_test, lr_test_predictions)

0.8145161290322581

## Support Vector Machine

In [68]:
svc_grid_model = GridSearchCV(svc_model,param_grid=svc_param_grid)
svc_grid_model.fit(scaled_X_train,y_train)
best_svc_model = svc_grid_model.best_params_
best_svc_model

{'C': 0.05, 'kernel': 'poly'}

In [69]:
best_svc_model = SVC(random_state=42,C=0.05,kernel='poly')
best_svc_model.fit(scaled_X_train, y_train)
svc_test_predictions=best_svc_model.predict(scaled_X_test)
accuracy_score(y_test, svc_test_predictions)

0.8145161290322581

## Random Forest

In [70]:
rfc_grid_model = GridSearchCV(rfc_model, rfc_param_grid)
rfc_grid_model.fit(scaled_X_train,y_train)
best_rfc_model = rfc_grid_model.best_params_
best_rfc_model

105 fits failed out of a total of 420.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
105 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/ensemble/_forest.py", line 447, in fit
    raise ValueError("Out of bag estimation only available if bo

{'bootstrap': True,
 'max_features': 'log2',
 'n_estimators': 50,
 'oob_score': True}

In [71]:
best_rfc_model = RandomForestClassifier(random_state=42,bootstrap=True,max_features='log2',n_estimators=50,oob_score=True)
best_rfc_model.fit(scaled_X_train, y_train)
rfc_test_predictions=best_rfc_model.predict(scaled_X_test)
accuracy_score(y_test, rfc_test_predictions)

0.8064516129032258

## KNN

In [72]:
knn_grid_model = GridSearchCV(knn_model, knn_param_grid)
knn_grid_model.fit(scaled_X_train,y_train)
best_knn_model = knn_grid_model.best_params_
best_knn_model

{'algorithm': 'auto', 'n_neighbors': 4, 'weights': 'uniform'}

In [73]:
best_knn_model = KNeighborsClassifier(algorithm='auto',n_neighbors=4,weights='uniform')
best_knn_model.fit(scaled_X_train, y_train)
knn_test_predictions=best_knn_model.predict(scaled_X_test)
accuracy_score(y_test, knn_test_predictions)

0.8145161290322581

## Gradient Boosted Trees

In [74]:
gb_grid_model = GridSearchCV(gb_model, gb_param_grid)
gb_grid_model.fit(scaled_X_train,y_train)
best_gb_model = gb_grid_model.best_params_
best_gb_model

{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50}

In [75]:
best_gb_model = GradientBoostingClassifier(random_state=42,learning_rate=0.01,max_depth=3,n_estimators=50)
best_gb_model.fit(scaled_X_train, y_train)
gb_test_predictions=best_gb_model.predict(scaled_X_test)
accuracy_score(y_test, gb_test_predictions)

0.8145161290322581

# Maternal Health Risk

In [77]:
maternal_health_risk = fetch_ucirepo(id=863) 
x = maternal_health_risk.data.features 
y = maternal_health_risk.data.targets 
maternal_df = pd.concat([x, y], axis=1)
maternal_df.head()

Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate,RiskLevel
0,25,130,80,15.0,98.0,86,high risk
1,35,140,90,13.0,98.0,70,high risk
2,29,90,70,8.0,100.0,80,high risk
3,30,140,85,7.0,98.0,70,high risk
4,35,120,60,6.1,98.0,76,low risk


In [78]:
maternal_df.shape

(1014, 7)

In [79]:
maternal_df['RiskLevel'].unique()

array(['high risk', 'low risk', 'mid risk'], dtype=object)

In [80]:
def risklevel_conv(string):
    if string =='high risk':
        return 1
    elif string =='mid risk':
        return 0 
    elif string =='low risk':
        return -1
    else:
        return None

In [81]:
maternal_df['RiskLevel'] = maternal_df['RiskLevel'].apply(risklevel_conv)
maternal_df.head()

Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate,RiskLevel
0,25,130,80,15.0,98.0,86,1
1,35,140,90,13.0,98.0,70,1
2,29,90,70,8.0,100.0,80,1
3,30,140,85,7.0,98.0,70,1
4,35,120,60,6.1,98.0,76,-1


In [82]:
maternal_df['RiskLevel'].unique()

array([ 1, -1,  0])

In [83]:
maternal_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1014 entries, 0 to 1013
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          1014 non-null   int64  
 1   SystolicBP   1014 non-null   int64  
 2   DiastolicBP  1014 non-null   int64  
 3   BS           1014 non-null   float64
 4   BodyTemp     1014 non-null   float64
 5   HeartRate    1014 non-null   int64  
 6   RiskLevel    1014 non-null   int64  
dtypes: float64(2), int64(5)
memory usage: 55.6 KB


In [None]:
X = maternal_df.remove(columns='RiskLevel'])
Y = maternal_df['RiskLevel']

In [None]:
#Doctor Prediction

In [None]:
national_poll_on_healthy_aging = fetch_ucirepo(id=936) 
x = national_poll_on_healthy_aging.data.features 
y = national_poll_on_healthy_aging.data.targets 

In [None]:
doctor_df = pd.concat([x, y], axis=1)
doctor_df.head()

In [None]:
doctor_df.shape

In [None]:
#Cirrhosis

In [None]:
cirrhosis_patient_survival = fetch_ucirepo(id=878) 
x = cirrhosis_patient_survival.data.features 
y = cirrhosis_patient_survival.data.targets 

In [None]:
cirrhosis_df = pd.concat([x, y], axis=1)
cirrhosis_df.head()

In [None]:
cirrhosis_df.shape

In [None]:
risk_factor_prediction_of = fetch_ucirepo(id=857)
x = risk_factor_prediction_of.data.features 
y = risk_factor_prediction_of.data.targets 

In [None]:
kidney_df = pd.concat([x, y], axis=1)
kidney_df.head()

In [None]:
kidney_df.columns

In [None]:
kidney_df['pot']

In [None]:
kidney_df['hemo']