In [1]:
!pip install ucimlrepo
!pip install scikit-learn



In [2]:
import pandas as pd
import sklearn
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 
from ucimlrepo import fetch_ucirepo 
from sklearn.model_selection import train_test_split

In [3]:
autistic_spectrum_disorder = fetch_ucirepo(id=419) 
x = autistic_spectrum_disorder.data.features 
y = autistic_spectrum_disorder.data.targets
autistic_df = pd.concat([x, y], axis=1)
autistic_df.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,gender,ethnicity,jaundice,autism,country_of_res,used_app_before,result,age_desc,relation,class
0,1,1,0,0,1,1,0,1,0,0,...,m,Others,no,no,Jordan,no,5,'4-11 years',Parent,NO
1,1,1,0,0,1,1,0,1,0,0,...,m,'Middle Eastern ',no,no,Jordan,no,5,'4-11 years',Parent,NO
2,1,1,0,0,0,1,1,1,0,0,...,m,,no,no,Jordan,yes,5,'4-11 years',,NO
3,0,1,0,0,1,1,0,0,0,1,...,f,,yes,no,Jordan,no,4,'4-11 years',,NO
4,1,1,1,1,1,1,1,1,1,1,...,m,Others,yes,no,'United States',no,10,'4-11 years',Parent,YES


In [4]:
autistic_df.columns

Index(['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score',
       'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'age', 'gender',
       'ethnicity', 'jaundice', 'autism', 'country_of_res', 'used_app_before',
       'result', 'age_desc', 'relation', 'class'],
      dtype='object')

In [5]:
autistic_df['autism'].unique()

array(['no', 'yes'], dtype=object)

In [6]:
#checking for any null values
autistic_df.isna().sum()

A1_Score            0
A2_Score            0
A3_Score            0
A4_Score            0
A5_Score            0
A6_Score            0
A7_Score            0
A8_Score            0
A9_Score            0
A10_Score           0
age                 4
gender              0
ethnicity          43
jaundice            0
autism              0
country_of_res      0
used_app_before     0
result              0
age_desc            0
relation           43
class               0
dtype: int64

In [7]:
#since it is hard to predict ethnicity for any model, lets drop these values
autistic_df = autistic_df.dropna(subset=['ethnicity'])
autistic_df.isna().sum()

A1_Score           0
A2_Score           0
A3_Score           0
A4_Score           0
A5_Score           0
A6_Score           0
A7_Score           0
A8_Score           0
A9_Score           0
A10_Score          0
age                1
gender             0
ethnicity          0
jaundice           0
autism             0
country_of_res     0
used_app_before    0
result             0
age_desc           0
relation           0
class              0
dtype: int64

In [8]:
#considering there is only 1 nan value left, let's drop it because 1 entry will not significantly impact a models accuracy and effectiveness
autistic_df = autistic_df.dropna(subset=['age'])
autistic_df.isna().sum()

A1_Score           0
A2_Score           0
A3_Score           0
A4_Score           0
A5_Score           0
A6_Score           0
A7_Score           0
A8_Score           0
A9_Score           0
A10_Score          0
age                0
gender             0
ethnicity          0
jaundice           0
autism             0
country_of_res     0
used_app_before    0
result             0
age_desc           0
relation           0
class              0
dtype: int64

In [9]:
autistic_df.shape

(248, 21)

In [10]:
autistic_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 248 entries, 0 to 291
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   A1_Score         248 non-null    int64  
 1   A2_Score         248 non-null    int64  
 2   A3_Score         248 non-null    int64  
 3   A4_Score         248 non-null    int64  
 4   A5_Score         248 non-null    int64  
 5   A6_Score         248 non-null    int64  
 6   A7_Score         248 non-null    int64  
 7   A8_Score         248 non-null    int64  
 8   A9_Score         248 non-null    int64  
 9   A10_Score        248 non-null    int64  
 10  age              248 non-null    float64
 11  gender           248 non-null    object 
 12  ethnicity        248 non-null    object 
 13  jaundice         248 non-null    object 
 14  autism           248 non-null    object 
 15  country_of_res   248 non-null    object 
 16  used_app_before  248 non-null    object 
 17  result           248 

In [11]:
autistic_df['jaundice'].unique()

array(['no', 'yes'], dtype=object)

In [12]:
autistic_df['ethnicity'].unique()

array(['Others', "'Middle Eastern '", 'White-European', 'Black',
       "'South Asian'", 'Asian', 'Pasifika', 'Hispanic', 'Turkish',
       'Latino'], dtype=object)

In [13]:
my_object_df = autistic_df.select_dtypes(include='object')
my_numeric_df = autistic_df.select_dtypes(exclude='object')

In [14]:
my_object_df.shape

(248, 9)

In [15]:
autistic_df_objects_dummies = pd.get_dummies(my_object_df, drop_first=True).astype(int)
autistic_df_objects_dummies.head()

Unnamed: 0,gender_m,ethnicity_'South Asian',ethnicity_Asian,ethnicity_Black,ethnicity_Hispanic,ethnicity_Latino,ethnicity_Others,ethnicity_Pasifika,ethnicity_Turkish,ethnicity_White-European,...,country_of_res_Romania,country_of_res_Sweden,country_of_res_Syria,country_of_res_Turkey,used_app_before_yes,relation_Parent,relation_Relative,relation_Self,relation_self,class_YES
0,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
6,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,1
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1


In [16]:
autistic_df_objects_dummies.columns

Index(['gender_m', 'ethnicity_'South Asian'', 'ethnicity_Asian',
       'ethnicity_Black', 'ethnicity_Hispanic', 'ethnicity_Latino',
       'ethnicity_Others', 'ethnicity_Pasifika', 'ethnicity_Turkish',
       'ethnicity_White-European', 'jaundice_yes', 'autism_yes',
       'country_of_res_'Isle of Man'', 'country_of_res_'New Zealand'',
       'country_of_res_'Saudi Arabia'', 'country_of_res_'South Africa'',
       'country_of_res_'South Korea'',
       'country_of_res_'U.S. Outlying Islands'',
       'country_of_res_'United Arab Emirates'',
       'country_of_res_'United Kingdom'', 'country_of_res_'United States'',
       'country_of_res_Afghanistan', 'country_of_res_Argentina',
       'country_of_res_Armenia', 'country_of_res_Australia',
       'country_of_res_Austria', 'country_of_res_Bahrain',
       'country_of_res_Bangladesh', 'country_of_res_Bhutan',
       'country_of_res_Brazil', 'country_of_res_Bulgaria',
       'country_of_res_Canada', 'country_of_res_Egypt',
       'country

In [17]:
autistic_df = pd.concat([my_numeric_df,autistic_df_objects_dummies], axis=1)
autistic_df

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,country_of_res_Romania,country_of_res_Sweden,country_of_res_Syria,country_of_res_Turkey,used_app_before_yes,relation_Parent,relation_Relative,relation_Self,relation_self,class_YES
0,1,1,0,0,1,1,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,1,0,0,1,1,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,1,0,0,0,1
6,1,0,1,1,1,1,0,1,0,1,...,0,0,0,0,0,1,0,0,0,1
7,1,1,1,1,1,1,1,1,0,0,...,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
287,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,1,0,0,0,1
288,1,0,0,0,1,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
289,1,0,1,1,1,1,1,0,0,1,...,0,0,0,0,0,1,0,0,0,1
290,1,1,1,0,1,1,1,1,1,1,...,0,0,0,0,0,1,0,0,0,1


In [18]:
autistic_df.columns

Index(['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score',
       'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'age', 'result',
       'gender_m', 'ethnicity_'South Asian'', 'ethnicity_Asian',
       'ethnicity_Black', 'ethnicity_Hispanic', 'ethnicity_Latino',
       'ethnicity_Others', 'ethnicity_Pasifika', 'ethnicity_Turkish',
       'ethnicity_White-European', 'jaundice_yes', 'autism_yes',
       'country_of_res_'Isle of Man'', 'country_of_res_'New Zealand'',
       'country_of_res_'Saudi Arabia'', 'country_of_res_'South Africa'',
       'country_of_res_'South Korea'',
       'country_of_res_'U.S. Outlying Islands'',
       'country_of_res_'United Arab Emirates'',
       'country_of_res_'United Kingdom'', 'country_of_res_'United States'',
       'country_of_res_Afghanistan', 'country_of_res_Argentina',
       'country_of_res_Armenia', 'country_of_res_Australia',
       'country_of_res_Austria', 'country_of_res_Bahrain',
       'country_of_res_Bangladesh', 'co

In [19]:
autistic_df.shape

(248, 75)

In [20]:
X = autistic_df.drop('autism_yes',axis=1)
y = autistic_df['autism_yes']

# 70-15-15

## Logistic Regression

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_validation, X_holdout_test, y_validation, y_holdout_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [22]:
from sklearn.preprocessing import StandardScaler

In [23]:
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)
scaled_X_holdout_test = scaler.transform(X_holdout_test)
scaled_X_validation = scaler.fit_transform(X_validation)

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [25]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [26]:
lr_param_grid = {'C' : [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 10, 100],
                 'max_iter': [100,500,1000, 1500, 2500, 3000],
                 'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
                'penalty': ['l1', 'l2', 'elasticnet', None]
                }

In [27]:
log_model = LogisticRegression(random_state=42)
lr_grid_model = GridSearchCV(log_model,param_grid=lr_param_grid)
lr_grid_model.fit(scaled_X_train,y_train)
lr_grid_model.best_params_

Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further opti

{'C': 0.001, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}

In [38]:
best_lr_model = LogisticRegression(C=0.001, max_iter=100,penalty='l1',solver='liblinear',random_state=42)
best_lr_model.fit(scaled_X_train,y_train)
lr_validation_predictions = best_lr_model.predict(scaled_X_validation)
accuracy_score(y_validation, lr_validation_predictions)

0.8378378378378378

In [39]:
lr_holdout_predictions = best_lr_model.predict(scaled_X_holdout_test)
accuracy_score(y_holdout_test, lr_holdout_predictions)

0.8947368421052632

## Support Vector Machine

In [30]:
from sklearn.svm import SVC

In [31]:
svc_param_grid = {'kernel': ['linear', 'poly','rbf','sigmoid'],
                  'C': [0.05, 0.01, 0.5, 0.1, 1],
                 'gamma':['scale', 'auto'],
                 }

In [32]:
svc_model = SVC(random_state=42)
svc_grid_model = GridSearchCV(svc_model,param_grid=svc_param_grid)
svc_grid_model.fit(scaled_X_train, y_train)
best_svc_model = svc_grid_model.best_params_
best_svc_model

{'C': 0.01, 'gamma': 'scale', 'kernel': 'linear'}

In [40]:
best_svc_model = SVC(random_state=42,C=0.01,gamma='scale',kernel='linear')
best_svc_model.fit(scaled_X_train,y_train)
svc_validation_predictions = best_svc_model.predict(scaled_X_validation)
accuracy_score(y_validation, svc_validation_predictions)

0.8378378378378378

In [41]:
svc_holdout_predictions = best_svc_model.predict(scaled_X_holdout_test)
accuracy_score(y_holdout_test, svc_holdout_predictions)

0.868421052631579

## Random Forest

In [42]:
from sklearn.ensemble import RandomForestClassifier

In [43]:
rfc_param_grid = {'n_estimators': [10, 20, 50,100, 200, 500, 1000],
                  'max_features': ['sqrt', 'log2', None],
                  'bootstrap': [True, False],
                  'oob_score': [True, False]}

In [44]:
rfc_model = RandomForestClassifier(random_state=42)
rfc_grid_model = GridSearchCV(rfc_model, rfc_param_grid)
rfc_grid_model.fit(scaled_X_train,y_train)
best_rfc_model = rfc_grid_model.best_params_
best_rfc_model

105 fits failed out of a total of 420.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
105 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/ensemble/_forest.py", line 447, in fit
    raise ValueError("Out of bag estimation only available if bo

{'bootstrap': False,
 'max_features': 'log2',
 'n_estimators': 100,
 'oob_score': False}

In [45]:
best_rfc_model = RandomForestClassifier(random_state=42,bootstrap=False ,max_features='log2',n_estimators=100, oob_score=False)
best_rfc_model.fit(scaled_X_train,y_train)
rfc_validation_predictions = best_rfc_model.predict(scaled_X_validation)
accuracy_score(y_validation, rfc_validation_predictions)

0.8108108108108109

In [46]:
rfc_holdout_predictions = best_rfc_model.predict(scaled_X_holdout_test)
accuracy_score(y_holdout_test, rfc_holdout_predictions)

0.8157894736842105

## K-Nearest Neighbors 


In [47]:
from sklearn.neighbors import KNeighborsClassifier

In [48]:
knn_param_grid = {'n_neighbors' : [1,2,4,5,10,15],
             'weights' : ['uniform','distance'],
             'algorithm' : ['auto','ball_tree','kd_tree','brute']
             }

In [49]:
knn_model = KNeighborsClassifier()
knn_grid_model = GridSearchCV(knn_model, knn_param_grid)
knn_grid_model.fit(scaled_X_train,y_train)
best_knn_model = knn_grid_model.best_params_
best_knn_model

{'algorithm': 'auto', 'n_neighbors': 2, 'weights': 'uniform'}

In [50]:
best_knn_model = KNeighborsClassifier(algorithm='auto',n_neighbors=2,weights='uniform')
best_knn_model.fit(scaled_X_train,y_train)
knn_validation_predictions = best_knn_model.predict(scaled_X_validation)
accuracy_score(y_validation, knn_validation_predictions)

0.7837837837837838

In [51]:
knn_holdout_predictions = best_knn_model.predict(scaled_X_holdout_test)
accuracy_score(y_holdout_test, knn_holdout_predictions)

0.868421052631579

## Gradient Boosted Trees 

In [52]:
from sklearn.ensemble import GradientBoostingClassifier

In [124]:
gb_param_grid = {'n_estimators' : [50,100,500,1000],
             'learning_rate' : [0.01, 0.05, 0.1],
             'max_depth' : [3,4,5,10]}

In [54]:
gb_model = GradientBoostingClassifier(random_state=42)
gb_grid_model = GridSearchCV(gb_model, gb_param_grid)
gb_grid_model.fit(scaled_X_train,y_train)
best_gb_model = gb_grid_model.best_params_
best_gb_model

KeyboardInterrupt: 

In [None]:
best_gb_model = GradientBoostingClassifier(random_state=42, learning_rate=0.05,max_depth=3,n_estimators=50)
best_gb_model.fit(scaled_X_train,y_train)
gb_validation_predictions = best_gb_model.predict(scaled_X_validation)
accuracy_score(y_validation, gb_validation_predictions)

In [None]:
gb_holdout_predictions = best_gb_model.predict(scaled_X_holdout_test)
accuracy_score(y_holdout_test, gb_holdout_predictions)

# 80/20

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

## Logistic Regression

In [56]:
lr_grid_model = GridSearchCV(log_model,param_grid=lr_param_grid)
lr_grid_model.fit(scaled_X_train,y_train)
best_lr_model = lr_grid_model.best_params_
best_lr_model

Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further opti

{'C': 0.001, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}

In [59]:
best_lr_model = LogisticRegression(random_state=42,C=0.001,max_iter=100,penalty='l1',solver='liblinear')
best_lr_model.fit(scaled_X_train, y_train)
lr_test_predictions = best_lr_model.predict(scaled_X_test)
accuracy_score(y_test, lr_test_predictions)

0.84

## Support Vector Machine

In [60]:
svc_grid_model = GridSearchCV(svc_model,param_grid=svc_param_grid)
svc_grid_model.fit(scaled_X_train,y_train)
best_svc_model = svc_grid_model.best_params_
best_svc_model

{'C': 0.5, 'gamma': 'scale', 'kernel': 'poly'}

In [62]:
best_svc_model = SVC(random_state=42,C=0.5,gamma='scale',kernel='poly')
best_svc_model.fit(scaled_X_train, y_train)
svc_test_predictions=best_svc_model.predict(scaled_X_test)
accuracy_score(y_test, svc_test_predictions)

0.82

## Random Forest

In [None]:
rfc_grid_model = GridSearchCV(rfc_model, rfc_param_grid)
rfc_grid_model.fit(scaled_X_train,y_train)
best_rfc_model = rfc_grid_model.best_params_
best_rfc_model

In [None]:
best_rfc_model = RandomForestClassifier(random_state=42,bootstrap=False,max_features='log2',n_estimators=20,oob_score=False)
best_rfc_model.fit(scaled_X_train, y_train)
rfc_test_predictions=best_rfc_model.predict(scaled_X_test)
accuracy_score(y_test, rfc_test_predictions)

## KNN

In [None]:
knn_grid_model = GridSearchCV(knn_model, knn_param_grid)
knn_grid_model.fit(scaled_X_train,y_train)
best_knn_model = knn_grid_model.best_params_
best_knn_model

In [None]:
best_knn_model = KNeighborsClassifier(algorithm='auto',n_neighbors=4,weights='uniform')
best_knn_model.fit(scaled_X_train, y_train)
knn_test_predictions=best_knn_model.predict(scaled_X_test)
accuracy_score(y_test, knn_test_predictions)

## Gradient Boosted Trees

In [None]:
gb_grid_model = GridSearchCV(gb_model, gb_param_grid)
gb_grid_model.fit(scaled_X_train,y_train)
best_gb_model = gb_grid_model.best_params_
best_gb_model

In [None]:
best_gb_model = GradientBoostingClassifier(random_state=42,learning_rate=0.01,max_depth=4,n_estimators=100)
best_gb_model.fit(scaled_X_train, y_train)
gb_test_predictions=best_gb_model.predict(scaled_X_test)
accuracy_score(y_test, gb_test_predictions)

# 50/50

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

## Logistic Regression

In [66]:
lr_grid_model = GridSearchCV(log_model,param_grid=lr_param_grid)
lr_grid_model.fit(scaled_X_train,y_train)
best_lr_model = lr_grid_model.best_params_
best_lr_model

Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further opti

{'C': 0.001, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}


2970 fits failed out of a total of 6480.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
270 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/linear_model/_logistic.py", line 1218, in fit
    solver = _check_solver(self.solver, self.penalty, s

In [68]:
best_lr_model = LogisticRegression(random_state=42,C=0.001,max_iter=100,penalty='l1',solver='liblinear')
best_lr_model.fit(scaled_X_train, y_train)
lr_test_predictions = best_lr_model.predict(scaled_X_test)
accuracy_score(y_test, lr_test_predictions)

0.8145161290322581

## Support Vector Machine

In [69]:
svc_grid_model = GridSearchCV(svc_model,param_grid=svc_param_grid)
svc_grid_model.fit(scaled_X_train,y_train)
best_svc_model = svc_grid_model.best_params_
best_svc_model

{'C': 0.05, 'gamma': 'scale', 'kernel': 'poly'}

In [70]:
best_svc_model = SVC(random_state=42,C=0.05,gamma='scale',kernel='poly')
best_svc_model.fit(scaled_X_train, y_train)
svc_test_predictions=best_svc_model.predict(scaled_X_test)
accuracy_score(y_test, svc_test_predictions)

0.8145161290322581

## Random Forest

In [None]:
rfc_grid_model = GridSearchCV(rfc_model, rfc_param_grid)
rfc_grid_model.fit(scaled_X_train,y_train)
best_rfc_model = rfc_grid_model.best_params_
best_rfc_model

In [None]:
best_rfc_model = RandomForestClassifier(random_state=42,bootstrap=True,max_features='log2',n_estimators=50,oob_score=True)
best_rfc_model.fit(scaled_X_train, y_train)
rfc_test_predictions=best_rfc_model.predict(scaled_X_test)
accuracy_score(y_test, rfc_test_predictions)

## KNN

In [None]:
knn_grid_model = GridSearchCV(knn_model, knn_param_grid)
knn_grid_model.fit(scaled_X_train,y_train)
best_knn_model = knn_grid_model.best_params_
best_knn_model

In [None]:
best_knn_model = KNeighborsClassifier(algorithm='auto',n_neighbors=4,weights='uniform')
best_knn_model.fit(scaled_X_train, y_train)
knn_test_predictions=best_knn_model.predict(scaled_X_test)
accuracy_score(y_test, knn_test_predictions)

## Gradient Boosted Trees

In [None]:
gb_grid_model = GridSearchCV(gb_model, gb_param_grid)
gb_grid_model.fit(scaled_X_train,y_train)
best_gb_model = gb_grid_model.best_params_
best_gb_model

In [None]:
best_gb_model = GradientBoostingClassifier(random_state=42,learning_rate=0.01,max_depth=3,n_estimators=50)
best_gb_model.fit(scaled_X_train, y_train)
gb_test_predictions=best_gb_model.predict(scaled_X_test)
accuracy_score(y_test, gb_test_predictions)

# Maternal Health Risk

In [85]:
maternal_health_risk = fetch_ucirepo(id=863) 
x = maternal_health_risk.data.features 
y = maternal_health_risk.data.targets 
maternal_df = pd.concat([x, y], axis=1)
maternal_df.head()

ERROR! Session/line number was not unique in database. History logging moved to new session 581


Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate,RiskLevel
0,25,130,80,15.0,98.0,86,high risk
1,35,140,90,13.0,98.0,70,high risk
2,29,90,70,8.0,100.0,80,high risk
3,30,140,85,7.0,98.0,70,high risk
4,35,120,60,6.1,98.0,76,low risk


In [86]:
maternal_df.shape

(1014, 7)

In [87]:
maternal_df['RiskLevel'].unique()

array(['high risk', 'low risk', 'mid risk'], dtype=object)

In [88]:
def risklevel_conv(string):
    if string =='high risk':
        return 1
    elif string =='mid risk':
        return 0 
    elif string =='low risk':
        return -1
    else:
        return None

In [89]:
maternal_df['RiskLevel'] = maternal_df['RiskLevel'].apply(risklevel_conv)
maternal_df.head()

Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate,RiskLevel
0,25,130,80,15.0,98.0,86,1
1,35,140,90,13.0,98.0,70,1
2,29,90,70,8.0,100.0,80,1
3,30,140,85,7.0,98.0,70,1
4,35,120,60,6.1,98.0,76,-1


In [90]:
maternal_df['RiskLevel'].unique()

array([ 1, -1,  0])

In [91]:
maternal_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1014 entries, 0 to 1013
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          1014 non-null   int64  
 1   SystolicBP   1014 non-null   int64  
 2   DiastolicBP  1014 non-null   int64  
 3   BS           1014 non-null   float64
 4   BodyTemp     1014 non-null   float64
 5   HeartRate    1014 non-null   int64  
 6   RiskLevel    1014 non-null   int64  
dtypes: float64(2), int64(5)
memory usage: 55.6 KB


In [92]:
X = maternal_df.drop(columns='RiskLevel',axis=1)
y = maternal_df['RiskLevel']

# 70/15/15

In [93]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_validation, X_holdout_test, y_validation, y_holdout_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [94]:
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)
scaled_X_holdout_test = scaler.transform(X_holdout_test)
scaled_X_validation = scaler.fit_transform(X_validation)

## Logistic Regression

In [95]:
log_model = LogisticRegression(random_state=42)
lr_grid_model = GridSearchCV(log_model,param_grid=lr_param_grid)
lr_grid_model.fit(scaled_X_train,y_train)
lr_grid_model.best_params_

2970 fits failed out of a total of 6480.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
270 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/linear_model/_logistic.py", line 1218, in fit
    solver = _check_solver(self.solver, self.penalty, s

{'C': 0.05, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}

In [96]:
best_lr_model = LogisticRegression(C=0.05, max_iter=100,penalty='l2',solver='liblinear',random_state=42)
best_lr_model.fit(scaled_X_train,y_train)
lr_validation_predictions = best_lr_model.predict(scaled_X_validation)
accuracy_score(y_validation, lr_validation_predictions)



0.618421052631579

In [97]:
lr_holdout_predictions = best_lr_model.predict(scaled_X_holdout_test)
accuracy_score(y_holdout_test, lr_holdout_predictions)

0.6535947712418301

## Support Vector Machine

In [98]:
svc_model = SVC(random_state=42)
svc_grid_model = GridSearchCV(svc_model,param_grid=svc_param_grid)
svc_grid_model.fit(scaled_X_train, y_train)
best_svc_model = svc_grid_model.best_params_
best_svc_model

{'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}

In [99]:
best_svc_model = SVC(random_state=42,C=1,gamma='scale',kernel='rbf')
best_svc_model.fit(scaled_X_train,y_train)
svc_validation_predictions = best_svc_model.predict(scaled_X_validation)
accuracy_score(y_validation, svc_validation_predictions)

0.6776315789473685

In [100]:
svc_holdout_predictions = best_svc_model.predict(scaled_X_holdout_test)
accuracy_score(y_holdout_test, svc_holdout_predictions)

0.7254901960784313

## Random Forest

In [101]:
rfc_model = RandomForestClassifier(random_state=42)
rfc_grid_model = GridSearchCV(rfc_model, rfc_param_grid)
rfc_grid_model.fit(scaled_X_train,y_train)
best_rfc_model = rfc_grid_model.best_params_
best_rfc_model

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
105 fits failed out of a total of 420.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
105 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-

{'bootstrap': False,
 'max_features': 'sqrt',
 'n_estimators': 200,
 'oob_score': False}

In [102]:
best_rfc_model = RandomForestClassifier(random_state=42,bootstrap=False ,max_features='sqrt',n_estimators=200, oob_score=False)
best_rfc_model.fit(scaled_X_train,y_train)
rfc_validation_predictions = best_rfc_model.predict(scaled_X_validation)
accuracy_score(y_validation, rfc_validation_predictions)

0.6710526315789473

In [103]:
rfc_holdout_predictions = best_rfc_model.predict(scaled_X_holdout_test)
accuracy_score(y_holdout_test, rfc_holdout_predictions)

0.7581699346405228

## K-Nearest Neighbors

In [104]:
knn_model = KNeighborsClassifier()
knn_grid_model = GridSearchCV(knn_model, knn_param_grid)
knn_grid_model.fit(scaled_X_train,y_train)
best_knn_model = knn_grid_model.best_params_
best_knn_model

{'algorithm': 'auto', 'n_neighbors': 15, 'weights': 'distance'}

In [105]:
best_knn_model = KNeighborsClassifier(algorithm='auto',n_neighbors=15,weights='distance')
best_knn_model.fit(scaled_X_train,y_train)
knn_validation_predictions = best_knn_model.predict(scaled_X_validation)
accuracy_score(y_validation, knn_validation_predictions)

0.7105263157894737

In [106]:
knn_holdout_predictions = best_knn_model.predict(scaled_X_holdout_test)
accuracy_score(y_holdout_test, knn_holdout_predictions)

0.7516339869281046

# Gradient Boosted Trees

In [107]:
gb_model = GradientBoostingClassifier(random_state=42)
gb_grid_model = GridSearchCV(gb_model, gb_param_grid)
gb_grid_model.fit(scaled_X_train,y_train)
best_gb_model = gb_grid_model.best_params_
best_gb_model

{'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 2000}

In [108]:
best_gb_model = GradientBoostingClassifier(random_state=42, learning_rate=0.01,max_depth=5,n_estimators=2000)
best_gb_model.fit(scaled_X_train,y_train)
gb_validation_predictions = best_gb_model.predict(scaled_X_validation)
accuracy_score(y_validation, gb_validation_predictions)

0.6644736842105263

In [109]:
gb_holdout_predictions = best_gb_model.predict(scaled_X_holdout_test)
accuracy_score(y_holdout_test, gb_holdout_predictions)

0.7777777777777778

# 80/20

In [110]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

## Logistic Regression

In [111]:
lr_grid_model = GridSearchCV(log_model,param_grid=lr_param_grid)
lr_grid_model.fit(scaled_X_train,y_train)
best_lr_model = lr_grid_model.best_params_
best_lr_model

2970 fits failed out of a total of 6480.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
270 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/linear_model/_logistic.py", line 1218, in fit
    solver = _check_solver(self.solver, self.penalty, s

{'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}

In [112]:
best_lr_model = LogisticRegression(random_state=42,C=0.1,max_iter=100,penalty='l2',solver='liblinear')
best_lr_model.fit(scaled_X_train, y_train)
lr_test_predictions = best_lr_model.predict(scaled_X_test)
accuracy_score(y_test, lr_test_predictions)



0.6502463054187192

## SVM

In [113]:
svc_grid_model = GridSearchCV(svc_model,param_grid=svc_param_grid)
svc_grid_model.fit(scaled_X_train,y_train)
best_svc_model = svc_grid_model.best_params_
best_svc_model

{'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}

In [114]:
best_svc_model = SVC(random_state=42,C=1,gamma='scale',kernel='rbf')
best_svc_model.fit(scaled_X_train, y_train)
svc_test_predictions=best_svc_model.predict(scaled_X_test)
accuracy_score(y_test, svc_test_predictions)

0.6798029556650246

## Random Forest

In [115]:
rfc_grid_model = GridSearchCV(rfc_model, rfc_param_grid)
rfc_grid_model.fit(scaled_X_train,y_train)
best_rfc_model = rfc_grid_model.best_params_
best_rfc_model

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
105 fits failed out of a total of 420.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
105 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-

{'bootstrap': False,
 'max_features': None,
 'n_estimators': 20,
 'oob_score': False}

In [117]:
best_rfc_model = RandomForestClassifier(random_state=42,bootstrap=False,max_features=None,n_estimators=20,oob_score=False)
best_rfc_model.fit(scaled_X_train, y_train)
rfc_test_predictions=best_rfc_model.predict(scaled_X_test)
accuracy_score(y_test, rfc_test_predictions)

0.8177339901477833

## KNN

In [118]:
knn_grid_model = GridSearchCV(knn_model, knn_param_grid)
knn_grid_model.fit(scaled_X_train,y_train)
best_knn_model = knn_grid_model.best_params_
best_knn_model

{'algorithm': 'auto', 'n_neighbors': 15, 'weights': 'distance'}

In [119]:
best_knn_model = KNeighborsClassifier(algorithm='auto',n_neighbors=15,weights='distance')
best_knn_model.fit(scaled_X_train, y_train)
knn_test_predictions=best_knn_model.predict(scaled_X_test)
accuracy_score(y_test, knn_test_predictions)

0.8078817733990148

In [122]:
random_states = [42, 42, 101]
knn_accuracies = []

for rs in random_states:
    print(f"\n--- Trial with random_state = {rs} ---")

    # Train/Test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rs)

    # Scaling
    scaler = StandardScaler()
    scaled_X_train = scaler.fit_transform(X_train)
    scaled_X_test = scaler.transform(X_test)

    # Grid Search
    knn_grid_model = GridSearchCV(knn_model, knn_param_grid)
    knn_grid_model.fit(scaled_X_train, y_train)

    best_knn_params = knn_grid_model.best_params_
    print("Best params:", best_knn_params)

    # Final KNN model (NO random_state)
    best_knn_model = KNeighborsClassifier(**best_knn_params)
    best_knn_model.fit(scaled_X_train, y_train)

    # Predictions
    preds = best_knn_model.predict(scaled_X_test)
    acc = accuracy_score(y_test, preds)
    knn_accuracies.append(acc)

    print(f"Accuracy: {acc:.4f}")

print("\nAll accuracies:", knn_accuracies)
print("Average accuracy:", sum(knn_accuracies) / len(knn_accuracies))


--- Trial with random_state = 42 ---
Best params: {'algorithm': 'auto', 'n_neighbors': 15, 'weights': 'distance'}
Accuracy: 0.8079

--- Trial with random_state = 42 ---
Best params: {'algorithm': 'auto', 'n_neighbors': 15, 'weights': 'distance'}
Accuracy: 0.8079

--- Trial with random_state = 101 ---
Best params: {'algorithm': 'auto', 'n_neighbors': 15, 'weights': 'distance'}
Accuracy: 0.8522

All accuracies: [0.8078817733990148, 0.8078817733990148, 0.8522167487684729]
Average accuracy: 0.8226600985221676


## Gradient Boosted Trees

In [120]:
gb_grid_model = GridSearchCV(gb_model, gb_param_grid)
gb_grid_model.fit(scaled_X_train,y_train)
best_gb_model = gb_grid_model.best_params_
best_gb_model

KeyboardInterrupt: 

In [None]:
best_gb_model = GradientBoostingClassifier(random_state=42,learning_rate=0.01,max_depth=4,n_estimators=100)
best_gb_model.fit(scaled_X_train, y_train)
gb_test_predictions=best_gb_model.predict(scaled_X_test)
accuracy_score(y_test, gb_test_predictions)

In [126]:
gb_model

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,n_estimators,100
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


In [127]:
random_states = [41, 42, 101]
gb_accuracies = []

for rs in random_states:
    print(f"\n--- Trial with random_state = {rs} ---")

    # 1. Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rs)

    # 2. Scaling
    scaler = StandardScaler()
    scaled_X_train = scaler.fit_transform(X_train)
    scaled_X_test = scaler.transform(X_test)

    # 3. Grid search
    gb_grid_model = GridSearchCV(gb_model, gb_param_grid)
    gb_grid_model.fit(scaled_X_train, y_train)

    best_gb_params = gb_grid_model.best_params_
    print("Best params:", best_gb_params)

    # 4. Final model using best params
    best_gb_model = GradientBoostingClassifier(random_state=rs,**best_gb_params)
    best_gb_model.fit(scaled_X_train, y_train)

    # 5. Evaluate
    preds = best_gb_model.predict(scaled_X_test)
    acc = accuracy_score(y_test, preds)
    gb_accuracies.append(acc)

    print(f"Accuracy: {acc:.4f}")

print("\nAll accuracies:", gb_accuracies)
print("Average Gradient Boosting accuracy:", sum(gb_accuracies)/len(gb_accuracies))


--- Trial with random_state = 41 ---
Best params: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 1000}
Accuracy: 0.8473

--- Trial with random_state = 42 ---


KeyboardInterrupt: 

# 50/50

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

## Logistic Regression

In [None]:
lr_grid_model = GridSearchCV(log_model,param_grid=lr_param_grid)
lr_grid_model.fit(scaled_X_train,y_train)
best_lr_model = lr_grid_model.best_params_
best_lr_model

In [None]:
best_lr_model = LogisticRegression(random_state=42,C=0.01,max_iter=100,solver='saga')
best_lr_model.fit(scaled_X_train, y_train)
lr_test_predictions = best_lr_model.predict(scaled_X_test)
accuracy_score(y_test, lr_test_predictions)

In [128]:
random_states = [41, 42, 101]
lr_accuracies = []

for rs in random_states:
    print(f"\n--- Trial with random_state = {rs} ---")

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=rs)
    scaler = StandardScaler()
    scaled_X_train = scaler.fit_transform(X_train)
    scaled_X_test = scaler.transform(X_test)
    
    lr_grid_model = GridSearchCV(log_model, param_grid=lr_param_grid)
    lr_grid_model.fit(scaled_X_train, y_train)

    best_lr_params = lr_grid_model.best_params_
    print("Best params:", best_lr_params)
    best_lr_model = LogisticRegression(random_state=rs,**best_lr_params)
    best_lr_model.fit(scaled_X_train, y_train)
    preds = best_lr_model.predict(scaled_X_test)
    acc = accuracy_score(y_test, preds)
    lr_accuracies.append(acc)

    print(f"Accuracy: {acc:.4f}")

print("\nAll accuracies:", lr_accuracies)
print("Average Logistic Regression accuracy:", sum(lr_accuracies)/len(lr_accuracies))


--- Trial with random_state = 41 ---


2970 fits failed out of a total of 6480.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
270 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/linear_model/_logistic.py", line 1218, in fit
    solver = _check_solver(self.solver, self.penalty, s

Best params: {'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy: 0.6529

--- Trial with random_state = 42 ---


2970 fits failed out of a total of 6480.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
270 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/linear_model/_logistic.py", line 1218, in fit
    solver = _check_solver(self.solver, self.penalty, s

Best params: {'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy: 0.6627

--- Trial with random_state = 101 ---




Best params: {'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy: 0.6607

All accuracies: [0.6528599605522682, 0.6627218934911243, 0.6607495069033531]
Average Logistic Regression accuracy: 0.6587771203155819


2970 fits failed out of a total of 6480.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
270 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/linear_model/_logistic.py", line 1218, in fit
    solver = _check_solver(self.solver, self.penalty, s

In [130]:
print("\nAll accuracies:", lr_accuracies)
print("Average Logistic Regression accuracy:", sum(lr_accuracies)/len(lr_accuracies))


All accuracies: [0.6528599605522682, 0.6627218934911243, 0.6607495069033531]
Average Logistic Regression accuracy: 0.6587771203155819


## SVM

In [131]:
random_states = [41, 42, 101]
svc_accuracies = []

for rs in random_states:
    print(f"\n--- Trial with random_state = {rs} ---")

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=rs)
    scaler = StandardScaler()
    scaled_X_train = scaler.fit_transform(X_train)
    scaled_X_test = scaler.transform(X_test)
    
    svc_grid_model = GridSearchCV(svc_model, param_grid=svc_param_grid)
    svc_grid_model.fit(scaled_X_train, y_train)
    best_svc_params = svc_grid_model.best_params_
    print("Best params:", best_svc_params)

    if best_svc_params.get("kernel") in ["poly", "sigmoid"]:
        best_svc_model = SVC(random_state=rs, **best_svc_params)
    else:
        best_svc_model = SVC(**best_svc_params)

    best_svc_model.fit(scaled_X_train, y_train)
    preds = best_svc_model.predict(scaled_X_test)
    acc = accuracy_score(y_test, preds)
    svc_accuracies.append(acc)

    print(f"Accuracy: {acc:.4f}")

print("\nAll accuracies:", svc_accuracies)
print("Average SVM accuracy:", sum(svc_accuracies)/len(svc_accuracies))


--- Trial with random_state = 41 ---
Best params: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
Accuracy: 0.7140

--- Trial with random_state = 42 ---
Best params: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
Accuracy: 0.6884

--- Trial with random_state = 101 ---
Best params: {'C': 0.5, 'gamma': 'scale', 'kernel': 'rbf'}
Accuracy: 0.6903

All accuracies: [0.7140039447731755, 0.6883629191321499, 0.6903353057199211]
Average SVM accuracy: 0.6975673898750822


## RFC

In [133]:
random_states = [41, 42, 101]
rfc_accuracies = []

for rs in random_states:
    print(f"\n--- Trial with random_state = {rs} ---")

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=rs)
    scaler = StandardScaler()
    scaled_X_train = scaler.fit_transform(X_train)
    scaled_X_test = scaler.transform(X_test)

    rfc_grid_model = GridSearchCV(rfc_model, rfc_param_grid)
    rfc_grid_model.fit(scaled_X_train, y_train)

    best_rfc_params = rfc_grid_model.best_params_
    print("Best params:", best_rfc_params)

    best_rfc_model = RandomForestClassifier(random_state=rs,**best_rfc_params)
    best_rfc_model.fit(scaled_X_train, y_train)
    preds = best_rfc_model.predict(scaled_X_test)
    acc = accuracy_score(y_test, preds)
    rfc_accuracies.append(acc)

    print(f"Accuracy: {acc:.4f}")

print("\nAll accuracies:", rfc_accuracies)
print("Average Random Forest accuracy:", sum(rfc_accuracies)/len(rfc_accuracies))


--- Trial with random_state = 41 ---


105 fits failed out of a total of 420.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
105 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/ensemble/_forest.py", line 447, in fit
    raise ValueError("Out of bag estimation only available if bo

Best params: {'bootstrap': True, 'max_features': 'sqrt', 'n_estimators': 1000, 'oob_score': True}
Accuracy: 0.8028

--- Trial with random_state = 42 ---


105 fits failed out of a total of 420.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
105 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/ensemble/_forest.py", line 447, in fit
    raise ValueError("Out of bag estimation only available if bo

Best params: {'bootstrap': True, 'max_features': None, 'n_estimators': 200, 'oob_score': True}
Accuracy: 0.7909

--- Trial with random_state = 101 ---
Best params: {'bootstrap': True, 'max_features': 'sqrt', 'n_estimators': 20, 'oob_score': True}
Accuracy: 0.7909

All accuracies: [0.8027613412228797, 0.7909270216962525, 0.7909270216962525]
Average Random Forest accuracy: 0.7948717948717948


105 fits failed out of a total of 420.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
105 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/emanoelagbayani/miniconda3/lib/python3.13/site-packages/sklearn/ensemble/_forest.py", line 447, in fit
    raise ValueError("Out of bag estimation only available if bo

In [None]:
print("\nAll accuracies:", rfc_accuracies)
print("Average Random Forest accuracy:", sum(rfc_accuracies)/len(rfc_accuracies))

## KNN

In [134]:
random_states = [42, 42, 101]
knn_accuracies = []

for rs in random_states:
    print(f"\n--- Trial with random_state = {rs} ---")

    # Train/Test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=rs)

    # Scaling
    scaler = StandardScaler()
    scaled_X_train = scaler.fit_transform(X_train)
    scaled_X_test = scaler.transform(X_test)

    # Grid Search
    knn_grid_model = GridSearchCV(knn_model, knn_param_grid)
    knn_grid_model.fit(scaled_X_train, y_train)

    best_knn_params = knn_grid_model.best_params_
    print("Best params:", best_knn_params)

    # Final KNN model (NO random_state)
    best_knn_model = KNeighborsClassifier(**best_knn_params)
    best_knn_model.fit(scaled_X_train, y_train)

    # Predictions
    preds = best_knn_model.predict(scaled_X_test)
    acc = accuracy_score(y_test, preds)
    knn_accuracies.append(acc)

    print(f"Accuracy: {acc:.4f}")

print("\nAll accuracies:", knn_accuracies)
print("Average accuracy:", sum(knn_accuracies) / len(knn_accuracies))


--- Trial with random_state = 42 ---
Best params: {'algorithm': 'auto', 'n_neighbors': 15, 'weights': 'distance'}
Accuracy: 0.7653

--- Trial with random_state = 42 ---
Best params: {'algorithm': 'auto', 'n_neighbors': 15, 'weights': 'distance'}
Accuracy: 0.7653

--- Trial with random_state = 101 ---
Best params: {'algorithm': 'auto', 'n_neighbors': 15, 'weights': 'distance'}
Accuracy: 0.7890

All accuracies: [0.7652859960552268, 0.7652859960552268, 0.7889546351084813]
Average accuracy: 0.7731755424063117


## Gradient Boosted Trees


In [135]:
random_states = [41, 42, 101]
gb_accuracies = []

for rs in random_states:
    print(f"\n--- Trial with random_state = {rs} ---")

    # 1. Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=rs)

    # 2. Scaling
    scaler = StandardScaler()
    scaled_X_train = scaler.fit_transform(X_train)
    scaled_X_test = scaler.transform(X_test)

    # 3. Grid search
    gb_grid_model = GridSearchCV(gb_model, gb_param_grid)
    gb_grid_model.fit(scaled_X_train, y_train)

    best_gb_params = gb_grid_model.best_params_
    print("Best params:", best_gb_params)

    # 4. Final model using best params
    best_gb_model = GradientBoostingClassifier(random_state=rs,**best_gb_params)
    best_gb_model.fit(scaled_X_train, y_train)

    # 5. Evaluate
    preds = best_gb_model.predict(scaled_X_test)
    acc = accuracy_score(y_test, preds)
    gb_accuracies.append(acc)

    print(f"Accuracy: {acc:.4f}")

print("\nAll accuracies:", gb_accuracies)
print("Average Gradient Boosting accuracy:", sum(gb_accuracies)/len(gb_accuracies))


--- Trial with random_state = 41 ---


KeyboardInterrupt: 

# Doctor Visits Prediction

In [136]:
national_poll_on_healthy_aging = fetch_ucirepo(id=936) 
x = national_poll_on_healthy_aging.data.features 
y = national_poll_on_healthy_aging.data.targets 

In [137]:
doctor_df = pd.concat([x, y], axis=1)
doctor_df.head()

Unnamed: 0,Age,Physical_Health,Mental_Health,Dental_Health,Employment,Stress_Keeps_Patient_from_Sleeping,Medication_Keeps_Patient_from_Sleeping,Pain_Keeps_Patient_from_Sleeping,Bathroom_Needs_Keeps_Patient_from_Sleeping,Uknown_Keeps_Patient_from_Sleeping,Trouble_Sleeping,Prescription_Sleep_Medication,Race,Gender,Number_of_Doctors_Visited
0,2,4,3,3,3,0,0,0,0,1,2,3,1,2,3
1,2,4,2,3,3,1,0,0,1,0,3,3,1,1,2
2,2,3,2,3,3,0,0,0,0,1,3,3,4,1,3
3,2,3,2,3,3,0,0,0,1,0,3,3,4,2,1
4,2,3,3,3,3,1,0,0,0,0,2,3,1,2,3


In [138]:
doctor_df.shape

(714, 15)

In [139]:
doctor_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 714 entries, 0 to 713
Data columns (total 15 columns):
 #   Column                                      Non-Null Count  Dtype
---  ------                                      --------------  -----
 0   Age                                         714 non-null    int64
 1   Physical_Health                             714 non-null    int64
 2   Mental_Health                               714 non-null    int64
 3   Dental_Health                               714 non-null    int64
 4   Employment                                  714 non-null    int64
 5   Stress_Keeps_Patient_from_Sleeping          714 non-null    int64
 6   Medication_Keeps_Patient_from_Sleeping      714 non-null    int64
 7   Pain_Keeps_Patient_from_Sleeping            714 non-null    int64
 8   Bathroom_Needs_Keeps_Patient_from_Sleeping  714 non-null    int64
 9   Uknown_Keeps_Patient_from_Sleeping          714 non-null    int64
 10  Trouble_Sleeping                      

In [140]:
doctor_df.isna().sum()

Age                                           0
Physical_Health                               0
Mental_Health                                 0
Dental_Health                                 0
Employment                                    0
Stress_Keeps_Patient_from_Sleeping            0
Medication_Keeps_Patient_from_Sleeping        0
Pain_Keeps_Patient_from_Sleeping              0
Bathroom_Needs_Keeps_Patient_from_Sleeping    0
Uknown_Keeps_Patient_from_Sleeping            0
Trouble_Sleeping                              0
Prescription_Sleep_Medication                 0
Race                                          0
Gender                                        0
Number_of_Doctors_Visited                     0
dtype: int64

In [144]:
X = doctor_df.drop(columns='Number_of_Doctors_Visited')
y = doctor_df['Number_of_Doctors_Visited']

# 70/15/15

In [146]:
import warnings
warnings.filterwarnings('ignore')

In [148]:
random_states = [41, 42, 101]
lr_accuracies = []

for rs in random_states:
    print(f"\n--- Trial with random_state = {rs} ---")

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=rs)
    X_validation, X_holdout_test, y_validation, y_holdout_test = train_test_split(X_test, y_test, test_size=0.5, random_state=rs)
    scaler = StandardScaler()
    scaled_X_train = scaler.fit_transform(X_train)
    scaled_X_test = scaler.transform(X_test)
    scaled_X_holdout_test = scaler.transform(X_holdout_test)
    scaled_X_validation = scaler.fit_transform(X_validation)
    
    lr_grid_model = GridSearchCV(log_model, param_grid=lr_param_grid)
    lr_grid_model.fit(scaled_X_train, y_train)

    best_lr_params = lr_grid_model.best_params_
    print("Best params:", best_lr_params)
    best_lr_model = LogisticRegression(random_state=rs,**best_lr_params)
    best_lr_model.fit(scaled_X_train, y_train)
    preds = best_lr_model.predict(scaled_X_test)

    lr_validation_predictions = best_lr_model.predict(scaled_X_validation)
    acc1 = accuracy_score(y_validation, lr_validation_predictions)
    lr_accuracies.append(acc1)


    lr_holdout_predictions = best_lr_model.predict(scaled_X_holdout_test)
    acc2 = accuracy_score(y_holdout_test, lr_holdout_predictions)
    lr_accuracies.append(acc2)

    print(f"Accuracy: {acc1:.4f}")
    print(f"Accuracy: {acc2:.4f}")

print("\nAll accuracies:", lr_accuracies)
print("Average Logistic Regression accuracy:", sum(lr_accuracies)/len(lr_accuracies))


--- Trial with random_state = 41 ---
Best params: {'C': 0.05, 'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy: 0.5047
Accuracy: 0.5185

--- Trial with random_state = 42 ---
Best params: {'C': 0.001, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}
Accuracy: 0.5794
Accuracy: 0.4537

--- Trial with random_state = 101 ---
Best params: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy: 0.5234
Accuracy: 0.5093

All accuracies: [0.5046728971962616, 0.5185185185185185, 0.5794392523364486, 0.4537037037037037, 0.5233644859813084, 0.5092592592592593]
Average Logistic Regression accuracy: 0.5148263528325834


In [None]:
#Cirrhosis

In [None]:
cirrhosis_patient_survival = fetch_ucirepo(id=878) 
x = cirrhosis_patient_survival.data.features 
y = cirrhosis_patient_survival.data.targets 

In [None]:
cirrhosis_df = pd.concat([x, y], axis=1)
cirrhosis_df.head()

In [None]:
cirrhosis_df.shape

In [None]:
risk_factor_prediction_of = fetch_ucirepo(id=857)
x = risk_factor_prediction_of.data.features 
y = risk_factor_prediction_of.data.targets 

In [None]:
kidney_df = pd.concat([x, y], axis=1)
kidney_df.head()

In [None]:
kidney_df.columns

In [None]:
kidney_df['pot']

In [None]:
kidney_df['hemo']