In [1]:
import pandas as pd
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt
import os

In [2]:
print(os.getcwd())

/Users/emanoelagbayani/Desktop/Predictive-Analysis-of-Hairloss/PredictiveHairLoss


In [3]:
df = pd.read_csv("Predict Hair Fall.csv")
df.head()

Unnamed: 0,Id,Genetics,Hormonal Changes,Medical Conditions,Medications & Treatments,Nutritional Deficiencies,Stress,Age,Poor Hair Care Habits,Environmental Factors,Smoking,Weight Loss,Hair Loss
0,133992,Yes,No,No Data,No Data,Magnesium deficiency,Moderate,19,Yes,Yes,No,No,0
1,148393,No,No,Eczema,Antibiotics,Magnesium deficiency,High,43,Yes,Yes,No,No,0
2,155074,No,No,Dermatosis,Antifungal Cream,Protein deficiency,Moderate,26,Yes,Yes,No,Yes,0
3,118261,Yes,Yes,Ringworm,Antibiotics,Biotin Deficiency,Moderate,46,Yes,Yes,No,No,0
4,111915,No,No,Psoriasis,Accutane,Iron deficiency,Moderate,30,No,Yes,Yes,No,1


In [None]:
df['Medical Conditions'].unique()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.isnull().sum()

In [None]:
#Basic EDA

In [None]:
sns.histplot(data=df, x=df['Age'],discrete=True)
#shows the distribution of ages, with a count for each age

In [None]:
sns.histplot(data=df, y='Medical Conditions')
#shows us the count of each medical condition

In [None]:
sns.histplot(data=df, y='Stress')
#shows us the count of each stress level

In [None]:
df.columns

In [None]:
medical_conditions_count = df.groupby('Age')['Medical Conditions'].count().reset_index(name='Medical Conditions')

plt.figure(dpi=150)
sns.barplot(data = medical_conditions_count,
            x='Age',y='Medical Conditions')
plt.xticks(rotation=90);

In [None]:
df[df['Age']==18]['Medical Conditions'].value_counts()

In [None]:
sns.barplot(data=df,x='Age',y='Stress',hue='Hair Loss',
           palette='viridis')
plt.legend(loc=(1.05,.75))

In [None]:
hormonal_changes_count = df.groupby('Age')['Hormonal Changes'].count().reset_index(name='Hormonal Changes')
sns.barplot(data=hormonal_changes_count, x='Age', y='Hormonal Changes')
plt.xticks(rotation=90);
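#there is a surprising amount of hormonal changes across ages; note that .count() tallies every non-null row per age ('Yes' and 'No' alike), so these bars reflect the number of records per age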

In [None]:
df['Medical Conditions'].unique()

In [None]:
df.isna().sum() > 0

In [None]:
#so we have no null values in the whole data frame, but we do have both categorical and numeric data, so we will need to get dummy variables
df.columns
df['Smoking'].unique()

In [None]:
df['Age'].sort_values().unique()

In [None]:
list(df.columns)

In [None]:
#separating the df columns into categorical and numeric
my_object_df = df.select_dtypes(include='object')
my_numeric_df = df.select_dtypes(exclude='object')

In [None]:
my_object_df

In [None]:
my_numeric_df

In [None]:
#getting dummy variables from the categorical column
df_objects_dummies = pd.get_dummies(my_object_df, drop_first=True)
df_objects_dummies.head()

In [None]:
#b/c we are predicting a category with label'd data

In [None]:
final_df = pd.concat([my_numeric_df,df_objects_dummies], axis=1)
final_df

In [None]:
#before the train/test split, drop the 'Id' column (presumably a row identifier rather than a feature)
#errors='ignore' keeps this cell safe to re-run once 'Id' has already been removed
final_df = final_df.drop(columns='Id', errors='ignore')
final_df.head()

In [None]:
X = final_df.drop('Hair Loss',axis =1)
y = final_df['Hair Loss']

In [None]:
X.head()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
#train_test_split
#unpack the returned tuple; because we both validate and test, the data ends up split 70/15/15 (train/validation/holdout), with a seed/random_state of 101
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [None]:
X_validation, X_holdout_test, y_validation, y_holdout_test = train_test_split(X_test, y_test, test_size=0.5, random_state=101)

In [None]:
#to make sure everything is right, we can check the size of everything
len(df)

In [None]:
len(X_train)

In [None]:
len(X_validation)

In [None]:
len(X_holdout_test)

In [None]:
#we can see everything adds up to 999 (length of original df) and if not we did something wrong

In [None]:
#just to make sure, we will scale our data as well
from sklearn.preprocessing import StandardScaler

In [None]:
#fit the scaler on the training split ONLY, then reuse those training statistics
#for every other split; refitting on validation data would scale it differently
#from what the models were trained on
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)
scaled_X_holdout_test = scaler.transform(X_holdout_test)
scaled_X_validation = scaler.transform(X_validation)

In [None]:
from sklearn.svm import SVC

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
svc_model = SVC(random_state=101)

In [None]:
kernel = ['linear', 'poly', 'rbf','sigmoid']

In [None]:
gamma = [0.001, 0.01, 0.1, 1, 10]

In [None]:
C = [0.01, 0.1, 1, 10, 100]

In [None]:
param_grid = {'kernel': kernel,
              'C': C,
             'gamma': gamma}

In [None]:
svc_grid_model = GridSearchCV(svc_model,param_grid=param_grid)

In [None]:
svc_grid_model.fit(scaled_X_train, y_train)

In [None]:
best_svc_model = svc_grid_model.best_params_
best_svc_model

In [None]:
best_svc_model = SVC(random_state=101,C=0.1,gamma=10, kernel='sigmoid')

In [None]:
best_svc_model.fit(scaled_X_train,y_train)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [None]:
#we will begin with validation
svc_validation_predictions = best_svc_model.predict(scaled_X_validation)
#validation_predictions

In [None]:
svc_validate_mae = mean_absolute_error(y_validation, svc_validation_predictions) #compare to mean value
svc_validate_mae

In [None]:
svc_validate_rmse = mean_squared_error(y_validation, svc_validation_predictions) ** 0.5 #RMSE #compare to STD
svc_validate_rmse

In [None]:
#as a baseline we have MAE: 0.4 and RMSE = 0.63
#now we will begin testing on the holdout set
svc_holdout_predictions = best_svc_model.predict(scaled_X_holdout_test)

In [None]:
#plot_confusion_matrix was removed from scikit-learn; ConfusionMatrixDisplay is its replacement
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay

In [None]:
accuracy_score(y_holdout_test, svc_holdout_predictions)
#initial accuracy score for holdout

In [None]:
svc_holdout_mae = mean_absolute_error(y_holdout_test, svc_holdout_predictions) 
svc_holdout_mae

In [None]:
svc_holdout_rmse = mean_squared_error(y_holdout_test, svc_holdout_predictions) ** 0.5 
svc_holdout_rmse

In [None]:
svc_y_final_pred = best_svc_model.predict(scaled_X_test)
#svc_y_final_pred

In [None]:
accuracy_score(y_test,svc_y_final_pred)

In [None]:
print(classification_report(y_test,svc_y_final_pred))

In [None]:
svc_report = classification_report(y_test,svc_y_final_pred,output_dict=True)
svc_report

In [None]:
#that's with svc, lets see how it compares to logistic regression 

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
log_model = LogisticRegression()

In [None]:
#NOTE(review): log_model is built as LogisticRegression() with the default solver; per the
#scikit-learn docs the default 'lbfgs' solver supports only 'l2'/None, so the 'l1' and
#'elasticnet' candidates are expected to fail during the grid search - confirm, and consider
#solver='saga' (plus an l1_ratio for 'elasticnet')
penalty = ['l1','l2','elasticnet']

In [None]:
C = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 10, 100]

In [None]:
max_iter = [100,500,1000, 1500, 2500, 3000]

In [None]:
param_grid = {'penalty': penalty,
              'C' : C,
              'max_iter': max_iter}

In [None]:
lr_grid_model = GridSearchCV(log_model,param_grid=param_grid)

In [None]:
lr_grid_model.fit(scaled_X_train,y_train)

In [None]:
lr_grid_model.best_params_

In [None]:
best_lr_model = LogisticRegression(C=0.001, max_iter=100, penalty='l2')
best_lr_model

In [None]:
best_lr_model.fit(scaled_X_train,y_train)

In [None]:
lr_validation_predictions = best_lr_model.predict(scaled_X_validation)

In [None]:
lr_validate_mae = mean_absolute_error(y_validation, lr_validation_predictions) #compare to mean value
lr_validate_mae

In [None]:
lr_validate_rmse = mean_squared_error(y_validation, lr_validation_predictions) ** 0.5 
lr_validate_rmse

In [None]:
#since we have a baseline of mae=0.5 and rmse=0.7, let's compare it to the holdout test
lr_holdout_predictions = best_lr_model.predict(scaled_X_holdout_test)

In [None]:
accuracy_score(y_holdout_test,lr_holdout_predictions)

In [None]:
lr_holdout_mae = mean_absolute_error(y_holdout_test, lr_holdout_predictions)
lr_holdout_mae

In [None]:
lr_holdout_rmse = mean_squared_error(y_holdout_test, lr_holdout_predictions) **0.5
lr_holdout_rmse

In [None]:
#final test-set predictions for logistic regression: these must come from the LR model,
#otherwise lr_report is just a copy of the SVC results
lr_y_final_pred = best_lr_model.predict(scaled_X_test)

In [None]:
accuracy_score(y_test, lr_y_final_pred)

In [None]:
print(classification_report(y_test,lr_y_final_pred))

In [None]:
lr_report = classification_report(y_test,lr_y_final_pred,output_dict=True)

In [None]:
#now lets try a random forest
from sklearn.ensemble import RandomForestClassifier

In [None]:
#seed the forest (same seed as the split and the SVC) so the grid search is reproducible
rfc_model = RandomForestClassifier(random_state=101)

In [None]:
n_estimators = [100, 200, 500, 1000, 1500, 2000, 2500]

In [None]:
max_features = ['sqrt', 'log2', None, 2, 5, 10]

In [None]:
bootstrap = [True, False]

In [None]:
oob_score = [True, False]

In [None]:
#search space for the random forest: 7 n_estimators x 6 max_features x 2 bootstrap x 2 oob_score
#= 168 candidate combinations, each refit once per cross-validation fold
#NOTE(review): oob_score=True is presumably only valid together with bootstrap=True, so the
#bootstrap=False/oob_score=True candidates are expected to fail - confirm against the
#RandomForestClassifier docs
param_grid = {'n_estimators': n_estimators,
              'max_features': max_features,
              'bootstrap': bootstrap,
              'oob_score': oob_score}

In [None]:
rfc_grid_model = GridSearchCV(rfc_model, param_grid)

In [None]:
rfc_grid_model.fit(scaled_X_train,y_train)

In [None]:
best_rfc_model = rfc_grid_model.best_params_
best_rfc_model

In [None]:
#rebuild the forest with the chosen hyper-parameters; random_state makes the
#reported metrics reproducible across runs
best_rfc_model = RandomForestClassifier(bootstrap=False ,max_features=None,
                                        n_estimators=1000, oob_score=False,
                                        random_state=101)

In [None]:
best_rfc_model.fit(scaled_X_train,y_train)

In [None]:
rfc_validation_predictions = best_rfc_model.predict(scaled_X_validation)

In [None]:
#gathering baseline stats to compare to holdout test stats
rfc_validate_mae = mean_absolute_error(y_validation, rfc_validation_predictions)
rfc_validate_mae

In [None]:
rfc_validate_rmse = mean_squared_error(y_validation, rfc_validation_predictions) **0.5
rfc_validate_rmse

In [None]:
rfc_holdout_predictions = best_rfc_model.predict(scaled_X_holdout_test)

In [None]:
accuracy_score(y_holdout_test, rfc_holdout_predictions)

In [None]:
#score the HOLDOUT predictions against the holdout labels (not the validation predictions)
rfc_holdout_mae = mean_absolute_error(y_holdout_test, rfc_holdout_predictions)
rfc_holdout_mae

In [None]:
#RMSE on the holdout split, computed from the holdout predictions
rfc_holdout_rmse = mean_squared_error(y_holdout_test, rfc_holdout_predictions) **0.5
rfc_holdout_rmse

In [None]:
rfc_y_final_pred = best_rfc_model.predict(scaled_X_test)

In [None]:
accuracy_score(y_test,rfc_y_final_pred)

In [None]:
print(classification_report(y_test,rfc_y_final_pred))

In [None]:
rfc_report = classification_report(y_test,rfc_y_final_pred,output_dict=True)

In [None]:
from sklearn.naive_bayes import CategoricalNB, BernoulliNB

In [None]:
nb_model = BernoulliNB()

In [None]:
alpha = [0.001,0.01, 0.1,0.2,0.5,0.7,1,2,5,10]

In [None]:
fit_prior = [True, False]

In [None]:
binarize = [None, 0.0, 0.5, 1.0]

In [None]:
param_grid = {'alpha': alpha,
             'fit_prior': fit_prior,
             'binarize': binarize}

In [None]:
nb_grid_model = GridSearchCV(nb_model, param_grid)

In [None]:
nb_grid_model.fit(scaled_X_train,y_train)

In [None]:
best_nb_model = nb_grid_model.best_params_
best_nb_model

In [None]:
best_nb_model = BernoulliNB(alpha = 0.001, binarize=None,fit_prior = True)

In [None]:
#fit the BernoulliNB configured above; re-fitting nb_grid_model here would rebind
#best_nb_model to the GridSearchCV object and discard that configuration
best_nb_model.fit(scaled_X_train,y_train)

In [None]:
#the model was trained on scaled features, so predict on the scaled validation split
nb_validation_predictions = best_nb_model.predict(scaled_X_validation)
nb_validation_predictions

In [None]:
nb_validate_mae = mean_absolute_error(y_validation, nb_validation_predictions)
nb_validate_mae

In [None]:
nb_validate_rmse = mean_squared_error(y_validation, nb_validation_predictions) ** 0.5
nb_validate_rmse

In [None]:
nb_holdout_predictions = best_nb_model.predict(scaled_X_holdout_test)

In [None]:
accuracy_score(y_holdout_test, nb_holdout_predictions)

In [None]:
#store the naive Bayes holdout MAE under its own name so the SVC metric is not overwritten
nb_holdout_mae = mean_absolute_error(y_holdout_test, nb_holdout_predictions)
nb_holdout_mae

In [None]:
nb_holdout_rmse = mean_squared_error(y_holdout_test, nb_holdout_predictions) ** 0.5
nb_holdout_rmse

In [None]:
nb_y_final_pred = best_nb_model.predict(scaled_X_test)

In [None]:
accuracy_score(y_test, nb_y_final_pred)

In [None]:
print(classification_report(y_test,nb_y_final_pred))

In [None]:
nb_report = classification_report(y_test,nb_y_final_pred,output_dict=True)

In [None]:
#let's try gradient and adaboosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
param_grid = {'n_estimators' : [50,100,500,1000,1500,2000],
             'learning_rate' : [0.01, 0.05, 0.1],
             'max_depth' : [3,4,5,10]}

In [None]:
#seed the booster (same seed as the split and the SVC) so the grid search is reproducible
gb_model = GradientBoostingClassifier(random_state=101)

In [None]:
gb_grid_model = GridSearchCV(gb_model, param_grid)

In [None]:
gb_grid_model.fit(scaled_X_train,y_train)

In [None]:
best_gb_model = gb_grid_model.best_params_
best_gb_model

In [None]:
#rebuild the booster with the chosen hyper-parameters; random_state makes the
#reported metrics reproducible across runs
best_gb_model = GradientBoostingClassifier(learning_rate=0.05,
                                          max_depth=5,n_estimators=2000,
                                          random_state=101)

In [None]:
best_gb_model

In [None]:
best_gb_model.fit(scaled_X_train,y_train)

In [None]:
#the model was trained on scaled features, so predict on the scaled validation split
gb_validation_predictions = best_gb_model.predict(scaled_X_validation)


In [None]:
gb_validate_mae = mean_absolute_error(y_validation, gb_validation_predictions)
gb_validate_mae

In [None]:
gb_validate_rmse = mean_squared_error(y_validation, gb_validation_predictions) ** 0.5
gb_validate_rmse

In [None]:
gb_holdout_predictions = best_gb_model.predict(scaled_X_holdout_test)

In [None]:
accuracy_score(y_holdout_test, gb_holdout_predictions)

In [None]:
gb_holdout_mae = mean_absolute_error(y_holdout_test, gb_holdout_predictions)
gb_holdout_mae

In [None]:
gb_holdout_rmse = mean_squared_error(y_holdout_test, gb_holdout_predictions) ** 0.5
gb_holdout_rmse

In [None]:
gb_y_final_pred = best_gb_model.predict(scaled_X_test)

In [None]:
accuracy_score(y_test,gb_y_final_pred)

In [None]:
#per-class precision/recall/f1 for gradient boosting on the 30% test split
print(classification_report(y_test,gb_y_final_pred))

In [None]:
gb_report = classification_report(y_test,gb_y_final_pred,output_dict=True)

In [None]:
#How to choose the best model? 
#Precision for false positives
#recall for false negatives 
#highest accuracy? 
#best f1 score?

In [None]:
def get_best_model(models_metrics_list):
    """Find which report in a list scores highest on each summary metric.

    Parameters
    ----------
    models_metrics_list : list of dict
        Report dictionaries. Each one is read with ``.get`` for the keys
        ``'accuracy'``, ``'macro avg'`` and ``'weighted avg'``; the two
        averages are dicts read for ``'f1-score'``, ``'precision'`` and
        ``'recall'``. Anything missing counts as 0.

    Returns
    -------
    dict
        One entry per tracked metric (``'highest_accuracy'``,
        ``'highest_macro_f1'``, ``'highest_weighted_f1'``,
        ``'highest_macro_precision'``, ``'highest_macro_recall'``), each a
        dict with the winning ``'value'``, the winner's ``'model_index'`` in
        the input list, and the winning report itself under ``'model'``.
        A metric that never exceeds 0 keeps ``value=0`` and ``None`` for the
        index and model.
    """
    tracked = (
        'highest_accuracy',
        'highest_macro_f1',
        'highest_weighted_f1',
        'highest_macro_precision',
        'highest_macro_recall',
    )
    winners = {
        label: {'value': 0, 'model_index': None, 'model': None}
        for label in tracked
    }

    for position, report in enumerate(models_metrics_list):
        macro = report.get('macro avg', {})
        weighted = report.get('weighted avg', {})

        # Score of this report on every tracked metric, in the same order as `tracked`.
        scores = (
            report.get('accuracy', 0),
            macro.get('f1-score', 0),
            weighted.get('f1-score', 0),
            macro.get('precision', 0),
            macro.get('recall', 0),
        )

        for label, score in zip(tracked, scores):
            # Strictly greater: on a tie the earlier report stays the winner.
            if score > winners[label]['value']:
                winners[label] = {'value': score, 'model_index': position, 'model': report}

    return winners
    

In [None]:
def get_best_model_from_list(models_metrics_list):
    """Alias of ``get_best_model``, kept so existing calls keep working.

    This function used to be a line-for-line copy of ``get_best_model``;
    it now delegates so the selection logic lives in one place.

    Parameters
    ----------
    models_metrics_list : list of dict
        Report dictionaries, passed through unchanged.

    Returns
    -------
    dict
        Whatever ``get_best_model`` returns for the same input.
    """
    return get_best_model(models_metrics_list)


In [None]:
lst_reports = [svc_report,lr_report, rfc_report, nb_report, gb_report]

In [None]:
best_metrics = get_best_model(lst_reports)

In [None]:
for metric, result in best_metrics.items(): 
    print(f"{metric}: Value = {result['value']}, Model Index = {result['model_index']}")

In [None]:
lst_reports[2]

In [None]:
#Due to this, the best average model is RFC, which is random forest classifier!