In [1]:
# Basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tabulate import tabulate
from IPython.core.display import HTML

# Stats libraries
import ppscore as pps
import statsmodels.api as sm
import pingouin as pg
from scipy import stats

# Feature processing, hyperparameter tuning, and validation libraries
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
SEED = 1000
pd.options.mode.copy_on_write = True

# Load data
try:
    dirty_hair = pd.read_csv('Predict Hair Fall.csv')
except FileNotFoundError:
    # Keep the friendly hint, but re-raise: swallowing the error would only defer
    # the failure to a confusing NameError on `dirty_hair` a few lines below.
    print('Predict Hair Fall.csv is not in your present working directory')
    raise

# Clean data
# Normalise the headers: trim surrounding whitespace, then use underscores for spaces.
dirty_hair.columns = dirty_hair.columns.str.strip().str.replace(" ", "_")

# Drop every record where any of these three columns holds the 'No Data' placeholder.
placeholder_columns = ['Medical_Conditions', 'Medications_&_Treatments', 'Nutritional_Deficiencies']
dirty_hair = dirty_hair[dirty_hair[placeholder_columns].ne('No Data').all(axis=1)]

# Keep only the first record of each Id.
clean_hair = dirty_hair.drop_duplicates(subset='Id', keep='first')

# The duplicate count is taken after the 'No Data' filter, i.e. it is the number of
# rows removed by drop_duplicates above.
print(f"Number of rows with duplicated Id: {dirty_hair['Id'].duplicated().sum()}")
print(f"After cleaning: {len(clean_hair)} rows remaining.")


In [None]:
def has_duplicates(df):
    """Print whether `df` contains at least one fully duplicated row.

    Nothing is returned; the verdict is written to stdout.
    """
    if df.duplicated().any():
        message = "\nDuplicates exist in the DataFrame."
    else:
        message = "\nNo duplicates found in the DataFrame."
    print(message)

def count_outliers(series):
    """Print how many values of `series` fall outside the 1.5 * IQR fences.

    A value counts as an outlier when it is strictly below Q1 - 1.5 * IQR or
    strictly above Q3 + 1.5 * IQR. Nothing is returned.
    """
    first_quartile = series.quantile(0.25)
    third_quartile = series.quantile(0.75)
    spread = third_quartile - first_quartile

    fence_low = first_quartile - 1.5 * spread
    fence_high = third_quartile + 1.5 * spread

    too_low = series < fence_low
    too_high = series > fence_high
    outliers = series[too_low | too_high].count()
    print(f'There are {outliers} outliers in this feature')

def cat_map(df):
    """Replace every object-dtype column of `df` with its integer category codes.

    The frame is modified in place and also returned, so pass a copy when the
    original values must be kept. Columns of any other dtype are left untouched.
    """
    object_columns = [name for name in df.columns if df[name].dtype == 'object']
    for name in object_columns:
        as_category = df[name].astype('category')
        df[name] = as_category.cat.codes
    return df

def binary_encoding(df):
    """Encode every column of `df` that has exactly two distinct values as category codes.

    The check is on the number of unique non-null values only, so two-valued
    numeric columns are re-coded as well. The frame is modified in place and
    also returned.
    """
    two_valued = [name for name in df.columns if df[name].nunique() == 2]
    for name in two_valued:
        as_category = df[name].astype('category')
        df[name] = as_category.cat.codes
    return df

def dummy_encoding(df):
    """Return a dummy-encoded copy of `df`, dropping the first level of each encoded column."""
    encoded = pd.get_dummies(data=df, drop_first=True)
    return encoded

In [None]:
eda_df = clean_hair.copy()

# Categorical features pie chart
categorical_df = eda_df.select_dtypes(include='object').apply(lambda col: col.astype('category'))

# Size the grid from the number of categorical columns: a fixed 2x4 grid raises
# IndexError as soon as there are more than eight of them.
pie_cols = 4
n_pies = categorical_df.shape[1]
if n_pies:
    pie_rows = int(np.ceil(n_pies / pie_cols))
    # squeeze=False keeps `axes` two-dimensional even for a single row.
    fig, axes = plt.subplots(nrows=pie_rows, ncols=pie_cols, figsize=(12, 3 * pie_rows), squeeze=False)
    for ax, (column, data) in zip(axes.ravel(), categorical_df.items()):
        counts = data.value_counts()
        ax.pie(counts, labels=counts.index, autopct='%1.1f%%', startangle=140)
        ax.set_title(column)
    # Hide the cells of the grid that did not receive a pie.
    for ax in axes.ravel()[n_pies:]:
        ax.set_visible(False)
    fig.tight_layout()
    plt.show()

# Bar plot for Medical Conditions
plt.figure(figsize=(10, 6))
eda_df['Medical_Conditions'].value_counts().plot(kind='bar', color='skyblue')
plt.title('Number of Records for Each Medical Condition')
plt.xticks(rotation=45)
plt.show()

# Q-Q plot and histogram for Age
# qqplot opens its own figure unless it is given an Axes, so pass one explicitly
# instead of calling plt.figure() first (which left an empty figure behind).
fig, ax = plt.subplots()
sm.qqplot(eda_df['Age'], line='q', ax=ax)
ax.set_title('Age')
plt.show()

sns.histplot(eda_df['Age'], bins=5, kde=True)
plt.title('Distribution of Age')
plt.show()

# Outliers and normality checks
# These calls return result tables; only the last expression of a cell is rendered
# automatically, so display them explicitly or they are silently discarded.
display(pg.normality(eda_df['Age']))
display(pg.homoscedasticity(eda_df, dv='Age', group='Hair_Loss'))
count_outliers(eda_df['Age'])

In [None]:
# Chi-square tests
# Each categorical feature is tested against the target. The pairs are drawn from
# `chisq_df`, which contains Hair_Loss explicitly: `categorical_df` only holds
# object-dtype columns, so a numerically coded Hair_Loss is never part of it and
# pairing on that frame would display nothing.
categorical_features = categorical_df.columns.tolist()
chisq_df = eda_df[['Genetics', 'Hormonal_Changes', 'Medical_Conditions', 'Medications_&_Treatments', 'Nutritional_Deficiencies', 'Stress', 'Poor_Hair_Care_Habits', 'Environmental_Factors', 'Smoking', 'Weight_Loss', 'Hair_Loss']]

chisq_results = []
for feature in chisq_df.columns.drop('Hair_Loss'):
    # chi2_independence returns (expected, observed, stats); run it once per feature
    # and reuse the stats table for both the detailed view and the summary.
    chi2_stats = pg.chi2_independence(chisq_df, feature, 'Hair_Loss')[2]
    print(f"Chi-square test for {feature} and Hair_Loss")
    display(chi2_stats.style.background_gradient(cmap='Reds', subset='pval'))
    # First row of the stats table (the Pearson test in pingouin's output); use
    # positional access rather than relying on the index label being 0.
    chisq_results.append((feature, 'Hair_Loss', chi2_stats['pval'].iloc[0]))

# Summary of chi-square tests
chisq_summary = pd.DataFrame(chisq_results, columns=['Feature 1', 'Feature 2', 'p-value'])
display(HTML(tabulate(chisq_summary, headers='keys', tablefmt='html')))

# Predictive Power Score (PPS)
# Pivot the long-format PPS result into a target-by-predictor matrix for the heatmap.
corr_matrix_pps = pps.matrix(eda_df.drop('Id', axis=1))[['x', 'y', 'ppscore']].pivot(index='y', columns='x', values='ppscore')
sns.heatmap(corr_matrix_pps, cmap='coolwarm', annot=True)
plt.title("Predictive Power Score (PPS)")
plt.show()

In [None]:
def tree_importance(df):
    """Plot random-forest feature importances for predicting Hair_Loss.

    Object columns of a copy of `df` are label-encoded with `cat_map`, a
    100-tree random forest is fitted on every column except Hair_Loss and Id,
    and the importances are drawn as a bar chart, largest first.
    """
    encoded = cat_map(df.copy())
    target = encoded['Hair_Loss']
    predictors = encoded.drop(columns=['Hair_Loss', 'Id'])

    forest = RandomForestClassifier(n_estimators=100, random_state=SEED)
    forest.fit(predictors, target)

    importances = pd.Series(forest.feature_importances_, index=predictors.columns)
    ranked = importances.sort_values(ascending=False)

    sns.barplot(x=ranked, y=ranked.index, palette='viridis')
    plt.title('Feature Importance')
    plt.show()

tree_importance(clean_hair)

In [None]:
def linear_importance(df):
    """Fit an OLS model of Hair_Loss on all other features and return its coefficient table.

    Object columns of a copy of `df` are label-encoded with `cat_map` and Age is
    min-max scaled. An intercept column is added explicitly because
    `sm.OLS` does not include one on its own; without it the fit is forced
    through the origin and the coefficients and p-values are distorted.

    Returns an `HTML` object wrapping the coefficient table of the fit summary.
    """
    df = cat_map(df.copy())
    # NOTE(review): cat_map assigns arbitrary integer codes to multi-level nominal
    # columns, so their coefficients assume an ordering - interpret with care.
    df['Age'] = MinMaxScaler().fit_transform(df[['Age']]).ravel()
    y = df['Hair_Loss']
    X = sm.add_constant(df.drop(['Hair_Loss', 'Id'], axis=1))
    est = sm.OLS(y, X).fit()
    return HTML(est.summary().tables[1].as_html())

linear_importance(clean_hair)

In [None]:
def model_preprocessing(df, features, linear=False):
    """Encode the selected features and split them into train and test sets.

    Parameters
    ----------
    df : DataFrame holding the feature columns and the 'Hair_Loss' target.
    features : list of column names to use as predictors.
    linear : when True, 'Age' (if selected) is min-max scaled for linear models.

    Returns
    -------
    [X_train, X_test, y_train, y_test] from a 70/30 split seeded with SEED.

    The scaler is fitted on the training rows only and then applied to both
    parts; fitting it on the full frame before the split would leak the test
    set's value range into training. Scaled test values can therefore fall
    slightly outside [0, 1].
    """
    y = df['Hair_Loss']
    X = df[features].copy()
    # Stress is ordinal, so map it to ordered integers instead of dummy columns.
    if 'Stress' in X.columns:
        X['Stress'] = X['Stress'].map({'High': 1, 'Moderate': 0, 'Low': -1})
    X = binary_encoding(X)
    X = dummy_encoding(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)

    if linear and 'Age' in X_train.columns:
        scaler = MinMaxScaler().fit(X_train[['Age']])
        X_train = X_train.assign(Age=scaler.transform(X_train[['Age']]).ravel())
        X_test = X_test.assign(Age=scaler.transform(X_test[['Age']]).ravel())

    return [X_train, X_test, y_train, y_test]

In [None]:
# Predictors for the linear models; linear=True requests min-max scaling of Age.
linear_features = [
    'Genetics',
    'Hormonal_Changes',
    'Medical_Conditions',
    'Stress',
    'Age',
    'Poor_Hair_Care_Habits',
    'Weight_Loss',
]
X_train_lin, X_test_lin, y_train_lin, y_test_lin = model_preprocessing(
    clean_hair, linear_features, linear=True
)

In [None]:
# Predictors for the tree-based models; Age is left unscaled (linear defaults to False).
tree_features = [
    'Age',
    'Nutritional_Deficiencies',
    'Medical_Conditions',
    'Medications_&_Treatments',
    'Stress',
    'Environmental_Factors',
    'Hormonal_Changes',
]
X_train_tree, X_test_tree, y_train_tree, y_test_tree = model_preprocessing(
    clean_hair, tree_features
)

In [None]:
def linear_model_params(X_train, X_test, y_train, y_test):
    """Grid-search logistic regression and an SVM on F2 score and report the results.

    Each model is tuned with 5-fold cross-validation on the training data; the
    F2 score of the best estimator is then printed for both the training and
    the test set.

    Returns
    -------
    (logistic_regression_best_params, svc_best_params) as dicts of grid values.
    """
    f2_scorer = make_scorer(fbeta_score, beta=2)

    def tune_and_report(label, estimator, grid):
        # Run one grid search and print train/test F2 of its best estimator.
        search = GridSearchCV(estimator, grid, cv=5, scoring=f2_scorer, verbose=1, n_jobs=-1)
        search.fit(X_train, y_train)
        best = search.best_estimator_
        y_pred_train, y_pred_test = best.predict(X_train), best.predict(X_test)
        print(f'{label} F2 Train: {fbeta_score(y_train, y_pred_train, beta=2):.4f}')
        print(f'{label} F2 Test: {fbeta_score(y_test, y_pred_test, beta=2):.4f}')
        return search.best_params_

    # Draw the max_iter candidates from a seeded generator: the unseeded global
    # np.random state gave a different grid (and possibly different best params)
    # on every run. Duplicates are removed so no configuration is fitted twice.
    max_iter_candidates = sorted(set(np.random.default_rng(SEED).integers(100, 1000, size=10).tolist()))

    log_reg_params = tune_and_report(
        'Logistic Regression',
        LogisticRegression(random_state=SEED),
        {
            'C': np.logspace(-3, 3, 100),
            'penalty': ['l1', 'l2'],
            'solver': ['liblinear'],
            'max_iter': max_iter_candidates,
        },
    )

    svc_params = tune_and_report(
        'Support Vector',
        SVC(random_state=SEED),
        {
            'C': np.logspace(-3, 3, 100),
            'kernel': ['linear', 'rbf'],
            'gamma': ['scale', 'auto'],
        },
    )
    return log_reg_params, svc_params

lgr_params, svc_params = linear_model_params(X_train_lin, X_test_lin, y_train_lin, y_test_lin)

In [None]:
def tree_model_params(X_train, X_test, y_train, y_test):
    """Grid-search XGBoost and CatBoost on F2 score and report the results.

    Each model is tuned with 5-fold cross-validation on the training data; the
    F2 score of the best estimator is then printed for both the training and
    the test set.

    Returns
    -------
    (xgboost_best_params, catboost_best_params) as dicts of grid values.
    """
    f2_scorer = make_scorer(fbeta_score, beta=2)

    def run_search(label, estimator, grid):
        # Fit one grid search and print train/test F2 of its best estimator.
        search = GridSearchCV(estimator, grid, cv=5, scoring=f2_scorer, verbose=1, n_jobs=-1)
        search.fit(X_train, y_train)
        winner = search.best_estimator_
        train_f2 = fbeta_score(y_train, winner.predict(X_train), beta=2)
        test_f2 = fbeta_score(y_test, winner.predict(X_test), beta=2)
        print(f'{label} F2 Train: {train_f2:.4f}')
        print(f'{label} F2 Test: {test_f2:.4f}')
        return search

    xgb_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 4, 5],
        'learning_rate': [0.1, 0.01, 0.001],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0]
    }
    xgb = run_search('XGBoost', XGBClassifier(random_state=SEED), xgb_grid)

    catboost_grid = {
        'iterations': [100, 200, 300],
        'depth': [3, 4, 5],
        'learning_rate': [0.1, 0.01, 0.001],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bylevel': [0.8, 0.9, 1.0]
    }
    catboost = run_search('CatBoost', CatBoostClassifier(random_state=SEED, silent=True), catboost_grid)

    return xgb.best_params_, catboost.best_params_

xgb_params, cat_params = tree_model_params(X_train_tree, X_test_tree, y_train_tree, y_test_tree)

In [None]:
f2_scorer = make_scorer(fbeta_score, beta=2)
# best_params_ only carries the tuned grid values, so the seed used during tuning
# has to be passed again to keep the refitted models reproducible.
classifiers = {
    'Logistic Regression': LogisticRegression(**lgr_params, random_state=SEED),
    'Support Vector Machine': SVC(**svc_params, probability=True, random_state=SEED)
}

# Held-out F2 score of each tuned model
f2_scores = {}
for name, clf in classifiers.items():
    clf.fit(X_train_lin, y_train_lin)
    y_pred = clf.predict(X_test_lin)
    f2_scores[name] = fbeta_score(y_test_lin, y_pred, beta=2)

# Cross-validate each model once and derive mean and spread from the same folds
cv_scores = [cross_val_score(clf, X_train_lin, y_train_lin, scoring=f2_scorer, cv=5) for clf in classifiers.values()]
errors = [np.std(scores) for scores in cv_scores]
mean_scores = [np.mean(scores) for scores in cv_scores]

# Bar plot: bar height is the held-out test F2, error bars are the CV standard deviation
plt.figure(figsize=(10, 6))
names, values = list(f2_scores.keys()), list(f2_scores.values())
plt.bar(names, values, yerr=errors, capsize=10, color='skyblue')

# Label each bar with the value it actually shows, plus the CV mean for reference
for i, (test_f2, cv_mean, cv_std) in enumerate(zip(values, mean_scores, errors)):
    plt.text(i, test_f2 + cv_std + 0.01, f'test {test_f2:.2f} (CV {cv_mean:.2f})', ha='center', va='bottom')

plt.xlabel('Classifier')
plt.ylabel('F2 Score')
plt.title('F2 Score of Different Classifiers')
plt.ylim([0, 1])
plt.show()

In [None]:
# best_params_ only carries the tuned grid values, so the seed used during tuning
# has to be passed again to keep the refitted models reproducible.
classifiers = {
    'XGBoostClassifier': XGBClassifier(**xgb_params, random_state=SEED),
    'CatBoostClassifier': CatBoostClassifier(**cat_params, silent=True, random_state=SEED)
}

# Held-out F2 score of each tuned model
f2_scores = {}
for name, clf in classifiers.items():
    clf.fit(X_train_tree, y_train_tree)
    y_pred = clf.predict(X_test_tree)
    f2_scores[name] = fbeta_score(y_test_tree, y_pred, beta=2)

# Cross-validate each model once and derive mean and spread from the same folds
cv_scores = [cross_val_score(clf, X_train_tree, y_train_tree, scoring=f2_scorer, cv=5) for clf in classifiers.values()]
errors = [np.std(scores) for scores in cv_scores]
mean_scores = [np.mean(scores) for scores in cv_scores]

# Bar plot: bar height is the held-out test F2, error bars are the CV standard deviation
plt.figure(figsize=(10, 6))
names, values = list(f2_scores.keys()), list(f2_scores.values())
plt.bar(names, values, yerr=errors, capsize=10, color='skyblue')

# Label each bar with the value it actually shows, plus the CV mean for reference
for i, (test_f2, cv_mean, cv_std) in enumerate(zip(values, mean_scores, errors)):
    plt.text(i, test_f2 + cv_std + 0.01, f'test {test_f2:.2f} (CV {cv_mean:.2f})', ha='center', va='bottom')

plt.xlabel('Classifier')
plt.ylabel('F2 Score')
plt.title('F2 Score of Different Classifiers')
plt.ylim([0, 1])
plt.show()