In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt

from copy import deepcopy
from sklearn.preprocessing import LabelEncoder

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor 
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, r2_score, mean_squared_error

In [2]:
input_file = "data_and_corr_pvals.xlsx"
input_data_sheet = "data"
input_corr_sheet = "corr_and_pvals"

output_file = "ML models report.xlsx"

In [3]:
data = pd.read_excel(input_file, sheet_name=input_data_sheet)
corr_data = pd.read_excel(input_file, sheet_name=input_corr_sheet)

if "Unnamed: 0" in data.columns: # Removing the index column
    data.drop("Unnamed: 0", axis=1, inplace=True)

if "Unnamed: 0" in corr_data.columns: # Removing the index column
    corr_data = corr_data.rename(columns={"Unnamed: 0": "column_name"})

In [4]:
le = LabelEncoder()
categ=['marital_status', 'retired', 'volunteer',
       'other_dementia_risk_programs', 
       'articles_about_dementia_risk', 'active_member_club_group',
        'meeting_club_group_how_often_month',
       'host_visitors_how_often', 'outing_family_friend_how_often', 'event_movie_cinema_normal',
       'event_play_drama_normal', 'event_pub_rsl_normal',
       'event_music_recital_normal', 'event_special_performance_normal',
       'event_dancing_normal', 'event_visiting_friends_normal',
       'event_sporting_event_normal', 'event_restaurant_normal',
       'language_english_only', 'dementia_diagnosis',
       'memory_impairment_diagnosis', 'memory_change',
       'dementia_family_history', 'heart_disease_type',
       'cancer_type', 'pysch_diagnosis', 'visual_legally_blind',
       'visual_corrective_glasses', 'head_injury', 'head_injury_severity',
       'epilepsy_diagnosis', 'kidney_disease_diagnosis',
       'liver_disease_diagnosis', 'stroke_tia_attack', 'cns_diagnosis',
       'cns_diagnosis_disorder', 'b12_deficiency_diagnosis',
       'delerium_diagnosis', 'hearing_impairment', 'medications_new_since_last_surveys',
        'medications', 'visual_colour_blind',
       'gender']
dfx = data
dfx[categ] = data[categ].apply(le.fit_transform)

In [5]:
total_cols = len(corr_data["column_name"])
cols_to_take = gap = 5

In [6]:
def get_classification_report_row(classification_report, best_params=""):
    data_dict = {
                    "Precision": round(classification_report["weighted avg"]["precision"], 5),
                    "Recall": round(classification_report["weighted avg"]["recall"], 5),
                    "f1-score": round(classification_report["weighted avg"]["f1-score"], 5),
#                     "support": round(classification_report["weighted avg"]["support"], 3),
                    "Accuracy": round(classification_report["accuracy"], 5)
                }
    if best_params:
        data_dict["best_params"] = best_params
    return data_dict

In [7]:
def logistic_regression_with_ht(X_train, X_test, y_train, y_test):
    tuned_parameters={"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}
    model=GridSearchCV(LogisticRegression(), tuned_parameters, cv=10)
    model.fit(X_train,y_train)
    print("tuned hyperparameters for logistic regression:",model.best_params_)
    print("accuracy :",model.best_score_)
    lr_model=LogisticRegression(C=model.best_params_["C"],penalty=model.best_params_["penalty"])
    lr_model.fit(X_train,y_train)
    predictions = lr_model.predict(X_test)
    best_params = """C={0}\npenalty={1}""".format(model.best_params_["C"],model.best_params_["penalty"])
    return get_classification_report_row(classification_report(y_test, predictions, output_dict=True), best_params)

In [8]:
total_cols

50

In [9]:
lr_report_data, dt_report_data, rf_report_data, rf_regressor_report_data, lr_regressor_report_data, report_index = [], [], [], [], [], []

while cols_to_take <= total_cols+gap:
    # Data for classification task 
    data_to_consider = deepcopy(dfx)
    cols = list(corr_data["column_name"].iloc[:cols_to_take])
    print("Taking {0} cols -> {1}\n".format(len(cols), ", ".join(cols)))
    cols.append("memory_change")
    data_to_consider.drop(list(set(list(data_to_consider.columns)).difference(set(cols))), inplace=True, axis=1)
    report_index.append("{0}_cols".format(len(data_to_consider.columns)-1))
    X_train, X_test, y_train, y_test = train_test_split(data_to_consider.drop(columns=['memory_change'], axis=1), data_to_consider['memory_change'], test_size=0.2, random_state=42)
    
    # Logistic Regression
    lr1 = LogisticRegression()
    lr1.fit(X_train, y_train.ravel())
    predictions = lr1.predict(X_test)
    lr_report_data.append(get_classification_report_row(classification_report(y_test, predictions, output_dict=True)))
    
    # Decision Tree Classifier
    dt_classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
    dt_classifier.fit(X_train, y_train)
    predictions = dt_classifier.predict(X_test)
    dt_report_data.append(get_classification_report_row(classification_report(y_test, predictions, output_dict=True)))
    
    # Random Forest Classifier
    modelRf = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 0)
    modelRf.fit(X_train, y_train)
    y_pred = modelRf.predict(X_test)
    rf_report_data.append(get_classification_report_row(classification_report(y_test, y_pred, output_dict=True)))
    
    # Random Forest Regression
    cols.remove("memory_change")
    cols.append("PALFAMS28Percentile")
    data_to_consider = deepcopy(dfx)
    data_to_consider.drop(list(set(list(data_to_consider.columns)).difference(set(cols))), inplace=True, axis=1)
    data_to_consider.dropna(subset=["PALFAMS28Percentile"], axis=0, inplace=True)
    X_train, X_test, y_train, y_test = train_test_split(data_to_consider.drop(columns=['PALFAMS28Percentile'], axis=1), data_to_consider['PALFAMS28Percentile'], test_size=0.2, random_state=42)
    regressor = RandomForestRegressor(n_estimators = 100, random_state = 0)
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    rf_regressor_report_data.append({"Mean sq. error": mean_squared_error(y_test, y_pred),
                                     "Train Data Size": len(X_train),
                                     "Test Data Size": len(X_test)
                                    })
    
    # Linear Regression
    lr_regressor = LinearRegression()
    lr_regressor.fit(X_train, y_train)
    y_pred = lr_regressor.predict(X_test)
    lr_regressor_report_data.append({"Mean sq. error": mean_squared_error(y_test, y_pred),
                                     "Train Data Size": len(X_train),
                                     "Test Data Size": len(X_test)
                                    })
        
    # For next iterations
    cols_to_take += 5

Taking 5 cols -> memory_change_discussed, pysch_diagnosis, stroke_tia_attack, medications_new_since_last_surveys, head_injury

Taking 10 cols -> memory_change_discussed, pysch_diagnosis, stroke_tia_attack, medications_new_since_last_surveys, head_injury, head_injury_severity, marital_status, hearing_impairment, medications, event_movie_cinema_normal



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Taking 15 cols -> memory_change_discussed, pysch_diagnosis, stroke_tia_attack, medications_new_since_last_surveys, head_injury, head_injury_severity, marital_status, hearing_impairment, medications, event_movie_cinema_normal, visual_legally_blind, hearing_impairment_correction, kidney_disease_diagnosis, dementia_family_history, host_visitors_how_often



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Taking 20 cols -> memory_change_discussed, pysch_diagnosis, stroke_tia_attack, medications_new_since_last_surveys, head_injury, head_injury_severity, marital_status, hearing_impairment, medications, event_movie_cinema_normal, visual_legally_blind, hearing_impairment_correction, kidney_disease_diagnosis, dementia_family_history, host_visitors_how_often, heart_disease_diagnosis, event_restaurant_normal, dementia_diagnosis, visual_corrective_glasses, event_visiting_friends_normal



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Taking 25 cols -> memory_change_discussed, pysch_diagnosis, stroke_tia_attack, medications_new_since_last_surveys, head_injury, head_injury_severity, marital_status, hearing_impairment, medications, event_movie_cinema_normal, visual_legally_blind, hearing_impairment_correction, kidney_disease_diagnosis, dementia_family_history, host_visitors_how_often, heart_disease_diagnosis, event_restaurant_normal, dementia_diagnosis, visual_corrective_glasses, event_visiting_friends_normal, gender, heart_disease_type, employed, cns_diagnosis, event_pub_rsl_normal



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Taking 30 cols -> memory_change_discussed, pysch_diagnosis, stroke_tia_attack, medications_new_since_last_surveys, head_injury, head_injury_severity, marital_status, hearing_impairment, medications, event_movie_cinema_normal, visual_legally_blind, hearing_impairment_correction, kidney_disease_diagnosis, dementia_family_history, host_visitors_how_often, heart_disease_diagnosis, event_restaurant_normal, dementia_diagnosis, visual_corrective_glasses, event_visiting_friends_normal, gender, heart_disease_type, employed, cns_diagnosis, event_pub_rsl_normal, articles_about_dementia_risk, epilepsy_diagnosis, language_english_only, event_play_drama_normal, memory_impairment_diagnosis



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Taking 35 cols -> memory_change_discussed, pysch_diagnosis, stroke_tia_attack, medications_new_since_last_surveys, head_injury, head_injury_severity, marital_status, hearing_impairment, medications, event_movie_cinema_normal, visual_legally_blind, hearing_impairment_correction, kidney_disease_diagnosis, dementia_family_history, host_visitors_how_often, heart_disease_diagnosis, event_restaurant_normal, dementia_diagnosis, visual_corrective_glasses, event_visiting_friends_normal, gender, heart_disease_type, employed, cns_diagnosis, event_pub_rsl_normal, articles_about_dementia_risk, epilepsy_diagnosis, language_english_only, event_play_drama_normal, memory_impairment_diagnosis, event_sporting_event_normal, other_dementia_risk_programs, meeting_club_group_how_often_month, cancer_diagnosis, b12_deficiency_diagnosis



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Taking 40 cols -> memory_change_discussed, pysch_diagnosis, stroke_tia_attack, medications_new_since_last_surveys, head_injury, head_injury_severity, marital_status, hearing_impairment, medications, event_movie_cinema_normal, visual_legally_blind, hearing_impairment_correction, kidney_disease_diagnosis, dementia_family_history, host_visitors_how_often, heart_disease_diagnosis, event_restaurant_normal, dementia_diagnosis, visual_corrective_glasses, event_visiting_friends_normal, gender, heart_disease_type, employed, cns_diagnosis, event_pub_rsl_normal, articles_about_dementia_risk, epilepsy_diagnosis, language_english_only, event_play_drama_normal, memory_impairment_diagnosis, event_sporting_event_normal, other_dementia_risk_programs, meeting_club_group_how_often_month, cancer_diagnosis, b12_deficiency_diagnosis, active_member_club_group_num, event_special_performance_normal, cns_diagnosis_disorder, event_dancing_normal, event_music_recital_normal



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Taking 45 cols -> memory_change_discussed, pysch_diagnosis, stroke_tia_attack, medications_new_since_last_surveys, head_injury, head_injury_severity, marital_status, hearing_impairment, medications, event_movie_cinema_normal, visual_legally_blind, hearing_impairment_correction, kidney_disease_diagnosis, dementia_family_history, host_visitors_how_often, heart_disease_diagnosis, event_restaurant_normal, dementia_diagnosis, visual_corrective_glasses, event_visiting_friends_normal, gender, heart_disease_type, employed, cns_diagnosis, event_pub_rsl_normal, articles_about_dementia_risk, epilepsy_diagnosis, language_english_only, event_play_drama_normal, memory_impairment_diagnosis, event_sporting_event_normal, other_dementia_risk_programs, meeting_club_group_how_often_month, cancer_diagnosis, b12_deficiency_diagnosis, active_member_club_group_num, event_special_performance_normal, cns_diagnosis_disorder, event_dancing_normal, event_music_recital_normal, retired, visual_colour_blind, active_m

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Taking 50 cols -> memory_change_discussed, pysch_diagnosis, stroke_tia_attack, medications_new_since_last_surveys, head_injury, head_injury_severity, marital_status, hearing_impairment, medications, event_movie_cinema_normal, visual_legally_blind, hearing_impairment_correction, kidney_disease_diagnosis, dementia_family_history, host_visitors_how_often, heart_disease_diagnosis, event_restaurant_normal, dementia_diagnosis, visual_corrective_glasses, event_visiting_friends_normal, gender, heart_disease_type, employed, cns_diagnosis, event_pub_rsl_normal, articles_about_dementia_risk, epilepsy_diagnosis, language_english_only, event_play_drama_normal, memory_impairment_diagnosis, event_sporting_event_normal, other_dementia_risk_programs, meeting_club_group_how_often_month, cancer_diagnosis, b12_deficiency_diagnosis, active_member_club_group_num, event_special_performance_normal, cns_diagnosis_disorder, event_dancing_normal, event_music_recital_normal, retired, visual_colour_blind, active_m

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Taking 50 cols -> memory_change_discussed, pysch_diagnosis, stroke_tia_attack, medications_new_since_last_surveys, head_injury, head_injury_severity, marital_status, hearing_impairment, medications, event_movie_cinema_normal, visual_legally_blind, hearing_impairment_correction, kidney_disease_diagnosis, dementia_family_history, host_visitors_how_often, heart_disease_diagnosis, event_restaurant_normal, dementia_diagnosis, visual_corrective_glasses, event_visiting_friends_normal, gender, heart_disease_type, employed, cns_diagnosis, event_pub_rsl_normal, articles_about_dementia_risk, epilepsy_diagnosis, language_english_only, event_play_drama_normal, memory_impairment_diagnosis, event_sporting_event_normal, other_dementia_risk_programs, meeting_club_group_how_often_month, cancer_diagnosis, b12_deficiency_diagnosis, active_member_club_group_num, event_special_performance_normal, cns_diagnosis_disorder, event_dancing_normal, event_music_recital_normal, retired, visual_colour_blind, active_m

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
lr_report_data

[{'Precision': 0.99451,
  'Recall': 0.99448,
  'f1-score': 0.99443,
  'Accuracy': 0.99448},
 {'Precision': 0.99451,
  'Recall': 0.99448,
  'f1-score': 0.99443,
  'Accuracy': 0.99448},
 {'Precision': 0.99451,
  'Recall': 0.99448,
  'f1-score': 0.99443,
  'Accuracy': 0.99448},
 {'Precision': 0.99451,
  'Recall': 0.99448,
  'f1-score': 0.99443,
  'Accuracy': 0.99448},
 {'Precision': 0.99179,
  'Recall': 0.99171,
  'f1-score': 0.99162,
  'Accuracy': 0.99171},
 {'Precision': 0.99179,
  'Recall': 0.99171,
  'f1-score': 0.99162,
  'Accuracy': 0.99171},
 {'Precision': 0.99179,
  'Recall': 0.99171,
  'f1-score': 0.99162,
  'Accuracy': 0.99171},
 {'Precision': 0.99451,
  'Recall': 0.99448,
  'f1-score': 0.99443,
  'Accuracy': 0.99448},
 {'Precision': 0.99451,
  'Recall': 0.99448,
  'f1-score': 0.99443,
  'Accuracy': 0.99448},
 {'Precision': 0.99451,
  'Recall': 0.99448,
  'f1-score': 0.99443,
  'Accuracy': 0.99448},
 {'Precision': 0.99451,
  'Recall': 0.99448,
  'f1-score': 0.99443,
  'Accuracy'

In [11]:
lr_report = pd.DataFrame(lr_report_data, index=report_index)
dt_report = pd.DataFrame(dt_report_data, index=report_index)
rf_report = pd.DataFrame(rf_report_data, index=report_index)
rf_regressor = pd.DataFrame(rf_regressor_report_data, index=report_index)
lr_regressor_df = pd.DataFrame(lr_regressor_report_data, index = report_index)

In [12]:
with pd.ExcelWriter(output_file) as writer:
    lr_report.to_excel(writer, sheet_name="Logistic Regression", index=True, header=True)
    dt_report.to_excel(writer, sheet_name="Decision Tree", index=True, header=True)
    rf_report.to_excel(writer, sheet_name="Random Forest Classifier", index=True, header=True)
    rf_regressor.to_excel(writer, sheet_name="Random Forest Regressor", index=True, header=True)
    lr_regressor_df.to_excel(writer, sheet_name="Linear Regressor", index=True, header=True)

In [13]:
dir(lr_regressor)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_feature_names',
 '_check_n_features',
 '_decision_function',
 '_estimator_type',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_preprocess_data',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_residues',
 '_set_intercept',
 '_validate_data',
 'coef_',
 'copy_X',
 'feature_names_in_',
 'fit',
 'fit_intercept',
 'get_params',
 'intercept_',
 'n_features_in_',
 'n_jobs',
 'normalize',
 'positive',
 'predict',
 'rank_',
 'score',
 'set_params',
 'singular_']