# Model Research

Debido a la naturaleza del dataset, se probarán los siguientes modelos:
- RandomForestClassifier: Debido a su adaptabilidad en problemas de clasificación multiclase, robustez al sobreajuste y capacidad para manejar datos desbalanceados.
- XGBoostClassifier: Debido a su precisión en el entrenamiento gracias al boost de clases, soporte para multiclase con multi:softmax y escalabilidad.
- AdaBoostClassifier: Debido a su enfoque en muestras difíciles para mejorar clases complicadas y simplicidad en problemas multiclase con SAMME.

## Import libraries

In [2]:
# import needed libraries
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_curve
from xgboost import XGBClassifier
# split data set
from sklearn.model_selection import train_test_split
from helpers import basic_eda as beda

## Process unbalanced dataframe

In [3]:
# import dataset
curated_data = pd.read_csv('../data/curated/curated_records.csv')
curated_data.head()

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,serum_cholesterol,fasting_blood_sugar_g_120,resting_egc_results,maximum_heart_rate,exersize_induced_angina,st_depression,slope_of_peak_exercise_st_segment,major_vessels,thalassemia,target
0,63,male,typical angina,145,233,True,left_ventricular_hypertrophy,150,False,2.3,downsloping,0.0,fixed_defect,0
1,67,male,asymptomatic,160,286,False,left_ventricular_hypertrophy,108,True,1.5,flat,3.0,normal,2
2,67,male,asymptomatic,120,229,False,left_ventricular_hypertrophy,129,True,2.6,flat,2.0,reversible_defect,1
3,37,male,non-anginal pain,130,250,False,normal,187,False,3.5,downsloping,0.0,normal,0
4,41,female,atypical angina,130,204,False,left_ventricular_hypertrophy,172,False,1.4,upsloping,0.0,normal,0


In [4]:
curated_data.value_counts('target')

target
0    160
1     54
2     35
3     35
4     13
Name: count, dtype: int64

In [5]:
categorical_columns = curated_data.columns
categorical_columns
curated_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297 entries, 0 to 296
Data columns (total 14 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   age                                297 non-null    int64  
 1   sex                                297 non-null    object 
 2   chest_pain_type                    297 non-null    object 
 3   resting_blood_pressure             297 non-null    int64  
 4   serum_cholesterol                  297 non-null    int64  
 5   fasting_blood_sugar_g_120          297 non-null    bool   
 6   resting_egc_results                297 non-null    object 
 7   maximum_heart_rate                 297 non-null    int64  
 8   exersize_induced_angina            297 non-null    bool   
 9   st_depression                      297 non-null    float64
 10  slope_of_peak_exercise_st_segment  297 non-null    object 
 11  major_vessels                      297 non-null    float64

In [6]:
dummy_columns = ['sex', 'chest_pain_type', 'fasting_blood_sugar_g_120', 'resting_egc_results', 'exersize_induced_angina', 'slope_of_peak_exercise_st_segment', 'thalassemia']

In [7]:
# get dummies for categorical variables
dummy_data = pd.get_dummies(curated_data, drop_first=True, 
               columns= dummy_columns, 
               dtype=int)

In [8]:
dummy_data.shape

(297, 19)

In [9]:
train_validation, test = train_test_split(dummy_data, 
                                          test_size=0.2, 
                                          random_state=42,
                                          stratify=dummy_data['target']
                                          )

In [10]:
train, validation = train_test_split(train_validation,
                                     test_size=0.25, 
                                     random_state=42,
                                     stratify=train_validation['target'] 
                                     )

In [11]:
train.shape

(177, 19)

In [12]:
# separate features and target
X_train, Y_train = beda.separate_target(train, 'target')
X_validation, Y_validation = beda.separate_target(validation, 'target')
X_test, Y_test = beda.separate_target(test, 'target')


## Model creation

In [13]:
baggin_clf = BaggingClassifier(estimator=DecisionTreeClassifier(max_features='sqrt',
                                                                max_depth=5,
                                                                max_leaf_nodes=5, 
                                                                splitter='random'),
                               n_estimators=1000,
                               n_jobs=-1,
                               random_state=42,
                               bootstrap=True)

forest_clf = RandomForestClassifier(n_estimators=1000,
                                    max_depth=5,
                                    random_state=42,
                                    n_jobs=-1,
                                    max_features='sqrt',
                                    max_leaf_nodes=5,
                                    bootstrap=True)
ada_clf = AdaBoostClassifier(n_estimators=5,
                            learning_rate=0.01,
                            estimator=RandomForestClassifier(max_features='sqrt',
                                                            max_leaf_nodes=5,
                                                            max_depth=5,
                                                            n_jobs=-1,
                                                            n_estimators=1000),
                            random_state=42)
xgb_clf = XGBClassifier(n_estimators=1000,
                        random_state=123,
                        n_jobs=-1,
                        objective='multi:softmax',
                        eta=0.1,
                        booster='dart',
                        max_depth=10,
                        subsample=0.9,
                        alpha=0.1,
                        tree_method='hist',
                        num_class= 4,
                        eval_metric='mlogloss',
                        gamma=0.1,
                        min_child_weight=1                 
)

## Fit and evaluate models for original dataframe

In [14]:
# fit models
forest_clf.fit(X_train, Y_train)
ada_clf.fit(X_train, Y_train)
xgb_clf.fit(X_train, Y_train)
baggin_clf.fit(X_train, Y_train)

In [15]:
# evaluate models
forest_data = {'name': 'Random Forest', 
               'model': forest_clf,
               'X_validation': X_validation, 
               'Y_validation': Y_validation}
ada_data = {'name': 'AdaBoost',
             'model': ada_clf,
             'X_validation': X_validation, 
             'Y_validation': Y_validation}
prediction_forest = beda.evaluate_model(**forest_data)
prediction_ada = beda.evaluate_model(**ada_data)
xgb_data = {'name': 'XGBoost',
             'model': xgb_clf,
             'X_validation': X_validation, 
             'Y_validation': Y_validation}
prediction_xgb = beda.evaluate_model(**xgb_data)
bagging_data = {'name': 'Bagging',
                'model': baggin_clf,
                'X_validation': X_validation, 
                'Y_validation': Y_validation}
prediction_bagging = beda.evaluate_model(**bagging_data)

.:EVALUATING MODEL: Random Forest:.
Predicciones
Predicción clase 0: 54
Predicción clase 1: 5
Predicción clase 2: 1
Métricas
F1 Score: 0.4198158914728682
F1 Score (macro): 0.17383720930232557
Precision: 0.352716049382716
Precision (macro): 0.1585185185185185
Recall: 0.55
Recall (macro): 0.21818181818181817
Score: 0.55
.:EVALUATING MODEL: AdaBoost:.
Predicciones
Predicción clase 0: 52
Predicción clase 1: 3
Predicción clase 2: 4
Predicción clase 3: 1
Métricas
F1 Score: 0.474963924963925
F1 Score (macro): 0.25367965367965367
Precision: 0.44764957264957267
Precision (macro): 0.2897435897435897
Recall: 0.5833333333333334
Recall (macro): 0.2753246753246753
Score: 0.5833333333333334
.:EVALUATING MODEL: XGBoost:.
Predicciones
Predicción clase 0: 36
Predicción clase 1: 10
Predicción clase 2: 5
Predicción clase 3: 8
Predicción clase 4: 1
Métricas
F1 Score: 0.549561157796452
F1 Score (macro): 0.4102521008403361
Precision: 0.5537499999999999
Precision (macro): 0.505
Recall: 0.5666666666666667
Reca

## Test for an oversampled dataset

According to the information the best model with an unbalanced dataset is xgboost so now I´ll try using a oversampling algrithm to increase algorithms eficiency.


In [16]:
import numpy as np

In [17]:
oversampled_df = beda.oversample(curated_data)
# Get the counts of each class
counts = oversampled_df['target'].value_counts()
majority_class = counts.idxmax()
minority_class = counts.idxmin()
while np.abs(counts[minority_class] - counts[majority_class]) > 15:
    oversampled_df = beda.oversample(oversampled_df)
    # Get the counts of each class
    counts = oversampled_df['target'].value_counts()
    majority_class = counts.idxmax()
    minority_class = counts.idxmin()
    if np.floor(counts[majority_class]/counts[minority_class]) == 1:
        break

In [18]:
# create dummies
dummy_columns = ['sex', 'chest_pain_type', 'fasting_blood_sugar_g_120', 'resting_egc_results', 'exersize_induced_angina', 'slope_of_peak_exercise_st_segment', 'thalassemia']


In [19]:
# get dummies for categorical variables
dummy_data_os = pd.get_dummies(oversampled_df, drop_first=True, 
               columns= dummy_columns, 
               dtype=int)

In [20]:
from helpers import basic_eda as beda
from sklearn.model_selection import train_test_split

In [21]:
#divide data into train, validation and test sets
train_validation, test = train_test_split(dummy_data,
                                          test_size=0.25, 
                                          random_state=42,
                                          stratify=dummy_data['target']
                                          )
train, validation = train_test_split(train_validation,
                                     test_size=0.25, 
                                     random_state=42,
                                     stratify=train_validation['target'] 
                                     )

In [22]:
# oversampled data
train_over = beda.oversample(train)

In [23]:
# separate features from target
X_train, Y_train = beda.separate_target(train_over, 'target')
X_validation, Y_validation = beda.separate_target(validation, 'target')
X_test, Y_test = beda.separate_target(test, 'target')


In [24]:
train_over.value_counts('target')

target
0    90
4    88
1    30
2    19
3    19
Name: count, dtype: int64

In [25]:
#train model
forest_clf.fit(X_train, Y_train)


In [26]:
ada_clf.fit(X_train, Y_train)


In [27]:
xgb_clf.fit(X_train, Y_train)


In [28]:
baggin_clf.fit(X_train, Y_train)

In [29]:
# evaluate models
forest_data = {'name': 'Random Forest', 
               'model': forest_clf,
               'X_validation': X_validation, 
               'Y_validation': Y_validation}
ada_data = {'name': 'AdaBoost',
             'model': ada_clf,
             'X_validation': X_validation, 
             'Y_validation': Y_validation}
prediction_forest = beda.evaluate_model(**forest_data)
prediction_ada = beda.evaluate_model(**ada_data)


.:EVALUATING MODEL: Random Forest:.
Predicciones
Predicción clase 0: 43
Predicción clase 4: 13
Métricas
F1 Score: 0.43515981735159814
F1 Score (macro): 0.21223744292237443
Precision: 0.3667901865576284
Precision (macro): 0.1656529516994633
Recall: 0.5535714285714286
Recall (macro): 0.39333333333333337
Score: 0.5535714285714286
.:EVALUATING MODEL: AdaBoost:.
Predicciones
Predicción clase 0: 46
Predicción clase 4: 10
Métricas
F1 Score: 0.42073934837092736
F1 Score (macro): 0.2192982456140351
Precision: 0.34487577639751554
Precision (macro): 0.16608695652173916
Recall: 0.5535714285714286
Recall (macro): 0.39333333333333337
Score: 0.5535714285714286


In [30]:
xgb_data = {'name': 'XGBoost',
             'model': xgb_clf,
             'X_validation': X_validation, 
             'Y_validation': Y_validation}
prediction_xgb = beda.evaluate_model(**xgb_data)


.:EVALUATING MODEL: XGBoost:.
Predicciones
Predicción clase 0: 33
Predicción clase 1: 10
Predicción clase 2: 6
Predicción clase 3: 3
Predicción clase 4: 4
Métricas
F1 Score: 0.5541077969649397
F1 Score (macro): 0.3869352869352869
Precision: 0.5516774891774892
Precision (macro): 0.39484848484848484
Recall: 0.5714285714285714
Recall (macro): 0.41238095238095235
Score: 0.5714285714285714


In [31]:
bagging_data = {'name': 'Bagging',
                'model': baggin_clf,
                'X_validation': X_validation, 
                'Y_validation': Y_validation}
prediction_bagging = beda.evaluate_model(**bagging_data)

.:EVALUATING MODEL: Bagging:.
Predicciones
Predicción clase 0: 41
Predicción clase 4: 15
Métricas
F1 Score: 0.4418274351994319
F1 Score (macro): 0.18690969345484673
Precision: 0.38130081300813007
Precision (macro): 0.15479674796747966
Recall: 0.5357142857142857
Recall (macro): 0.29333333333333333
Score: 0.5357142857142857


In [32]:
from sklearn.metrics import classification_report

In [33]:
pd.DataFrame(classification_report(Y_validation, prediction_xgb, output_dict=True)).T.to_latex('../Report_2/tables/xgb.tex', float_format='{:.2f}'.format)

In [34]:

pd.DataFrame(classification_report(Y_validation, prediction_forest, output_dict=True)).T.to_latex('../Report_2/tables/forest.tex', float_format='{:.2f}'.format)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [35]:
pd.DataFrame(classification_report(Y_validation, prediction_ada, output_dict=True)).T.to_latex('../Report_2/tables/ada.tex', float_format='{:.2f}'.format)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [36]:
pd.DataFrame(classification_report(Y_validation, prediction_bagging, output_dict=True)).T.to_latex('../Report_2/tables/baggin.tex', float_format='{:.2f}'.format)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [37]:
%pip install jinja2

Note: you may need to restart the kernel to use updated packages.
