# Machine Learning

---

In [125]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import accuracy_score

import warnings

def ignore_warn(*args, **kwargs):
    """Accept any positional/keyword arguments, do nothing, and return None."""
    return None


# Replace `warnings.warn` with the no-op above, so every later call made through
# `warnings.warn` is silently discarded for the rest of the session.
# NOTE(review): this hides all such warnings, not only the noisy library ones.
warnings.warn = ignore_warn

## Step 1: Decision making: which is the best dataset?



In [126]:
def _read_processed(name):
    """Read '../data/processed/<name>.csv' and return it as a DataFrame."""
    return pd.read_csv(f'../data/processed/{name}.csv')


# Each CSV file stem matches the variable it is loaded into.

# Train data frames
X_train_with_outliers_sel = _read_processed('X_train_with_outliers_sel')
X_train_without_outliers_sel = _read_processed('X_train_without_outliers_sel')
X_train_with_outliers_norm_sel = _read_processed('X_train_with_outliers_norm_sel')
X_train_without_outliers_norm_sel = _read_processed('X_train_without_outliers_norm_sel')
X_train_with_outliers_minmax_sel = _read_processed('X_train_with_outliers_minmax_sel')
X_train_without_outliers_minmax_sel = _read_processed('X_train_without_outliers_minmax_sel')
y_train = _read_processed('y_train')

# Test data frames
X_test_with_outliers_sel = _read_processed('X_test_with_outliers_sel')
X_test_without_outliers_sel = _read_processed('X_test_without_outliers_sel')
X_test_with_outliers_norm_sel = _read_processed('X_test_with_outliers_norm_sel')
X_test_without_outliers_norm_sel = _read_processed('X_test_without_outliers_norm_sel')
X_test_with_outliers_minmax_sel = _read_processed('X_test_with_outliers_minmax_sel')
X_test_without_outliers_minmax_sel = _read_processed('X_test_without_outliers_minmax_sel')
y_test = _read_processed('y_test')

In [127]:
# Name -> frame lookups for the six feature-set variants of each split.
# Both mappings list the variants in the same order, so the entry at a given
# position in one corresponds to the entry at that position in the other.
train_dicts = {
    "X_train_with_outliers_sel": X_train_with_outliers_sel,
    "X_train_without_outliers_sel": X_train_without_outliers_sel,
    "X_train_with_outliers_norm_sel": X_train_with_outliers_norm_sel,
    "X_train_without_outliers_norm_sel": X_train_without_outliers_norm_sel,
    "X_train_with_outliers_minmax_sel": X_train_with_outliers_minmax_sel,
    "X_train_without_outliers_minmax_sel": X_train_without_outliers_minmax_sel,
}

test_dicts = {
    "X_test_with_outliers_sel": X_test_with_outliers_sel,
    "X_test_without_outliers_sel": X_test_without_outliers_sel,
    "X_test_with_outliers_norm_sel": X_test_with_outliers_norm_sel,
    "X_test_without_outliers_norm_sel": X_test_without_outliers_norm_sel,
    "X_test_with_outliers_minmax_sel": X_test_with_outliers_minmax_sel,
    "X_test_without_outliers_minmax_sel": X_test_without_outliers_minmax_sel,
}

# Positional views of the very same frames, in the mappings' insertion order.
train_dfs = list(train_dicts.values())
test_dfs = list(test_dicts.values())

def _print_shapes(frames_by_name):
    """Print one "<name> shape: <shape>" line per frame in the given mapping."""
    last_position = len(frames_by_name) - 1
    for position, (frame_name, frame) in enumerate(frames_by_name.items()):
        # Every line ends with a single space, except the last line of the group,
        # which ends with a newline plus a space to leave a visual gap.
        ending = "\n " if position == last_position else " "
        print(f"{frame_name} shape: {frame.shape}{ending}")


# Report the train group first, then the test group.
_print_shapes(train_dicts)
_print_shapes(test_dicts)

X_train_with_outliers_sel shape: (614, 8) 
X_train_without_outliers_sel shape: (614, 8) 
X_train_with_outliers_norm_sel shape: (614, 8) 
X_train_without_outliers_norm_sel shape: (614, 8) 
X_train_with_outliers_minmax_sel shape: (614, 8) 
X_train_without_outliers_minmax_sel shape: (614, 8)
 
X_test_with_outliers_sel shape: (154, 8) 
X_test_without_outliers_sel shape: (154, 8) 
X_test_with_outliers_norm_sel shape: (154, 8) 
X_test_without_outliers_norm_sel shape: (154, 8) 
X_test_with_outliers_minmax_sel shape: (154, 8) 
X_test_without_outliers_minmax_sel shape: (154, 8)
 


In [128]:
# Fit one decision tree (fixed random_state) per candidate training frame and
# score its predictions on the test frame at the same position.
# NOTE(review): the candidates are ranked by accuracy on y_test, so the same test
# data both selects the dataset and reports its score — consider ranking on a
# validation split or with cross-validation instead.
results = []

for df_index, train_name in enumerate(train_dicts):
    train_df = train_dfs[df_index]

    # Model initialization and training
    model = DecisionTreeClassifier(random_state = 42)
    model.fit(train_df, y_train)

    # Model prediction
    y_test_pred = model.predict(test_dfs[df_index])

    results.append({
        "index": df_index,
        "train_df": train_name,
        "Accuracy_score": accuracy_score(y_test, y_test_pred),
    })

# Highest accuracy first; the sort is stable, so tied entries keep loop order.
resultados = list(results)
resultados.sort(key = lambda row: row["Accuracy_score"], reverse = True)
resultados

[{'index': 0,
  'train_df': 'X_train_with_outliers_sel',
  'Accuracy_score': 0.7467532467532467},
 {'index': 2,
  'train_df': 'X_train_with_outliers_norm_sel',
  'Accuracy_score': 0.7467532467532467},
 {'index': 4,
  'train_df': 'X_train_with_outliers_minmax_sel',
  'Accuracy_score': 0.7467532467532467},
 {'index': 1,
  'train_df': 'X_train_without_outliers_sel',
  'Accuracy_score': 0.7402597402597403},
 {'index': 3,
  'train_df': 'X_train_without_outliers_norm_sel',
  'Accuracy_score': 0.7402597402597403},
 {'index': 5,
  'train_df': 'X_train_without_outliers_minmax_sel',
  'Accuracy_score': 0.7402597402597403}]

In [129]:
# The first entry of the sorted results is the top-ranked training frame.
best_run = resultados[0]
print(
    f"The best train dataframe is |{best_run['train_df']}|.\n"
    "=======================================================      \n"
    f"| Accuracy score: {best_run['Accuracy_score']}   |\n"
    "========================================"
)

The best train dataframe is |X_train_with_outliers_sel|.
| Accuracy score: 0.7467532467532467   |


## Step 2: Model hyperparameter optimization

- ### 2.1 GridSearchCV