## Model Training

Let's load the pre-split train and test sets and separate the features from the `Outcome` target.

In [164]:
import pandas as pd

# Read the cleaned train and test tables from the processed-data folder.
train_data = pd.read_csv("../data/processed/clean_train.csv")
test_data = pd.read_csv("../data/processed/clean_test.csv")

# For each table, "Outcome" is the target; every other column is a feature.
X_train, y_train = train_data.drop(columns = "Outcome"), train_data["Outcome"]
X_test, y_test = test_data.drop(columns = "Outcome"), test_data["Outcome"]

In [165]:
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

# First pass: a random forest with library-default hyperparameters.
# Only the seed is pinned, so repeated runs build the same forest.
model = RandomForestClassifier(random_state=666)
# Kept as the final bare expression so the notebook displays the fitted model.
model.fit(X=X_train, y=y_train)

In [166]:
from sklearn.metrics import accuracy_score

# Predict the test rows with the default-hyperparameter forest ...
y_pred = model.predict(X_test)

# ... and show the fraction of test labels predicted correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8

### Model Optimization

In [167]:
from sklearn.model_selection import GridSearchCV

# Candidate values for each tuned hyperparameter. The grid is their full
# Cartesian product: 4 * 4 * 4 * 4 = 256 combinations.
hyperparams = {
    "min_samples_leaf": [1, 2, 4, 8],
    "min_samples_split": [2, 5, 10, 20],
    "n_estimators": [10, 20, 40, 100],
    "max_depth": [None, 5, 10, 20],
}

# 256 candidates x 10 folds = 2,560 independent forest fits. n_jobs=-1 runs
# them on all available CPU cores instead of one after another. `model` was
# built with a fixed random_state, so the selected parameters and scores do
# not depend on how the work is scheduled.
grid = GridSearchCV(model, hyperparams, scoring = "accuracy", cv = 10, n_jobs = -1)
grid.fit(X_train, y_train)

# best_score_ is the mean cross-validated accuracy of the winning combination,
# measured on the training data only (the test split is not used here).
print(f"Best hyperparameters: {grid.best_params_}")
print(f"Best score: {grid.best_score_}")



Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best score: 0.7687537810042347


In [168]:
# Final forest, built from hand-written settings.
# NOTE(review): these values are not the grid-search winner printed earlier
# (max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100),
# and the seed (42) differs from the 666 used for the first model. Confirm the
# choice is intentional and was not tuned against the test split; otherwise
# consider `grid.best_estimator_`.
model = RandomForestClassifier(
    n_estimators=40,
    criterion="gini",
    max_depth=5,
    min_samples_split=10,
    min_samples_leaf=2,
    random_state=42,
)

# fit() returns the estimator itself, so training and prediction can be chained.
y_pred = model.fit(X_train, y_train).predict(X_test)

accuracy_score(y_test, y_pred)

0.8344827586206897

In [169]:
from pathlib import Path
from pickle import dump

# NOTE(review): the file name says "decision-tree", but `model` is a
# RandomForestClassifier. The path is left unchanged in case other code loads
# it by this name; consider renaming it.
model_path = Path("../models/decision-tree-1.0.pkl")

# open() raises FileNotFoundError if the target folder is missing (e.g. on a
# fresh checkout), so create it first; this is a no-op when it already exists.
model_path.parent.mkdir(parents=True, exist_ok=True)

# Serialise the trained model. Only unpickle this file from a trusted source:
# loading a pickle can execute arbitrary code.
with model_path.open("wb") as f:
    dump(model, f)