In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# In order to achieve this we will need to import LabelBinarizer
from sklearn.preprocessing import LabelBinarizer

# Set figure size to (12,6)
plt.rcParams['figure.figsize'] = (12,6)



# Cross Validation

## 1) Define Business Goal

Build a model that can accurately classify the species of a penguin given its Culmen Length (mm), Culmen Depth (mm), Flipper Length (mm), Body Mass (g) and Sex.

**!!!The value chosen is arbitrary by Stefan!!!***<br>
The model will be helpful if it is able to predict 70% of the observations correctly. (Accuracy: 0.7)


## 2) Get the data

In [4]:
df = pd.read_csv('penguins_simple.csv', sep=';')
df.head()

Unnamed: 0,Species,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex
0,Adelie,39.1,18.7,181.0,3750.0,MALE
1,Adelie,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,36.7,19.3,193.0,3450.0,FEMALE
4,Adelie,39.3,20.6,190.0,3650.0,MALE


In [5]:
# Get rid of the Gentoos
df = df[df['Species'] != 'Gentoo']

In [6]:
# Inspect the shape of the DataFrame
df.shape

(214, 6)

## 3) Split the Data

Why are we doing this again?

- We want to make sure the model generalizes well to unseen data
- We want to prevent the model from overfitting on some random patterns in the training data
- Therefore we separate part of the data (the test data) and keep it locked up until we are done with our modelling process
- Calculating the evaluation metrics on the test data gives us an **estimate on how well the model is doing on unseen data/how well the model is able to generalize**

Based on the outcome of our model on the test data we decide whether we will go forward and actually use the model in practice (deploy the model).

In [7]:
# Assign X and y
X = df[['Flipper Length (mm)', 'Sex']]
y = df.Species

In [8]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Inspect the shapes
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((160, 2), (54, 2), (160,), (54,))

## 4) Exploratory Data Analysis

Remember to do the exploratory data analysis on the training data only.

In [9]:
# For the analysis we can merge X_train and y_train back together
df_train = pd.merge(y_train, X_train, left_index=True, right_index=True)
df_train.head()

Unnamed: 0,Species,Flipper Length (mm),Sex
29,Adelie,195.0,FEMALE
19,Adelie,180.0,MALE
185,Chinstrap,205.0,MALE
96,Adelie,183.0,FEMALE
164,Chinstrap,190.0,FEMALE


## 5) Feature Engineer

In [10]:
# Create a column transformer

column_transformer = ColumnTransformer([
    # (name, Transformation (eg. OneHotEncoder, passthrough), columns)
    ('pass', 'passthrough', ['Flipper Length (mm)']), # 'passthrough' will just take the specified column as it is
    ('label_encoder', OneHotEncoder(drop='if_binary'), ['Sex']) # Creates a dummy column (0 or 1)
])

In [11]:
# Fit and transform the training data

X_train_fe = pd.DataFrame(column_transformer.fit_transform(X_train), columns=X_train.columns)
X_train_fe.head()

Unnamed: 0,Flipper Length (mm),Sex
0,195.0,0.0
1,180.0,1.0
2,205.0,1.0
3,183.0,0.0
4,190.0,0.0


## 6) Train the model(s)

In [12]:
m = LogisticRegression()
m.fit(X_train_fe, y_train)

LogisticRegression()

In [15]:
from sklearn.tree import DecisionTreeClassifier



In [16]:
m_dt = DecisionTreeClassifier(max_depth=100)

In [17]:
m_dt.fit(X_train_fe, y_train)

DecisionTreeClassifier(max_depth=100)

## 7) Cross-Validation

Cross-validation helps us answering two distinct questions:

**1) Does our model overfit?**

Compare the training accuracy of your model with the validation accuracy of your model, both averaged over all k iterations. If they are (subjectively) close, we conclude that there is no (significant) overfitting.

**2) Which of the trained models should we select?**

Compare the validation accuracy between different models to choose the one with the best generalization performance.

In [19]:
# Import cross_validate

from sklearn.model_selection import cross_validate

In [22]:
cross_validate_lr = cross_validate(estimator=m, X=X_train_fe, y=y_train, scoring = 'accuracy',
                                   cv=5,
               return_train_score=True)

In [23]:
cross_validate_lr

{'fit_time': array([0.034, 0.023, 0.017, 0.017, 0.022]),
 'score_time': array([0.002, 0.003, 0.002, 0.004, 0.005]),
 'test_score': array([0.812, 0.625, 0.750, 0.750, 0.812]),
 'train_score': array([0.750, 0.789, 0.758, 0.758, 0.766])}

In [26]:
cross_validate_lr['test_score'].mean()

0.75

In [27]:
cross_validate_lr['train_score'].mean()

0.7640625

In [28]:
cross_validate_dt = cross_validate(estimator=m_dt, X=X_train_fe, y=y_train, scoring = 'accuracy',
                                   cv=5,
               return_train_score=True)

In [30]:
cross_validate_dt['test_score'].mean(), cross_validate_dt['train_score'].mean()

(0.73125, 0.7984375)

$$
Accuracy = \frac{TP + TN}{TP + FP + TN + FN}
$$

Training score and cross validation score are very close to each other. This suggests that the model is not overfitting.

It would be an indication of overfitting if the training score was higher thatn the cross validation score.

## 8) Calculate Test Score

In [None]:
# Transform the test data


In [None]:
# Inspect the accuracy score


## 9) Deploy the model