---
# Validation of best model
---
In this notebook, ...


## Results

---

### Importing necessary library

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import joblib

### Read data file

In [14]:
churn_df = pd.read_excel('../data/churn_cleaned_featEng.xlsx')
churn_df

Unnamed: 0,Tenure Months,Monthly Charges,Churn Value,Senior Citizen_Yes,Partner_Yes,Dependents_Yes,Internet Service_Fiber optic,Internet Service_No,Online Security_Yes,Online Backup_Yes,Device Protection_Yes,Tech Support_Yes,Contract_One year,Contract_Two year,Paperless Billing_Yes,Payment Method_Credit card (automatic),Payment Method_Electronic check,Payment Method_Mailed check
0,2,53.85,1,0,0,0,0,0,1,1,0,0,0,0,1,0,0,1
1,2,70.70,1,0,0,1,1,0,0,0,0,0,0,0,1,0,1,0
2,8,99.65,1,0,0,1,1,0,0,0,1,0,0,0,1,0,1,0
3,28,104.80,1,0,1,1,1,0,0,0,1,1,0,0,1,0,1,0
4,49,103.70,1,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,72,21.15,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0
7039,24,84.80,0,0,1,1,0,0,1,0,1,1,1,0,1,0,0,1
7040,72,103.20,0,0,1,1,1,0,0,1,1,0,1,0,1,1,0,0
7041,11,29.60,0,0,1,1,0,0,1,0,0,0,0,0,1,0,1,0


---

<center>
    
## Preparing data

</center>

---

### Separate X and y features

In [15]:
# Seperate X and y features
X = churn_df.drop(columns=['Churn Value'])
y = churn_df['Churn Value']

### Split dataset (training/testing/validation)

In [16]:
# Separating the dataset into a training dataset (80%) and testing+validation (20%) dataset
X_train, X_test_validation, y_train, y_test_validation = train_test_split(X, y, train_size=0.8, random_state=5)

# Separating the testing+valisation dataset into a testing dataset (10%) and a validation dataset (10%) 
X_val, X_test, y_val, y_test = train_test_split(X_test_validation, y_test_validation, test_size=0.5, random_state=5)

### Converting data subset to dataframe 

In [17]:
X_val = pd.DataFrame(X_val, columns=X.columns)

### Scale X features

In [20]:
# Create instance of scaler
scaler = StandardScaler()

# Scale the data
X_val_scaled = scaler.transform(X_val)

---

<center>
    
## Using best model on validation dataset

</center>

---

### Loading the best model

In [5]:
# Load the model and preprocessors (if saved)
model = joblib.load("best_model.joblib")
converter = joblib.load("best_model_converter.joblib")
scaler = joblib.load("best_model_scaler.joblib")

### Converting to the validation dataset to the model's features 

In [None]:
new_data = converter.transform(X_val_scaled)

### Applying model's scaling to the validation dataset

In [None]:
new_data_scaled = scaler.transform(new_data)

### Using the model to make a prediction on the validation dataset

In [None]:
y_pred = model.predict(new_data_scaled)

Predicted price: [2221.98549831 1461.05463461 2715.19088823 2269.74765864 1845.46210163]


---

<center>
    
## Evaluating best model's performance

</center>

---

In [25]:
print('Model accuracy score: {0:0.4f}'. format(accuracy_score(y_val, y_pred)))


Model accuracy score: 0.7617


In [26]:
# Compare train and test scores
print('Test set score: {:.4f}'.format(model.score(new_data_scaled, y_val)))

Training set score: 0.8653
Test set score: 0.7617


In [27]:
# Confusion Matrix

cm = confusion_matrix(y_val, y_pred)

print('Confusion matrix\n\n', cm)

print('\nTrue Positives(TP) = ', cm[0,0])

print('\nTrue Negatives(TN) = ', cm[1,1])

print('\nFalse Positives(FP) = ', cm[0,1])

print('\nFalse Negatives(FN) = ', cm[1,0])

Confusion matrix

 [[433  87]
 [ 81 104]]

True Positives(TP) =  433

True Negatives(TN) =  104

False Positives(FP) =  87

False Negatives(FN) =  81


In [28]:
# Classification report

print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.83      0.84       520
           1       0.54      0.56      0.55       185

    accuracy                           0.76       705
   macro avg       0.69      0.70      0.70       705
weighted avg       0.76      0.76      0.76       705



---

<center>
    
##  Best model's performance

</center>

---