In [3]:
import warnings 
warnings.filterwarnings('ignore')

# Data handling
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

# Feature Processing (Scikit-learn processing, etc. )
from sklearn.metrics import mutual_info_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Machine Learning (Scikit-learn Estimators, Catboost, LightGBM, etc. )
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold, cross_val_score

# Hyperparameters Fine-tuning (Scikit-learn hp search, cross-validation, etc. )
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix

# Other packages
import os, joblib

## Feature Processing & Engineering
Here is the section to **clean**, **process** the dataset and **create new features**.

In [4]:
Data_All = pd.read_csv(r'Dataset\Train_Data.csv')
Data_All.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5034 entries, 0 to 5033
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            5034 non-null   object 
 1   SeniorCitizen     5034 non-null   object 
 2   Partner           5034 non-null   object 
 3   Dependents        5034 non-null   object 
 4   tenure            5034 non-null   int64  
 5   PhoneService      5034 non-null   object 
 6   MultipleLines     5034 non-null   object 
 7   InternetService   5034 non-null   object 
 8   OnlineSecurity    5034 non-null   object 
 9   OnlineBackup      5034 non-null   object 
 10  DeviceProtection  5034 non-null   object 
 11  TechSupport       5034 non-null   object 
 12  StreamingTV       5034 non-null   object 
 13  StreamingMovies   5034 non-null   object 
 14  Contract          5034 non-null   object 
 15  PaperlessBilling  5034 non-null   object 
 16  PaymentMethod     5034 non-null   object 


In [5]:
Data_All.describe()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges
count,5034.0,5034.0,5034.0
mean,32.62058,65.107251,2300.954758
std,24.511015,30.068019,2268.346402
min,1.0,18.4,18.799999
25%,9.0,35.799999,417.662498
50%,29.0,70.599998,1401.0
75%,56.0,90.050003,3860.599976
max,72.0,118.650002,8670.1


In [6]:
Data_All.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,No,Yes,No,1,No,No,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,No,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.950001,1889.5,No
2,Male,No,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.849998,108.150002,Yes
3,Male,No,No,No,45,No,No,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.299999,1840.75,No
4,Female,No,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.699997,151.649994,Yes


In [7]:
# check unique values of each column
for column in Data_All.columns:
    print('Column: {} - Unique Values: {}'.format(column, Data_All[column].unique()))

Column: gender - Unique Values: ['Female' 'Male']
Column: SeniorCitizen - Unique Values: ['No' 'Yes']
Column: Partner - Unique Values: ['Yes' 'No']
Column: Dependents - Unique Values: ['No' 'Yes']
Column: tenure - Unique Values: [ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26 39]
Column: PhoneService - Unique Values: ['No' 'Yes']
Column: MultipleLines - Unique Values: ['No' 'Yes' 'No phone service']
Column: InternetService - Unique Values: ['DSL' 'Fiber optic' 'No']
Column: OnlineSecurity - Unique Values: ['No' 'Yes' 'No internet service']
Column: OnlineBackup - Unique Values: ['Yes' 'No' 'No internet service']
Column: DeviceProtection - Unique Values: ['No' 'Yes' 'No internet service']
Column: TechSupport - Unique Values: ['No' 'Yes' 'No internet service']
Column: StreamingTV - Unique Values: ['No' 'Yes' 'No internet se

In [8]:
# Lets find out feature dependency on the target variable using mutual information score

x_cat = Data_All.select_dtypes(include=object).drop('Churn', axis=1)
y_cat = Data_All['Churn']

mi_scores = []

#  loop to calculate the Mutual Information Score for each categorical feature 
#  with respect to the 'Churn' target variable

for column in x_cat.columns:
    mi_score = mutual_info_score(x_cat[column], y_cat)
    mi_scores.append((column, mi_score))

# sort features by their importance dependency on the target variable, 
# with the most important ones at the top.

mi_scores.sort(key=lambda x: x[1], reverse=True)

for feature, score in mi_scores:
    print(f"Feature: {feature}, Mutual Information Score: {score}")

Feature: Contract, Mutual Information Score: 0.10284790516675524
Feature: InternetService, Mutual Information Score: 0.05944428811167132
Feature: PaymentMethod, Mutual Information Score: 0.04163984913943765
Feature: TechSupport, Mutual Information Score: 0.030007109249305178
Feature: OnlineSecurity, Mutual Information Score: 0.029670157336360276
Feature: PaperlessBilling, Mutual Information Score: 0.019508874345666505
Feature: OnlineBackup, Mutual Information Score: 0.015826778687180174
Feature: Dependents, Mutual Information Score: 0.015615368107435551
Feature: DeviceProtection, Mutual Information Score: 0.013717699147021103
Feature: StreamingMovies, Mutual Information Score: 0.013349078621584257
Feature: SeniorCitizen, Mutual Information Score: 0.010389723457655958
Feature: Partner, Mutual Information Score: 0.010018147440621322
Feature: StreamingTV, Mutual Information Score: 0.009953736504141789
Feature: MultipleLines, Mutual Information Score: 0.0008281912846177519
Feature: PhoneSe

In [9]:
Data_All.drop(columns=['gender','PhoneService', 'MultipleLines'], inplace=True)

### Dataset Splitting

In [10]:
X = Data_All.drop(columns=['Churn'])
y = Data_All['Churn']

In [11]:
(X.shape, y.shape)

((5034, 16), (5034,))

### Label Encoding

In [12]:
# Encode the target variable (Churn) to have 0 or 1 instead of No or Yes

labelEncoder = LabelEncoder()

y = labelEncoder.fit_transform(y)

### Features encoding & scaling

In [13]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5034 entries, 0 to 5033
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   SeniorCitizen     5034 non-null   object 
 1   Partner           5034 non-null   object 
 2   Dependents        5034 non-null   object 
 3   tenure            5034 non-null   int64  
 4   InternetService   5034 non-null   object 
 5   OnlineSecurity    5034 non-null   object 
 6   OnlineBackup      5034 non-null   object 
 7   DeviceProtection  5034 non-null   object 
 8   TechSupport       5034 non-null   object 
 9   StreamingTV       5034 non-null   object 
 10  StreamingMovies   5034 non-null   object 
 11  Contract          5034 non-null   object 
 12  PaperlessBilling  5034 non-null   object 
 13  PaymentMethod     5034 non-null   object 
 14  MonthlyCharges    5034 non-null   float64
 15  TotalCharges      5034 non-null   float64
dtypes: float64(2), int64(1), object(13)
memory

In [14]:
# Identify numeric and non-numeric columns
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

In [15]:
num_cols

['tenure', 'MonthlyCharges', 'TotalCharges']

In [16]:
cat_cols

['SeniorCitizen',
 'Partner',
 'Dependents',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod']

In [17]:
# Define transformers for numerical and categorical columns
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ])

# Create the pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit and transform the data
X_processed = pipeline.fit_transform(X)

# Extracting feature names for numerical columns
num_feature_names = num_cols

# Extracting feature names for categorical columns after one-hot encoding
cat_encoder = pipeline.named_steps['preprocessor'].named_transformers_['cat'].named_steps['onehot']
cat_feature_names = cat_encoder.get_feature_names_out(cat_cols)

# Concatenating numerical and categorical feature names
feature_names = num_feature_names + list(cat_feature_names)

# Convert X_processed to DataFrame
X_processed_df = pd.DataFrame(X_processed, columns=feature_names)

# Display DataFrame
X_processed_df

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,SeniorCitizen_No,SeniorCitizen_Yes,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,InternetService_DSL,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,-1.290184,-1.172700,-1.001315,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.056283,-0.271320,-0.181408,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,-1.249382,-0.374430,-0.966794,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.505106,-0.758597,-0.202901,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,-1.249382,0.186022,-0.947615,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5029,-0.351737,0.655005,-0.136878,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
5030,1.606761,1.267012,2.231779,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
5031,-0.882164,-1.181015,-0.861729,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
5032,-1.167778,0.309088,-0.879298,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


## Train set Balancing (SMOTE Algorithm)

SMOTE (Synthetic Minority Over-sampling Technique) is a method used to address class imbalance in a binary classification problem. 

Earlier we realised that our target vaiable has a class imbalance. One class (the minority class) has significantly fewer instances than the other class (the majority class). This imbalance can negatively impact the performance of machine learning models, as they might become biased toward the majority class.

SMOTE will aim to balance the class distribution by generating synthetic samples until the minority class has the same number of instances as the majority class. By creating synthetic samples, SMOTE helps the model better capture the patterns in the minority class and prevents it from favoring the majority class due to the imbalance. 

In [18]:
# apply SMOTE to the training data (oversampling)

smote = SMOTE(random_state=42, k_neighbors=5, sampling_strategy='auto')

X_resampled, y_resampled = smote.fit_resample(X_processed_df, y) ############

#### Train-test split

In [19]:
# Split the data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

In [20]:
# check shape after resampling

pd.DataFrame(X_train).shape, pd.DataFrame(y_train).shape

((5916, 39), (5916, 1))

In [21]:
# view class distribution

pd.value_counts(pd.Series(y_train))

0    2958
1    2958
Name: count, dtype: int64

Our train dataset is now balaced

## Machine Learning Modeling 
Here is the section to **build**, **train**, **evaluate** and **compare** the models to each others.

### Model 1. Logistic Regression Model

#### Create the Model

In [22]:
LR = LogisticRegression(random_state=42)

#### Train the Model

In [23]:
LR.fit(X_train, y_train)

### Model 2. K-nearest Neighbors

#### Create the Model

In [24]:
knn = KNeighborsClassifier()

#### Train the Model

In [25]:
knn.fit(X_train, y_train)

### Model 3. Random Forest Classifier

#### Create the Model

In [26]:
rfm = RandomForestClassifier(random_state=42)

#### Train the Model

In [27]:
rfm.fit(X_train, y_train)

### Model 4. Support Vector Machines

#### Create the Model

In [28]:
svm = SVC(random_state=42)

#### Train the Model

In [29]:
svm.fit(X_train, y_train)

### Model 5. Gradient Boosting

#### Create the Model

In [30]:
gb = GradientBoostingClassifier(random_state=42)

#### Train the Model

In [31]:
gb.fit(X_train, y_train)

### Model 6. XGBoost

#### Create the Model

In [32]:
xgb = XGBClassifier(random_state=42)

#### Train the Model

In [33]:
xgb.fit(X_train, y_train)

## Model Evaluation
We create a pandas dataframe that will allow us to compare our models.

#### K-Fold Cross-Validation

k-fold cross-validation  estimates the performance on our models across multiple subsets of the data (k-folds), providing a comprehensive evaluation of their generalization ability. The model is trained and evaluated k times, with each fold serving as the validation set once. This process helps estimate the model's performance across different subsets of the data.

It helps estimate how well a model will perform on new, unseen data and provides insights into its stability and consistency.

In [34]:
# Create a dataframe with the K-fold Cross-Validation results

models = [
    ('Logistic Regression', LR),
    ('Random Forest', rfm),
    ('SVM', svm),
    ('Gradient Boosting', gb),
    ('XGBoost', xgb)
]

# number of k-folds
k = 5

results = []

for name, model in models:
    kf = KFold(n_splits=k, shuffle=True, random_state=42)  # Create a KFold object
    scores = cross_val_score(model, X_train, y_train, cv=kf, scoring='accuracy')

    # Append results to the list
    results.append((name, scores.mean(), scores.std()))

results_df = pd.DataFrame(results, columns=['Model', 'Mean Accuracy', 'Std Deviation'])

results_df.sort_values(by='Mean Accuracy', ascending=False)

Unnamed: 0,Model,Mean Accuracy,Std Deviation
1,Random Forest,0.849899,0.012651
4,XGBoost,0.844658,0.005607
3,Gradient Boosting,0.839587,0.014327
2,SVM,0.806117,0.00933
0,Logistic Regression,0.77569,0.0135


The output of our k-fold cross-validation is the mean accuracy and std deviation.

1. **Average Accuracy** is the mean across all k folds during the cross-validation process. Higher mean accuracy values indicate better predictive performance.
2. **Standard Deviation** measures the variability or spread of accuracy values across the k folds. A lower standard deviation suggests that the model's performance is consistent across different subsets of the data (folds), while a higher standard deviation indicates that the model's performance varies more widely. Smaller standard deviations are generally desirable because they indicate a more stable model.

The **Random Forest** model has the highest mean accuracy (0.8507) **85%** among the evaluated models. This means that, on average, the model correctly predicted the target variable for about 85% of the data points in each fold. It performs well on average across different folds, and it has a relatively low standard deviation (0.0087), indicating consistent performance.

#### Classification Report

In [35]:
model_names = ['Logistic Regression', 'k-NN', 'Random Forest', 'SVM', 'Gradient Boosting', 'XGBoost']
models = [LR, knn, rfm, svm, gb, xgb] # our trained models
model_names_list = []
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Loop through each model to calculate metrics and store information
for name, model in zip(model_names, models):
    # Make predictions on the test data
    y_pred = model.predict(X_test)
    
    # Calculate accuracy, precision, recall, and F1-score
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # Store model name and metrics
    model_names_list.append(name)
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

# Create a DataFrame with the calculated metrics
metrics_df = pd.DataFrame({
    'Model': model_names_list,
    'Accuracy': accuracy_scores,
    'Precision': precision_scores,
    'Recall': recall_scores,
    'F1-Score': f1_scores
})

# Display the DataFrame
metrics_df.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
2,Random Forest,0.860135,0.852048,0.871622,0.861723
5,XGBoost,0.852703,0.85462,0.85,0.852304
4,Gradient Boosting,0.841216,0.824134,0.867568,0.845293
3,SVM,0.798649,0.77904,0.833784,0.805483
1,k-NN,0.791892,0.734783,0.913514,0.814458
0,Logistic Regression,0.777703,0.754647,0.822973,0.78733


- The random forest model is our highest performing model with an accuracy of 0.860135 / **86%**.

**Accuracy**: Accuracy is a measure of the overall correctness of predictions made by the model. It indicates the proportion of correctly classified instances out of the total number of instances.

**Precision**: Precision is a metric that measures the proportion of true positive predictions (*correctly predicted positive instances*) out of all instances predicted as positive. It assesses the model's ability to avoid false positives.

**Recall**: Recall, also known as *sensitivity* or true positive rate, measures the proportion of true positive predictions out of all actual positive instances. It assesses the model's ability to capture all positive instances.

**F1-Score**: The F1-Score is the harmonic mean of precision and recall. It provides a balanced measure that considers both false positives and false negatives. It is particularly useful when dealing with imbalanced datasets.

- Our top 3 models (Random Forest, XGBoost and Gradient Boosting) are all tree based models, specifically ensemble learning techniques that combine multiple individual trees to improve overall performance and robustness. They reduce overfitting by averaging or boosting the individual trees' predictions. They offer a combination of powerful features that make them robust, accurate, and versatile for classification tasks across a wide range of domains and data characteristics.

## Hyperparameters tuning 

We will Fine-tune our top model using a ` RandomizedSearchCV`  (that is in sklearn.model_selection
) to find the best hyperparameters and achieve the maximum performance of the model

#### 1. Tuning Model 1 (Random Forest)

In [36]:
# Check current model parameters

current_params = rfm.get_params()
current_params

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [37]:
# Random Forest tuning

# Define the parameter distributions for hyperparameter tuning
param_grid = {
  'n_estimators': [20, 50, 100, 200, 300],
  'max_depth': [None, 10, 15, 20, 25],
  'min_samples_split': [2, 3, 4, 5, 6],
  'min_samples_leaf': [1, 2, 3, 4, 5],
  'class_weight': ['balanced', None],
  'max_features': ['auto', 'sqrt', 'log2'],
  'criterion': ['gini', 'entropy']
}

# Initialize RandomizedSearchCV with the RandomForestClassifier model and parameter distributions
random_search_rf = RandomizedSearchCV(estimator=rfm, param_distributions=param_grid, 
                                      scoring='accuracy', n_iter=150, random_state=42,
                                      cv=5, n_jobs=-1, verbose = 1)

# fit best estimator on train data
random_search_rf.fit(X_train, y_train)

# best parameters
best_params = random_search_rf.best_params_

# mean accuracy score of the best estimator
best_score = random_search_rf.best_score_

best_params

Fitting 5 folds for each of 150 candidates, totalling 750 fits


{'n_estimators': 100,
 'min_samples_split': 4,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 20,
 'criterion': 'entropy',
 'class_weight': None}

In [38]:
# mean accuracy score of the best estimator

best_score

0.8519272177926023

In [39]:
# Fit tuned model on train data

tuned_rf_model = random_search_rf.best_estimator_
tuned_rf_model.fit(X_train, y_train)

# make the predictions
random_search_rf_pred = tuned_rf_model.predict(X_test)
original_rf_model = rfm.predict(X_test)

Let's compare the tuned model and the original model performance

In [40]:
# Calculate the classification report

report = classification_report(y_test, original_rf_model, output_dict=True)
report_2 = classification_report(y_test, random_search_rf_pred, output_dict=True)

# Extract precision, recall, f1-score, and accuracy metrics for both classes

precision = ((report['1']['precision'] + report['0']['precision'])) / 2
recall = ((report['1']['recall'] + report['0']['recall'])) / 2
f1 = ((report['1']['f1-score'] + report['0']['f1-score'])) / 2

precision2 = ((report_2['1']['precision'] + report_2['0']['precision'])) / 2
recall2 = ((report_2['1']['recall'] + report_2['0']['recall'])) / 2
f12 = ((report_2['1']['f1-score'] + report_2['0']['f1-score'])) / 2

# Create dictionaries for metrics
metrics_original_Random_Forest = {
    'Total Precision': precision,
    'Total Recall': recall,
    'Total F1-Score': f1,
    'Accuracy': report['accuracy']
}

metrics_Tuned_Random_Forest = {
    'Total Precision': precision2,
    'Total Recall': recall2,
    'Total F1-Score': f12,
    'Accuracy': report_2['accuracy']
}

# Create DataFrames from the metrics dictionaries
metrics_df_original = pd.DataFrame(metrics_original_Random_Forest, index=['Original Random Forest'])
metrics_df_tuned = pd.DataFrame(metrics_Tuned_Random_Forest, index=['Tuned Random Forest'])

# Concatenate the DataFrames vertically to combine the metrics
combined_metrics_df = pd.concat([metrics_df_original, metrics_df_tuned])

combined_metrics_df

Unnamed: 0,Total Precision,Total Recall,Total F1-Score,Accuracy
Original Random Forest,0.860325,0.860135,0.860117,0.860135
Tuned Random Forest,0.858779,0.858108,0.858042,0.858108


In [41]:
# construct the confusion matrix for the best model
confusion_matrix_rf = confusion_matrix(y_test, original_rf_model)

confusion_matrix_rf

array([[628, 112],
       [ 95, 645]], dtype=int64)

- 628 instances were correctly classified as True Negatives (TN).
- 645 instances were correctly classified as True Positives (TP).
- 112 instances were classified as False Positives (FP).
- 95 instances were classified as False Negatives (FN).

## Export key components
Here is the section to **export** the important ML objects that will be use to develop an app: *Encoder, Scaler, ColumnTransformer, Model, Pipeline, etc*.

In [42]:
destination = "toolkit"

# Create a directory if it doesn't exist
if not os.path.exists(destination):
    os.makedirs(destination)

# Create a dictionary to store the objects and their filenames
models = {
    "pipeline": pipeline,
    "model": rfm
}

# Loop through the models and save them using joblib.dump()
for name, model in models.items():
    file_path = os.path.join(destination, f"{name}.joblib")
    joblib.dump(model, file_path)
