In [1]:
!pip install catboost
!pip install imbalanced-learn

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [2]:
!pip install dask[dataframe]

Collecting dask-expr<1.2,>=1.1 (from dask[dataframe])
  Downloading dask_expr-1.1.20-py3-none-any.whl.metadata (2.6 kB)
INFO: pip is looking at multiple versions of dask-expr to determine which version is compatible with other requirements. This could take a while.
  Downloading dask_expr-1.1.19-py3-none-any.whl.metadata (2.6 kB)
  Downloading dask_expr-1.1.18-py3-none-any.whl.metadata (2.6 kB)
  Downloading dask_expr-1.1.16-py3-none-any.whl.metadata (2.5 kB)
Downloading dask_expr-1.1.16-py3-none-any.whl (243 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m243.2/243.2 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dask-expr
Successfully installed dask-expr-1.1.16


In [3]:
import os
import sys
import pandas as pd
import numpy as np
import seaborn as ns
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook

### Feature Transformation Related Methods ###
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, ClassifierMixin
from imblearn.combine import SMOTEENN, SMOTETomek


### MachineLearning Models ###
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

import warnings
warnings.filterwarnings('ignore')

In [4]:
### Loading Csv Data in Dataframe ###

data = pd.read_csv("/content/Telco_Customer_Churn.csv")

### Printing Head ###

data.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [5]:
### TotalCharges Change Object to Float64 ###

data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors = 'coerce')

### Checking Null Values ###

data.isnull().sum()


Unnamed: 0,0
customerID,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
MultipleLines,0
InternetService,0
OnlineSecurity,0


In [6]:
from sklearn.impute import SimpleImputer

# Create an imputer
imputer = SimpleImputer(strategy='mean')  # Replace 'mean' with 'median' or 'most_frequent' if needed
data['TotalCharges'] = imputer.fit_transform(data[['TotalCharges']])


In [7]:
data.isnull().sum()


Unnamed: 0,0
customerID,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
MultipleLines,0
InternetService,0
OnlineSecurity,0


In [8]:
### Checking Duplicat Values ###

data.duplicated().sum()

0

In [9]:
### Remove customerID in DataFrame ###

data.drop('customerID', axis = 1, inplace = True)

In [10]:
### Checking data head customerID remove or not ###
data.head(2)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No


In [11]:
### X variable features ###

X = data.drop('Churn', axis = 1)

### Y variable target feature ###

y = data['Churn']

In [12]:
### Checking X and y shape
X.shape, y.shape

((7043, 19), (7043,))

### **1 Model Training With all Features** ###

In [13]:
### Data Transformation in Pipline and ColumnTransformer ###

### Spliting Numeric Features and Categorical Features ###

numeric = X.select_dtypes(include = 'number').columns.tolist()
categorical = X.select_dtypes(include = 'object').columns.tolist()

num_pipline = Pipeline([
    ('imputer', SimpleImputer(strategy = 'median')),#Fills in missing values using the median
    ('scaler', StandardScaler())#Standardizes the values to have a mean of 0 and standard deviation of 1.
])

cat_pipline = Pipeline([
    ('encoder', OrdinalEncoder())#Encodes categories into numerical values using OrdinalEncoder.
])

preprosser = ColumnTransformer([
    ('numeric', num_pipline, numeric),#Merges both pipelines (numeric and categorical) into a single ColumnTransformer.
    ('categorical', cat_pipline, categorical)#Ensures the correct transformations are applied to the corresponding feature types.
])
preprosser

In [14]:
## Fiting Preprosser Object in X Features ###

X_pre_transformed = preprosser.fit_transform(X)

### Checking X_pre_transformed ###

X_pre_transformed.shape

(7043, 19)

In [15]:
### Printing X_pre_transformed Values ###

X_pre_transformed

array([[-0.43991649, -1.27744458, -1.16032292, ...,  0.        ,
         1.        ,  2.        ],
       [-0.43991649,  0.06632742, -0.25962894, ...,  1.        ,
         0.        ,  3.        ],
       [-0.43991649, -1.23672422, -0.36266036, ...,  0.        ,
         1.        ,  3.        ],
       ...,
       [-0.43991649, -0.87024095, -1.1686319 , ...,  0.        ,
         1.        ,  2.        ],
       [ 2.27315869, -1.15528349,  0.32033821, ...,  0.        ,
         1.        ,  3.        ],
       [-0.43991649,  1.36937906,  1.35896134, ...,  2.        ,
         1.        ,  0.        ]])

In [16]:
### y target Value Encoding ###

le = LabelEncoder()
y_encoded = le.fit_transform(y)
y_encoded

### Checking y_encoded Shape ###
y_encoded.shape

(7043,)

In [17]:
smt = SMOTEENN()
X_resampled, y_resampled = smt.fit_resample(X_pre_transformed, y_encoded)
X_resampled.shape, y_resampled.shape

((6362, 19), (6362,))

In [18]:
### Evaluation Metrics ###
def evaluate_clf(true, predicted):
    acc = accuracy_score(true, predicted)
    f1 = f1_score(true, predicted)
    precision = precision_score(true, predicted)
    recall = recall_score(true, predicted)
    roc_auc = roc_auc_score(true, predicted)
    return acc, f1 , precision, recall, roc_auc

In [37]:


def evaluate_models(X, y, models, params):


    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

    models_list = []
    accuracy_list = []
    f1_list = []
    precision_list = []
    recall_list = []
    roc_auc_list = []

    for i in tqdm_notebook(range(len(list(models)))):
        model = list(models.values())[i]
        para=params[list(models.keys())[i]]
        model.fit(X_train, y_train) # Train model

        gs = GridSearchCV(model,para,cv=3)
        gs.fit(X_train,y_train)

        model.set_params(**gs.best_params_)
        model.fit(X_train,y_train)

        # Make predictions
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        # Training set performance
        model_train_accuracy, model_train_f1,model_train_precision,\
        model_train_recall,model_train_rocauc_score=evaluate_clf(y_train ,y_train_pred)


        # Test set performance
        model_test_accuracy,model_test_f1,model_test_precision,\
        model_test_recall,model_test_rocauc_score=evaluate_clf(y_test, y_test_pred)

        print(list(models.keys())[i])
        models_list.append(list(models.keys())[i])

        print('Model performance for Training set')
        print("- Accuracy: {:.4f}".format(model_train_accuracy))
        print('- F1 score: {:.4f}'.format(model_train_f1))
        print('- Precision: {:.4f}'.format(model_train_precision))
        print('- Recall: {:.4f}'.format(model_train_recall))
        print('- Roc Auc Score: {:.4f}'.format(model_train_rocauc_score))

        print('----------------------------------')

        print('Model performance for Test set')
        print('- Accuracy: {:.4f}'.format(model_test_accuracy))
        accuracy_list.append(model_test_accuracy)
        print('- F1 score: {:.4f}'.format(model_test_f1))
        f1_list.append(model_test_f1) # Append f1 score to the list
        print('- Precision: {:.4f}'.format(model_test_precision))
        precision_list.append(model_test_precision) # Append precision to the list
        print('- Recall: {:.4f}'.format(model_test_recall))
        recall_list.append(model_test_recall) # Append recall to the list
        print('- Roc Auc Score: {:.4f}'.format(model_test_rocauc_score))
        roc_auc_list.append(model_test_rocauc_score) # Append roc_auc_score to the list
        print('='*35)
        print('\n')

    # Create the report DataFrame using the lists of calculated values
    report = pd.DataFrame(list(zip(models_list, accuracy_list, f1_list, precision_list, recall_list, roc_auc_list)),columns=['Model Name','Accuracy','F1 score','Precision','Recall','Roc Auc Score']).sort_values(by=['Accuracy'], ascending=False)
    return report


In [38]:
### Define models ###

models = {
    'LogisticRegression': LogisticRegression(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'SVC': SVC(),
    'RandomForestClassifier': RandomForestClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'XGBClassifier': XGBClassifier(),
    'LGBMClassifier': LGBMClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'GaussianNB': GaussianNB()
}

### Define Params ###

param_grids = {
    'LogisticRegression': {
        "class_weight":["balanced"],
        'penalty': ['l1', 'l2'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'solver': ['liblinear', 'saga']
    },
    'KNeighborsClassifier': {
        'n_neighbors': [3, 5, 7, 9]
    },
    'SVC': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf', 'poly'],
        'gamma': ['scale', 'auto']
    },
    'RandomForestClassifier': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    },
    'GradientBoostingClassifier': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5],
        'min_samples_split': [2]
    },
    'AdaBoostClassifier': {
        'n_estimators': [50, 100],
        'learning_rate': [0.01, 0.1]
    },
    'DecisionTreeClassifier': {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2]
    },
    'GaussianNB': {},
    'XGBClassifier': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 6]
    },
    'LGBMClassifier': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'num_leaves': [31, 63]
    }
}

In [39]:
report = evaluate_models(X_resampled, y_resampled, models, param_grids)

  0%|          | 0/10 [00:00<?, ?it/s]

LogisticRegression
Model performance for Training set
- Accuracy: 0.9114
- F1 score: 0.9196
- Precision: 0.9274
- Recall: 0.9120
- Roc Auc Score: 0.9113
----------------------------------
Model performance for Test set
- Accuracy: 0.9081
- F1 score: 0.9174
- Precision: 0.9168
- Recall: 0.9181
- Roc Auc Score: 0.9068


KNeighborsClassifier
Model performance for Training set
- Accuracy: 0.9882
- F1 score: 0.9895
- Precision: 0.9822
- Recall: 0.9968
- Roc Auc Score: 0.9871
----------------------------------
Model performance for Test set
- Accuracy: 0.9654
- F1 score: 0.9697
- Precision: 0.9450
- Recall: 0.9958
- Roc Auc Score: 0.9616


SVC
Model performance for Training set
- Accuracy: 0.9857
- F1 score: 0.9872
- Precision: 0.9805
- Recall: 0.9940
- Roc Auc Score: 0.9846
----------------------------------
Model performance for Test set
- Accuracy: 0.9623
- F1 score: 0.9668
- Precision: 0.9484
- Recall: 0.9859
- Roc Auc Score: 0.9593


RandomForestClassifier
Model performance for Training

In [40]:
report

Unnamed: 0,Model Name,Accuracy,F1 score,Precision,Recall,Roc Auc Score
7,LGBMClassifier,0.969364,0.972593,0.967832,0.977401,0.968347
1,KNeighborsClassifier,0.965436,0.969739,0.94504,0.995763,0.961598
4,GradientBoostingClassifier,0.96465,0.968509,0.959778,0.977401,0.963037
6,XGBClassifier,0.963865,0.967742,0.961003,0.974576,0.962509
3,RandomForestClassifier,0.963079,0.96711,0.958391,0.975989,0.961446
2,SVC,0.962294,0.966759,0.94837,0.985876,0.95931
8,DecisionTreeClassifier,0.937942,0.944248,0.943583,0.944915,0.937059
5,AdaBoostClassifier,0.914375,0.925086,0.900937,0.950565,0.909796
0,LogisticRegression,0.908091,0.917431,0.916784,0.918079,0.906827
9,GaussianNB,0.897093,0.905552,0.92489,0.887006,0.89837


### **2. Model Training With 6 features** ###

In [49]:
X_2 = X[['gender', 'InternetService', 'Contract', 'tenure', 'MonthlyCharges', 'TotalCharges']]
y_2 =y

In [50]:
X_2.head(3)

Unnamed: 0,gender,InternetService,Contract,tenure,MonthlyCharges,TotalCharges
0,Female,DSL,Month-to-month,1,29.85,29.85
1,Male,DSL,One year,34,56.95,1889.5
2,Male,DSL,Month-to-month,2,53.85,108.15


In [51]:
y_2.head(3)

Unnamed: 0,Churn
0,No
1,No
2,Yes


In [52]:
### Data Transformation in Pipline and ColumnTransformer ###

### Spliting Numeric Features and Categorical Features ###

numeric_features = X_2.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_2.select_dtypes(include="object").columns.tolist()
### Numeric Pipline ###

num_pipline_1 = Pipeline([
    ('imputer', SimpleImputer(strategy = 'median')),
    ('scaler', StandardScaler())
])

### Categorical Pipline ###

cat_pipline_1 = Pipeline([
    ('encoder', OrdinalEncoder())
])

### ColumnTransformer Object ###
preprosser_1 = ColumnTransformer([
    ('numeric', num_pipline_1,numeric_features),
    ('categorical', cat_pipline_1,categorical_features)
])
preprosser_1

In [None]:
import pickle

In [68]:
pickle.dump(preprosser_1,open('preprosser_4.pkl','wb'))

In [53]:
## Fiting Preprosser Object in X Features ###

X_pre_transformed_1 = preprosser_1.fit_transform(X_2)

### Checking X_pre_transformed ###

X_pre_transformed_1.shape

(7043, 6)

In [54]:
### Printing X_pre_transformed Values ###

X_pre_transformed_1

array([[-1.27744458, -1.16032292, -0.99497138,  0.        ,  0.        ,
         0.        ],
       [ 0.06632742, -0.25962894, -0.17387565,  1.        ,  0.        ,
         1.        ],
       [-1.23672422, -0.36266036, -0.96039939,  1.        ,  0.        ,
         0.        ],
       ...,
       [-0.87024095, -1.1686319 , -0.85518222,  0.        ,  0.        ,
         0.        ],
       [-1.15528349,  0.32033821, -0.87277729,  1.        ,  1.        ,
         0.        ],
       [ 1.36937906,  1.35896134,  2.01391739,  1.        ,  1.        ,
         2.        ]])

In [55]:
### y target Value Encoding ###

le = LabelEncoder()
y_encoded_1 = le.fit_transform(y_2)
y_encoded_1

### Checking y_encoded Shape ###
#y_encoded.shape

array([0, 0, 1, ..., 0, 1, 0])

In [56]:
smt_1 = SMOTEENN()
X_resampled_1, y_resampled_1 = smt_1.fit_resample(X_pre_transformed_1, y_encoded_1)
X_resampled_1.shape, y_resampled_1.shape

((6136, 6), (6136,))

In [57]:
report_2 = evaluate_models(X_resampled_1, y_resampled_1, models, param_grids)

  0%|          | 0/10 [00:00<?, ?it/s]

LogisticRegression
Model performance for Training set
- Accuracy: 0.9061
- F1 score: 0.9117
- Precision: 0.8845
- Recall: 0.9407
- Roc Auc Score: 0.9049
----------------------------------
Model performance for Test set
- Accuracy: 0.9031
- F1 score: 0.9056
- Precision: 0.8771
- Recall: 0.9361
- Roc Auc Score: 0.9033


KNeighborsClassifier
Model performance for Training set
- Accuracy: 0.9945
- F1 score: 0.9947
- Precision: 0.9921
- Recall: 0.9972
- Roc Auc Score: 0.9944
----------------------------------
Model performance for Test set
- Accuracy: 0.9837
- F1 score: 0.9837
- Precision: 0.9773
- Recall: 0.9902
- Roc Auc Score: 0.9838


SVC
Model performance for Training set
- Accuracy: 0.9283
- F1 score: 0.9305
- Precision: 0.9298
- Recall: 0.9313
- Roc Auc Score: 0.9282
----------------------------------
Model performance for Test set
- Accuracy: 0.9381
- F1 score: 0.9380
- Precision: 0.9334
- Recall: 0.9426
- Roc Auc Score: 0.9381


RandomForestClassifier
Model performance for Training

In [59]:
report_2

Unnamed: 0,Model Name,Accuracy,F1 score,Precision,Recall,Roc Auc Score
1,KNeighborsClassifier,0.983713,0.983713,0.977346,0.990164,0.983755
3,RandomForestClassifier,0.982085,0.982114,0.974194,0.990164,0.982137
7,LGBMClassifier,0.982085,0.982114,0.974194,0.990164,0.982137
6,XGBClassifier,0.978827,0.97893,0.967949,0.990164,0.978901
4,GradientBoostingClassifier,0.973941,0.973899,0.969156,0.978689,0.973972
8,DecisionTreeClassifier,0.961726,0.961943,0.9504,0.97377,0.961804
2,SVC,0.938111,0.93801,0.933442,0.942623,0.93814
5,AdaBoostClassifier,0.908795,0.910686,0.886646,0.936066,0.908971
0,LogisticRegression,0.903094,0.90563,0.877112,0.936066,0.903308
9,GaussianNB,0.890065,0.894283,0.856072,0.936066,0.890363


In [58]:
report

Unnamed: 0,Model Name,Accuracy,F1 score,Precision,Recall,Roc Auc Score
7,LGBMClassifier,0.969364,0.972593,0.967832,0.977401,0.968347
1,KNeighborsClassifier,0.965436,0.969739,0.94504,0.995763,0.961598
4,GradientBoostingClassifier,0.96465,0.968509,0.959778,0.977401,0.963037
6,XGBClassifier,0.963865,0.967742,0.961003,0.974576,0.962509
3,RandomForestClassifier,0.963079,0.96711,0.958391,0.975989,0.961446
2,SVC,0.962294,0.966759,0.94837,0.985876,0.95931
8,DecisionTreeClassifier,0.937942,0.944248,0.943583,0.944915,0.937059
5,AdaBoostClassifier,0.914375,0.925086,0.900937,0.950565,0.909796
0,LogisticRegression,0.908091,0.917431,0.916784,0.918079,0.906827
9,GaussianNB,0.897093,0.905552,0.92489,0.887006,0.89837


BEST MODEL FOR STREAMLIT

In [60]:
X_resampled2=X_resampled_1
Y_resampled2=y_resampled_1


In [61]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled2,Y_resampled2,test_size=0.2,random_state=42)


In [62]:
model1=KNeighborsClassifier()
param_grid = {
    'n_neighbors': [3, 5, 7, 9]
}

In [63]:
model1.fit(X_train, y_train) # Train model

In [64]:
y_pred=model1.predict(X_test)

In [65]:
accuracy=accuracy_score(y_test,y_pred)
accuracy

0.9739413680781759

In [66]:
import pickle

In [67]:
pickle.dump(model1,open('model4.pkl','wb'))

## **3. MODEL-TRAINING WITH 4-FEATURES**

In [None]:
data.head(2)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No


In [None]:
X_3 = X[['gender','Contract','MonthlyCharges', 'TotalCharges']]
y_3 =y

In [None]:
X_3.head(3)

Unnamed: 0,gender,Contract,MonthlyCharges,TotalCharges
0,Female,Month-to-month,29.85,29.85
1,Male,One year,56.95,1889.5
2,Male,Month-to-month,53.85,108.15


In [None]:
y_3.head(3)

Unnamed: 0,Churn
0,No
1,No
2,Yes


In [None]:
### Data Transformation in Pipline and ColumnTransformer ###

### Spliting Numeric Features and Categorical Features ###

numeric_1 = X_3.select_dtypes(include=np.number).columns.tolist()
categorical_1 = X_3.select_dtypes(include="object").columns.tolist()
### Numeric Pipline ###

num_pipline_1 = Pipeline([
    ('imputer', SimpleImputer(strategy = 'median')),
    ('scaler', StandardScaler())
])

### Categorical Pipline ###

cat_pipline_1 = Pipeline([
    ('encoder', OrdinalEncoder())
])

### ColumnTransformer Object ###
preprosser_2 = ColumnTransformer([
    ('numeric', num_pipline_1,numeric_1),
    ('categorical', cat_pipline_1,categorical_1)
])
preprosser_2

In [None]:
## Fiting Preprosser Object in X Features ###

X_pre_transformed_2 = preprosser_2.fit_transform(X_3)

### Checking X_pre_transformed ###

X_pre_transformed_2.shape

(7043, 4)

In [None]:
### Printing X_pre_transformed Values ###

X_pre_transformed_2

array([[-1.16032292, -0.99497138,  0.        ,  0.        ],
       [-0.25962894, -0.17387565,  1.        ,  1.        ],
       [-0.36266036, -0.96039939,  1.        ,  0.        ],
       ...,
       [-1.1686319 , -0.85518222,  0.        ,  0.        ],
       [ 0.32033821, -0.87277729,  1.        ,  0.        ],
       [ 1.35896134,  2.01391739,  1.        ,  2.        ]])

In [None]:
### y target Value Encoding ###

le = LabelEncoder()
y_encoded_2 = le.fit_transform(y_3)
y_encoded_2

### Checking y_encoded Shape ###
#y_encoded.shape

array([0, 0, 1, ..., 0, 1, 0])

In [None]:
smt_1 = SMOTEENN()
X_resampled_2, y_resampled_2 = smt_1.fit_resample(X_pre_transformed_2, y_encoded_2)
X_resampled_2.shape, y_resampled_2.shape

((6040, 4), (6040,))

In [None]:
report_3 = evaluate_models(X_resampled_2, y_resampled_2, models, param_grids)

  0%|          | 0/10 [00:00<?, ?it/s]

LogisticRegression
Model performance for Training set
- Accuracy: 0.8820
- F1 score: 0.8882
- Precision: 0.8674
- Recall: 0.9100
- Roc Auc Score: 0.8812
----------------------------------
Model performance for Test set
- Accuracy: 0.8940
- F1 score: 0.8919
- Precision: 0.8742
- Recall: 0.9103
- Roc Auc Score: 0.8947


KNeighborsClassifier
Model performance for Training set
- Accuracy: 0.9911
- F1 score: 0.9914
- Precision: 0.9880
- Recall: 0.9948
- Roc Auc Score: 0.9910
----------------------------------
Model performance for Test set
- Accuracy: 0.9685
- F1 score: 0.9671
- Precision: 0.9705
- Recall: 0.9638
- Roc Auc Score: 0.9684


SVC
Model performance for Training set
- Accuracy: 0.9127
- F1 score: 0.9169
- Precision: 0.8991
- Recall: 0.9353
- Roc Auc Score: 0.9120
----------------------------------
Model performance for Test set
- Accuracy: 0.9156
- F1 score: 0.9128
- Precision: 0.9051
- Recall: 0.9207
- Roc Auc Score: 0.9158


RandomForestClassifier
Model performance for Training

In [None]:
report_3

Unnamed: 0,Model Name,Accuracy
3,RandomForestClassifier,0.969115
1,KNeighborsClassifier,0.964107
7,LGBMClassifier,0.961603
8,DecisionTreeClassifier,0.956594
6,XGBClassifier,0.95409
4,GradientBoostingClassifier,0.951586
2,SVC,0.912354
5,AdaBoostClassifier,0.894825
0,LogisticRegression,0.883139
9,GaussianNB,0.854758


In [None]:
report_2

Unnamed: 0,Model Name,Accuracy
3,RandomForestClassifier,0.982941
7,LGBMClassifier,0.978879
1,KNeighborsClassifier,0.974817
8,DecisionTreeClassifier,0.967506
6,XGBClassifier,0.965881
4,GradientBoostingClassifier,0.962632
2,SVC,0.910642
5,AdaBoostClassifier,0.894395
0,LogisticRegression,0.891958
9,GaussianNB,0.880585


In [None]:
report

Unnamed: 0,Model Name,Accuracy
1,KNeighborsClassifier,0.963195
7,LGBMClassifier,0.963195
6,XGBClassifier,0.962412
4,GradientBoostingClassifier,0.961629
3,RandomForestClassifier,0.956147
2,SVC,0.956147
8,DecisionTreeClassifier,0.934221
5,AdaBoostClassifier,0.90603
0,LogisticRegression,0.899765
9,GaussianNB,0.886453
