In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# importing libraries
import numpy as np
import pandas as pd

import seaborn as sns
import plotly.express as px
import matplotlib.ticker as mtick
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from sklearn.tree import DecisionTreeClassifier

from imblearn.combine import SMOTEENN # imbalanced-learn Python library,

import warnings
warnings.filterwarnings('ignore')

In [None]:
# to get a view of all the columns
pd.set_option("display.max_columns", None)

### About features available:
  #### 1.customerID - customer id
  #### 2. gender - client's gender (male / female)
  #### 3. SeniorCitizen - is the client retired (1-yes, 0-no)
  #### 4. Partner - is the client married (Yes, No)
  #### 5. tenure - how many months a person has been a client of the company
  #### 6. PhoneService - is the telephone service connected (Yes, No)
  #### 7. MultipleLines - are multiple phone lines connected (Yes, No, No phone service)
  #### 8. InternetService - client's Internet service provider (DSL, Fiber optic, No)
  #### 9. OnlineSecurity - is the online security service connected (Yes, No, No internet service)
  #### 10. OnlineBackup - is the online backup service activated (Yes, No, No internet service)
  #### 11. DeviceProtection - does the client have equipment insurance (Yes, No, No internet service)
  #### 12. TechSupport - is the technical support service connected (Yes, No, No internet service)
  #### 13. StreamingTV - is the streaming TV service connected (Yes, No, No internet service)
  #### 14. StreamingMovies - is the streaming cinema service activated (Yes, No, No internet service)
  #### 15. Contract - type of customer contract (Month-to-month, One year, Two year)
  #### 16. PaperlessBilling - whether the client uses paperless billing (Yes, No)
  #### 17. PaymentMethod - payment method (Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic))
  #### 18. MonthlyCharges - current monthly payment
  #### 19. TotalCharges - the total amount that the client paid for the services for the entire time
  #### 20.**Churn** - whether there was a churn (Yes or No)

In [None]:
# Importing data
dataset = pd.read_csv("../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv")
dataset.head()

In [None]:
dataset.Churn.value_counts() # pandas.core.series.Series

In [None]:
# Visualization of the class imbalance.
# Bar labels and bar heights are both taken from the same value_counts() Series so they
# can never be mismatched (unique() is ordered by first appearance, value_counts() by frequency).
churn_counts = dataset['Churn'].value_counts()
plt.bar(x=churn_counts.index, height=churn_counts.values)
plt.xlabel("Churn label")
plt.ylabel("Count")
plt.title("Churn class distribution")
plt.show()

In [None]:
# Wedge sizes and wedge labels come from the same value_counts() Series so they always line up
# (unique() is ordered by first appearance, value_counts() by frequency).
churn_counts = dataset['Churn'].value_counts()
plt.pie(churn_counts.values, labels=churn_counts.index, explode=[0.1, 0.5])
plt.legend()
plt.show()

##### Observations: The data is clearly imbalanced (More data of non-churners)
##### As the data is imbalanced, we shouldn't use "Accuracy" to measure the performance of the model. Accuracy gives a misleading measure: for example, if we just predict "No" for every example, we will still be correct the majority of the time.

## Data Description

In [None]:
# more understanding on data
dataset.info()

##### Observations: Most of the columns are of **object** type, with seemingly no missing values

In [None]:
# include='all' helps to describe all types of columns, (both categorical and numerical)
dataset.describe(include='all')

### Observations: using .info() and .describe() method we can observe that although 'TotalCharges' have been considered as object, but in reality it is numeric. So we must further investigate

In [None]:
dataset.TotalCharges.value_counts()
# Observation: There is one ' ' blank value, which has occurred 11 times; making the data type as object instead of float64

In [None]:
dataset[dataset['TotalCharges'] == ' ']
# rows with total Charges as blank(' ')

In [None]:
# Replace every blank (' ') TotalCharges entry with the MonthlyCharges value of the same row.
# (Replacing the blanks with 0.0 via a regex was an earlier alternative; it is not what runs here.)
print("Before removing blank values")
print(dataset[dataset['TotalCharges'] == ' '].index) 
# ^ index labels of the rows whose TotalCharges is exactly a single space

# Pattern used below: df['col1'] = np.where(df['col1'] == ' ', df['col2'], df['col1'])
# Only cells equal to ' ' are swapped; every other TotalCharges value is kept as it was.
# The column is converted to float in a later cell.
dataset['TotalCharges'] = np.where(dataset['TotalCharges'] == ' ', dataset['MonthlyCharges'], dataset['TotalCharges'])

# Re-run the same filter: the printed index should now be empty.
print("After removing full blank values")
print(dataset[dataset['TotalCharges'] == ' '].index)

In [None]:
dataset[dataset['TotalCharges'] == dataset['MonthlyCharges']]
# After replacing blanks

#### Converting 'TotalCharges'from categorical to float type

In [None]:
# we need to change the data type of variable 'Totalcharges' to float64
dataset.info() # still object
dataset['TotalCharges'] = dataset['TotalCharges'].astype(float)
dataset.info() # yes now it has changed to float64

# Understanding the Business Problem
### To get an understanding of data and give our analysis some direction we can come up with some basic questions as follows:
#### 1. How gender, partner and dependents are related to churn? 
#### 2. How tenure has an impact on churn?
#### 3. As the dataset is about telecom industry, we need some insights on phone and internet services!
#### 4. Does Tech support have any impact on churn?
#### 5. Is there any relationship of churn with monthly charges or total charges?
#### 6. How is the service for customers who opted for streaming?
#### 7. How contract is impacting business the most?

# Visualization 
#### Mostly using Plotly and Seaborn

### 1. How gender, partner and dependents are related to churn?

In [None]:
# The count for each churn categories in this path
dataset.groupby(['gender', 'Partner', 'Dependents', 'Churn']).size()

In [None]:
# instead of keeping multi index, or merged data, changing it to row-wise data to get a better view
dataset.groupby(['gender', 'Partner', 'Dependents', 'Churn']).size().reset_index(name='Count')

In [None]:
# visualizing the above info using plotly
# Treemap charts visualize hierarchical data using nested rectangles
# all the features here are categorical in nature
fig = px.treemap(dataset.groupby(['gender', 'Partner', 'Dependents', 'Churn']).size().reset_index(name='Count'), 
                 path=['gender', 'Partner', 'Dependents', 'Churn'], values='Count', color='Churn', 
                 title='1. How gender, partner and dependents are related to churn?')

fig.show()

#### Observation: Whether male or female, if they do not have partner or dependents, they are more likely to churn!!

### 2. Does tenure has any impact on churn?

In [None]:
dataset.groupby(['tenure', 'Churn']).size().reset_index(name='count')
# tenure is a numerical variable, and thus will have large number of unique values

In [None]:
# Here, reset_index is from pandas.Series.reset_index
# A rug plot is a plot of data for a single quantitative variable, displayed as marks along an axis. 
# Rug plots are used to visualise the distribution of the data

# A marginal distribution is the percentages out of totals

fig = px.histogram(dataset.groupby(['tenure', 'Churn']).size().reset_index(name='count'), 
                   x="tenure", y="count", color="Churn", marginal="rug", color_discrete_map={"Yes":"#E45756", "No":"#1CBE4F"}, 
                   title="2. Does tenure has any impact on churn?")

fig.show()

# Observation: customers with low tenure (0-10 months) have the highest rate of churning

##### Observation: Churning is highest during the first 0-10 months of tenure (tenure is measured in months). As customers stay longer, they might get habituated to using the same telecom service

### 3. As the dataset is about telecom industry, we need some insights on phone and internet services!

In [None]:
dataset.groupby(['Churn', 'PhoneService', 'InternetService']).size()

In [None]:
# Sunburst plots visualize hierarchical data spanning outwards radially from root to leaves
# the root starts from the center and children are added to the outer rings.

fig = px.sunburst(dataset.groupby(['Churn', 'PhoneService', 'InternetService']).size().reset_index(name='count'), 
                  path = ['Churn', 'PhoneService', 'InternetService'], values = 'count', 
                  title='3. As the dataset is about telecom industry, we need some insights on phone and internet services!')

fig.show()

##### Observations: People with Phone services (yes) and 'Fiber optic' Internet Service are churning more

### 4. Does Tech support have any impact on churn, considering the tenure?

In [None]:
np.unique(dataset.TechSupport)

In [None]:
# Let me take only "Yes" and "No" for TechSupport in consideration
data_techSupport_yes = dataset[dataset['TechSupport'] == 'Yes']
data_techSupport_no = dataset[dataset['TechSupport'] == 'No'] # type == pandas.core.frame.DataFrame
# same as --> dataset.loc[dataset['TechSupport'] == 'No',:]

#### Customers who took tech support

In [None]:
# for each tenure, whether Churned or not, and its corresponding count
data_techSupport_yes.groupby(['tenure', 'Churn']).size()

In [None]:
fig = px.histogram(data_techSupport_yes.groupby(['tenure', 'Churn']).size().reset_index(name='count'), 
                   x="tenure", y="count", marginal="rug", color="Churn", color_discrete_map={"Yes":"#E45756", "No":"#1CBE4F"}, 
                   title="Statistics of customers opted for tech support with churning")
# color="Churn" is vvi to mention, or else it will color 'count'
fig.show()

#### Customers who didn't take tech support

In [None]:
# for each tenure, whether Churned or not, and its corresponding count
data_techSupport_no.groupby(['tenure', 'Churn']).size()

In [None]:
# Histogram of churn by tenure for customers who did NOT take tech support.
# The frame is pre-aggregated to one row per (tenure, Churn) pair with its row count.
no_support_counts = data_techSupport_no.groupby(['tenure', 'Churn']).size().reset_index(name='count')

fig = px.histogram(
    no_support_counts,
    x='tenure',
    y='count',
    color='Churn',
    marginal='rug',
    color_discrete_map={"Yes": "#E45756", "No": "#1CBE4F"},
    # The title used to be copied from the "opted for tech support" plot; it now describes this subset.
    title="Statistics of customers who did not opt for tech support with churning",
)

fig.show()

In [None]:
fig = px.sunburst(dataset.groupby(['Churn', 'TechSupport', 'tenure']).size().reset_index(name='count'), 
                            path=['Churn', 'TechSupport', 'tenure'], values='count', 
                            title='Does customers opted for tech support stayed for longer tenure with less churn?')

fig.show()

##### Observations: more churning takes place in the first 10 months of tenure (highest in the very first month), for customers with or without tech support. But churning is higher for customers without tech support

### 5. Is there any relationship of churn with monthly charges or total charges?

#### About KDE
* kernel density estimate (KDE) plot is a method for visualizing the distribution of observations in a dataset, analagous to a histogram. 
* KDE represents the data using a continuous probability density curve in one or more dimensions.

In [None]:
sns.set(rc={'figure.figsize':(26, 8.27)}) # rc = matplotlib "runtime configuration" overrides; figure.figsize is (width, height) in inches
sns.kdeplot(data = dataset, x="MonthlyCharges", hue="Churn", multiple="stack").set(title="5. Is there any relationship of churn with monthly charges?")

##### Observations: As the monthly charges increase, we can see the density increasing too (60-120), which means more churning with increasing monthly charges

In [None]:
sns.set(rc={'figure.figsize':(26,8.27)})
sns.kdeplot(data=dataset, x="TotalCharges", hue="Churn", multiple="stack").set(title="Did customers' total charges relate with churn?")

##### Observation: It is quite opposite of what has been seen for monthly charges. Here high churning occurs when total charges is less,  0-2000 total charges have maximum churning

### 6. Is there any relation between churning and customers who opted for streaming?

In [None]:
dataset.groupby(['Churn', 'StreamingTV']).size()

In [None]:
ax = sns.barplot(x="StreamingTV", y="count", hue="Churn", data = dataset.groupby(['Churn', 'StreamingTV']).size().reset_index(name='count'), 
                 palette="Set2").set(title = "6. Is there any relation between churning and customers who opted for streaming?")

In [None]:
fig = px.sunburst(dataset.groupby(['Churn', 'InternetService', 'StreamingTV']).size().reset_index(name='count'), 
                            path=['Churn', 'InternetService', 'StreamingTV'], values='count', 
                            title='6. Is there any relation between churning and customers who opted for streaming?')

fig.show()

In [None]:
fig = px.sunburst(dataset.groupby(['Churn', 'StreamingTV']).size().reset_index(name='count'), 
                            path=['Churn', 'StreamingTV'], values='count', 
                            title='Do customers opted for streaming, faced issue with the service?')

fig.show()

##### Observation: Churning is being observed equally for the 'Yes', 'No' group of whether connected StreamingTv or not!

In [None]:
dataset.groupby(['Churn', 'StreamingMovies']).size()

In [None]:
# Plot the StreamingMovies counts per Churn class to see whether any relationship exists.
# The frame is pre-aggregated to one row per (Churn, StreamingMovies) pair with its row count.
streaming_movies_counts = dataset.groupby(['Churn', 'StreamingMovies']).size().reset_index(name="count")

# Keep the Axes itself in `ax` (previously `ax` held the return value of .set(), not the Axes).
ax = sns.barplot(x="Churn", y="count", hue="StreamingMovies", data=streaming_movies_counts, palette="Set2")
# The title used to be an empty string; give the figure a title so it stands on its own.
ax.set(title="Is there any relation between churning and customers who opted for streaming movies?");

##### Observation: Churning is being observed equally for both the 'Yes', 'No' group of StreamingMovies

### 7. How contract is impacting business?

In [None]:
dataset.groupby(['Churn', 'Contract']).size()

In [None]:
# plotting graph to get better understanding
fig = px.sunburst(dataset.groupby(['Churn', 'Contract']).size().reset_index(name='count'), 
                  path=['Churn', 'Contract'], values='count', 
                  title='7. How contract is impacting business?')

fig.show()

### Observations: clearly visible that customers with month-to-month contract are the highest churners

#### Senior Citizen vs Churning

In [None]:
# plotting graph to get better understanding
# 0 - not a senior citizen
# 1 - senior citizen
fig = px.sunburst(dataset.groupby(['Churn', 'SeniorCitizen']).size().reset_index(name='count'), 
                  path=['Churn', 'SeniorCitizen'], values='count', 
                  title='How being or non being SeniorCitizen is impacting Churning?')

fig.show()

In [None]:
# plotting graph to get better understanding
fig = px.sunburst(dataset.groupby(['Churn', 'MultipleLines']).size().reset_index(name='count'), 
                  path=['Churn', 'MultipleLines'], values='count', 
                  title='How having MultipleLines is impacting Churning?')

fig.show()
# Observation: Having (yes) multiple lines have almost equal impact as not having (No) multiple lines

### Observations Based on Visualizations
* Observation-1: Whether male or female, if they do not have partner or dependents, they are more likely to churn!!
* Observation-2: Churning is highest during the first 0-10 months of tenure (tenure is measured in months). As customers stay longer, they might get habituated to using the same telecom service
* Observations-3: People with Phone services (yes) and 'Fiber optic' Internet Service are churning more
* Observations-4: more churning takes place in the first 10 months of tenure, for customers with or without tech support. But churning is higher for customers without tech support
* Observations-5: As the monthly charges increase, we can see the density increasing too (60-120), which means more churning with increasing monthly charges
* Observation-6: It is quite opposite of what has been seen for monthly charges. Here high churning occurs in early phase itself, 0-2000 total charges have maximum churning
* Observation-7: Churning is being observed equally for the 'Yes', 'No' group of whether connected StreamingTv or not!
* Observation-8: Churning is being observed equally for both the 'Yes', 'No' group of StreamingMovies
* Observations-9: clearly visible that customers with month-to-month contract are the highest churners
* Observation-10: Most churners are non-senior citizens. Although we should also consider the fact that, data is more for non-senior citizens (5:1)

## Pandas Profiling
[Tutorial by Analytics Vidhya on Pandas Profiling](http://www.analyticsvidhya.com/blog/2021/06/generate-reports-using-pandas-profiling-deploy-using-streamlit/)

#### Pandas Profiling consists of 3 tabs:
* 1. Overview
* 2. Warnings
* 3. Reproduction

#### The "Overview" consists of overall statistics of our dataframe passed. This includes the 
* - number of variables (features or columns of the dataframe), 
* - Number of observations (rows of dataframe), 
* - Missing cells,  
* - Percentage of missing cells, 
* - Duplicate rows, 
* - Percentage of duplicate rows, 
* - Total size in memory.


#### The "Warnings" tab provides warning info regarding cardinality, correlation among variables (features), missing values, skewness in features, etc.

#### The "reproduction" tab is on meta data of report. It provides information like start time, end time of the report generation analysis, etc.

In [None]:
from pandas_profiling import ProfileReport

# generating the profile report by passing the dataframe object to the profiling function
profile = ProfileReport(dataset)
profile
# profile.to_file("Analysis.html") --> if analysis is required in html format

#### Observation: Most important piece of info that I can take into consideration is the Warning tab, showing correlations

## Missing Values

In [None]:
dataset.isnull().sum()

##### Observations: No missing values observed

In [None]:
import missingno as msno
msno.bar(dataset)
# All the features have complete data

## Data Pre-processing

#### Splitting data into dependent and independent variables

In [None]:
X = dataset.drop(['customerID', 'Churn'], axis=1)
y = dataset.Churn

In [None]:
# List of categorical columns
cat_cols = [col for col in X.columns if X[col].dtype == 'object'] + ['SeniorCitizen'] 
# although Senior Citizen got the 0,1 (int64) values, but basically it is a categorical column
print(cat_cols)

#### One Hot encoding of categorical columns

In [None]:
# One-hot encode the categorical columns into a dense array.
# handle_unknown='ignore' makes transform() emit all-zero indicators for categories not seen during fit.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output` - confirm against the installed version.
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
X_cat_encoded = pd.DataFrame(OH_encoder.fit_transform(X[cat_cols]), index=X.index)

# Re-attach the numeric columns that were not encoded.
X_num = X.drop(cat_cols, axis=1)
X_encoded = pd.concat([X_cat_encoded, X_num], axis=1)

# The one-hot columns carry integer labels (0, 1, 2, ...) while the numeric columns keep their
# string names. Cast every label to str so downstream estimators see a single column-name type
# (recent scikit-learn refuses DataFrames whose feature names mix int and str).
X_encoded.columns = X_encoded.columns.astype(str)

In [None]:
X_encoded.head()

In [None]:
X_encoded.shape
# number of features increased

#### Feature Scaling - MinMax Scaler

In [None]:
cols = X_encoded.columns # keeping the names of all columns

from sklearn import preprocessing

scaler = preprocessing.MinMaxScaler()
x_scaled = scaler.fit_transform(X_encoded)
X_encoded_scaled = pd.DataFrame(x_scaled, columns=cols)

#### Label encoding the target column

In [None]:
y #pandas.core.series.Series
# y[0] -- is valid; y['Churn'] is not valid for series data type 

In [None]:
print("Before encoding: ", y.unique())

y = np.where(y.str.contains("Yes"), 1, 0)

print("After encoding: ", y) # Now y is a numpy array

## Model Building

#### Finding Optimum k-value for k-nearest neighbour

In [None]:
# Finding out optimum k-value using elbow method
from sklearn.neighbors import KNeighborsClassifier
def find_k_KNN(x_train, x_test, y_train, y_test, max_k=40):
    """Plot the KNN error rate for k = 1 .. max_k - 1 (elbow method).

    For every candidate k a KNeighborsClassifier is fitted on (x_train, y_train) and the
    fraction of misclassified rows of (x_test, y_test) is recorded; the resulting curve is
    drawn on a new matplotlib figure.

    Parameters
    ----------
    x_train, y_train : training features and labels.
    x_test, y_test : held-out features and labels used to measure the error rate.
    max_k : int, default 40
        Exclusive upper bound of the k values tried (the default reproduces 1..39).

    Returns
    -------
    None. The function only draws the plot.

    Raises
    ------
    ValueError
        If max_k is smaller than 2, i.e. there would be no k value to evaluate.
    """
    if max_k < 2:
        raise ValueError("max_k must be at least 2 so that at least k=1 is evaluated")

    # NOTE(review): the error is measured on the split that is passed in as "test", so any k
    # picked from this plot has already seen that split - consider passing a validation split.
    k_candidates = range(1, max_k)

    # Error rate = share of held-out predictions that differ from the true label.
    error_rate = []
    for k in k_candidates:
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(x_train, y_train)
        error_rate.append(np.mean(knn.predict(x_test) != y_test))

    # Elbow graph: look for the k after which the error stops improving noticeably.
    plt.figure(figsize=(10,6))
    plt.plot(k_candidates, error_rate, color="green", linestyle="dashed", marker="o",
             markerfacecolor="red", markersize=10)
    plt.title("Error Rate vs. K Value")
    plt.xticks(k_candidates)
    plt.xlabel("K")
    plt.ylabel("Error Rate")

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn import svm





from sklearn.model_selection import GridSearchCV # grid search CV was taking hours to calculate all the combinations
from sklearn.model_selection import RandomizedSearchCV


def train_models(x_train, y_train, k_values, scoring=None):
    """Tune seven classifiers by cross-validated search and return the best estimator of each.

    Small search spaces (Logistic Regression, KNN, Naive Bayes, SVM) are searched exhaustively
    with GridSearchCV; the larger ones (Random Forest, XGBoost, Decision Tree) are sampled with
    RandomizedSearchCV(random_state=0). For every model the name, best estimator, best
    cross-validated score and best parameters are printed.

    Parameters
    ----------
    x_train, y_train : training features and labels handed to every search.
    k_values : list of int
        Candidate `n_neighbors` values for the KNN grid.
    scoring : str, callable or None, default None
        Forwarded to GridSearchCV / RandomizedSearchCV. None keeps each estimator's default
        scorer (accuracy for these classifiers); pass e.g. "f1" or "roc_auc" when the classes
        are imbalanced.

    Returns
    -------
    (models, model_names) : tuple of two lists
        `models[i]` is the refitted best estimator for `model_names[i]`.
    """
    # Tree hyper-parameters shared in shape (but kept as separate dicts) by the forest and the single tree.
    forest_space = {'n_estimators': [100, 300, 500, 600, 700, 1000], 'criterion': ['gini', 'entropy'],
                    'max_depth': [10, 20, 25, 30, 35, 40], 'min_samples_split': [100, 200, 50, 25]}
    tree_space = {'criterion': ['gini', 'entropy'], 'max_depth': [10, 20, 25, 30, 35, 40],
                  'min_samples_split': [100, 200, 50, 25]}
    xgb_space = {"learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30], "max_depth": [3, 4, 5, 6, 8, 10, 12, 15],
                 "min_child_weight": [1, 3, 5, 7], "gamma": [0.0, 0.1, 0.2, 0.3, 0.4],
                 "colsample_bytree": [0.3, 0.4, 0.5, 0.7], "n_estimators": [100, 120, 135, 150, 165, 200]}

    # One row per model: (display name, estimator, search space, search exhaustively?).
    # Keeping the four pieces together avoids three parallel lists that must stay index-aligned.
    search_specs = [
        ('Logistic Regression', LogisticRegression(penalty='l2'),
         {'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}, True),
        ('Random Forest', RandomForestClassifier(), forest_space, False),
        # tree_method='gpu_hist' / predictor='gpu_predictor' make XGBoost train on the GPU.
        ('Gradient Boosting Classifier',
         XGBClassifier(use_label_encoder=False, verbosity=0, eval_metric='logloss', tree_method='gpu_hist',
                       predictor='gpu_predictor'),
         xgb_space, False),
        ('KNN', KNeighborsClassifier(), {'n_neighbors': k_values}, True),
        ('Decision Tree', tree.DecisionTreeClassifier(), tree_space, False),
        ('Naive Bayes', GaussianNB(), {}, True),  # nothing to tune: the empty grid evaluates the defaults once
        ('Support Vector Machines', svm.SVC(), {'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}, True),
    ]

    models = []
    model_names = []

    for name, estimator, space, exhaustive in search_specs:
        if exhaustive:
            classifier = GridSearchCV(estimator=estimator, param_grid=space, scoring=scoring, n_jobs=-1)
        else:
            # A full grid over these spaces took hours, so only a random sample of combinations is tried.
            classifier = RandomizedSearchCV(estimator=estimator, param_distributions=space, scoring=scoring,
                                            random_state=0, n_jobs=-1)

        classifier.fit(x_train, y_train)

        print(name)
        print(classifier.best_estimator_)
        # best_score_ is the mean cross-validated score of the best candidate on the training data.
        print(f'Best train score: { classifier.best_score_}')
        print(classifier.best_params_)
        print("\n******************************************************************************************************************\n")

        # Keep the estimator carrying the best hyper-parameters found for this model.
        models.append(classifier.best_estimator_)
        model_names.append(name)

    return (models, model_names)

In [None]:
from sklearn.metrics import classification_report

def test_models(x_train, x_test, y_train, y_test, models, model_names):
    """Fit every model on the training split and print its held-out accuracy and classification report.

    Parameters
    ----------
    x_train, y_train : training features and labels; each model is (re)fitted on them.
    x_test, y_test : held-out features and labels used for the printed metrics.
    models : list of estimators exposing fit() and predict().
    model_names : list of str, display names paired with `models` by position.

    Returns
    -------
    None. Results are printed only.
    """
    for model, name in zip(models, model_names):
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)

        print(name)
        # Accuracy (in percent) from the predictions already computed; calling model.score()
        # would run a second, redundant prediction pass over x_test for the same number.
        print(f'{metrics.accuracy_score(y_test, y_pred)*100}')
        print("\n####################")
        print(classification_report(y_test, y_pred))
        print("\n\n******************************************************************************************************************\n\n")
    

### Model creation and prediction using imbalanced data

In [None]:
%%time
# Features and target must describe the same rows before splitting.
assert X_encoded_scaled.shape[0] == y.shape[0], "features and target have different row counts"

# Stratified train/test split (keeps the churn ratio in both parts).
# random_state pins the shuffle so the scores reported below are reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X_encoded_scaled, y, test_size=0.10, stratify=y, random_state=42
)

find_k_KNN(x_train, x_test, y_train, y_test) # decide using the pictorial graph

In [None]:
%%time
k_values = [2, 4, 6]

# for training the models to find the best estimators based on training score
models, model_names = train_models(x_train, y_train, k_values)


In [None]:
# for testing the models to find the best test score using the best estimated parameters for each algorithm
test_models(x_train, x_test, y_train, y_test, models, model_names)

In [None]:
# choosing the final model depending on best train and test scores

classifier = LogisticRegression(solver='liblinear')

classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)


In [None]:
# classification report for imbalanced data
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

### Dealing with data imbalance!!
* Choosing a learning algorithm that provide weights for every class
* Data-level approach:
    1. Under-sampling,
    2. over-sampling
    3. Cluster-based over sampling
    4. Synthetic minority over-sampling technique(SMOTE)
        * The authors of the technique recommend using SMOTE on the minority class, followed by an undersampling technique on the majority class.
            * SMOTE + Tomek Links (Tomek links help to identify pairs of nearest neighbors that have different classes; the majority-class member of each pair is then removed. This helps make the decision boundary less noisy.)

            * So, SMOTE is applied to oversample minority class and then Tomek links from majority classes are identified and removed (undersampling)

            * SMOTE + Edited NearestNeighbors (ENN (Edited Nearest Neighbors)), using k=3 nearest neighbors to find those examples in the dataset that have been misclassified and then remove them. It can be applied to only majority class examples or all the classes.


In [None]:
# Resample with SMOTEENN: SMOTE over-sampling followed by Edited Nearest Neighbours (ENN) cleaning.
# NOTE(review): fit_resample is applied to the full X_encoded / y, i.e. before any train/test split
# and on the unscaled features (X_encoded, not X_encoded_scaled). Any test set split off afterwards
# will contain resampled points - confirm this is intended before comparing scores.

from imblearn.combine import SMOTEENN
sm = SMOTEENN(random_state = 42) 
# sampling_strategy is left at its default ('auto'): SMOTE synthesises new minority-class samples
# (interpolated points, not exact copies) until the classes are balanced.
# ENN then removes samples whose nearest neighbours disagree with their label - see the imbalanced-learn docs for which classes are cleaned by default.
X_resample, y_resample = sm.fit_resample(X_encoded, y) # resampling using SMOTE + ENN

In [None]:
X_encoded.shape

In [None]:
X_resample.shape
# we can see first upsampling using SMOTE was done on minority class and then down sampling on majority class using ENN

In [None]:
np.unique(y, return_counts=True)

In [None]:
np.unique(y_resample, return_counts=True)

#### Feature Scaling: MinMaxScaler

In [None]:
cols = X_resample.columns # keeping the names of all columns

from sklearn import preprocessing

scaler = preprocessing.MinMaxScaler()
x_scaled = scaler.fit_transform(X_resample)
X_resample_scaled = pd.DataFrame(x_scaled, columns=cols)

### Model creation and predictions using balanced data (SMOTE + ENN)

In [None]:
# Train/test split of the resampled, re-scaled data.
# Stratified and seeded like the split of the imbalanced data, so both experiments are
# reproducible and keep the class ratio in each part.
# NOTE(review): X_resample_scaled / y_resample were resampled before this split, so the test part
# contains resampled points; its scores are optimistic compared with untouched hold-out data.
xr_train, xr_test, yr_train, yr_test = train_test_split(
    X_resample_scaled, y_resample, test_size=0.2, stratify=y_resample, random_state=42
)

find_k_KNN(xr_train, xr_test, yr_train, yr_test) # decide using the pictorial graph

In [None]:
k_values = [1]

# for training the models to find the best estimators based on training score
models, model_names = train_models(xr_train, yr_train, k_values)

In [None]:
# for testing the models to find the best test score using the best estimated parameters for each algorithm
test_models(xr_train, xr_test, yr_train, yr_test, models, model_names)

In [None]:
# choosing the final model depending on best test scores

classifier = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0.2, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=8,
              min_child_weight=1, missing=np.nan, monotone_constraints='()',
              n_estimators=165, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity = 0, eval_metric='logloss')

classifier.fit(xr_train, yr_train)
yr_pred = classifier.predict(xr_test)
# classifier.score(x_test, y_test)*100

In [None]:
print(classification_report(yr_test, yr_pred))

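## Final Result: Balancing the data clearly improved the model's reported performance.
##### Caveat: SMOTE + ENN was applied to the whole dataset before the train/test split, so the balanced-data test set contains resampled points. Its scores are therefore not directly comparable with the imbalanced-data scores above and should be confirmed on a hold-out set that was split off before resampling.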