<a href="https://colab.research.google.com/github/CoolerKula/Imbalanced-Classification---Independent-Project/blob/main/Imbalanced_Classification_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

###**Problem Statement**

Beta Bank customers are leaving: little by little, chipping away every month. The bankers
figured out it’s cheaper to save the existing customers rather than to attract new ones.
We need to predict whether a customer will leave the bank soon. You have the data on
clients’ past behavior and termination of contracts with the bank.
Build a model with the maximum possible F1 score. To pass the project, you need an F1
score of at least 0.59. Check the F1 for the test set.
Additionally, measure the AUC-ROC metric and compare it with the F1.
1. Download and prepare the data. Explain the procedure.
2. Examine the balance of classes. Train the model without taking into account the
imbalance. Briefly describe your findings.
3. Improve the quality of the model. Make sure you use at least two approaches to
fixing class imbalance. Use the training set to pick the best parameters. Train
different models on training and validation sets. Find the best one. Briefly
describe your findings.
4. Perform the final testing.

In [None]:
#import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report,confusion_matrix
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score

In [None]:
# Import Data
data = pd.read_csv("https://bit.ly/2XZK7Bo")
data

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2.0,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1.0,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8.0,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1.0,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2.0,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5.0,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10.0,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7.0,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3.0,75075.31,2,1,0,92888.52,1


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           9091 non-null   float64
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(3), int64(8), object(3)
memory usage: 1.1+ MB


In [None]:
data.isnull().sum()

RowNumber            0
CustomerId           0
Surname              0
CreditScore          0
Geography            0
Gender               0
Age                  0
Tenure             909
Balance              0
NumOfProducts        0
HasCrCard            0
IsActiveMember       0
EstimatedSalary      0
Exited               0
dtype: int64

In [None]:
data1 = data.drop(['Tenure', 'Surname'], axis=1)
data1

Unnamed: 0,RowNumber,CustomerId,CreditScore,Geography,Gender,Age,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,619,France,Female,42,0.00,1,1,1,101348.88,1
1,2,15647311,608,Spain,Female,41,83807.86,1,0,1,112542.58,0
2,3,15619304,502,France,Female,42,159660.80,3,1,0,113931.57,1
3,4,15701354,699,France,Female,39,0.00,2,0,0,93826.63,0
4,5,15737888,850,Spain,Female,43,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,771,France,Male,39,0.00,2,1,0,96270.64,0
9996,9997,15569892,516,France,Male,35,57369.61,1,1,1,101699.77,0
9997,9998,15584532,709,France,Female,36,0.00,1,0,1,42085.58,1
9998,9999,15682355,772,Germany,Male,42,75075.31,2,1,0,92888.52,1


In [None]:
Numeric_data = pd.get_dummies(data1)
Numeric_data

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,1,15634602,619,42,0.00,1,1,1,101348.88,1,1,0,0,1,0
1,2,15647311,608,41,83807.86,1,0,1,112542.58,0,0,0,1,1,0
2,3,15619304,502,42,159660.80,3,1,0,113931.57,1,1,0,0,1,0
3,4,15701354,699,39,0.00,2,0,0,93826.63,0,1,0,0,1,0
4,5,15737888,850,43,125510.82,1,1,1,79084.10,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,771,39,0.00,2,1,0,96270.64,0,1,0,0,0,1
9996,9997,15569892,516,35,57369.61,1,1,1,101699.77,0,1,0,0,0,1
9997,9998,15584532,709,36,0.00,1,0,1,42085.58,1,1,0,0,1,0
9998,9999,15682355,772,42,75075.31,2,1,0,92888.52,1,0,1,0,0,1


In [None]:
Numeric_data.isnull().sum()

RowNumber            0
CustomerId           0
CreditScore          0
Age                  0
Balance              0
NumOfProducts        0
HasCrCard            0
IsActiveMember       0
EstimatedSalary      0
Exited               0
Geography_France     0
Geography_Germany    0
Geography_Spain      0
Gender_Female        0
Gender_Male          0
dtype: int64

In [None]:
Numeric_data.dtypes

RowNumber              int64
CustomerId             int64
CreditScore            int64
Age                    int64
Balance              float64
NumOfProducts          int64
HasCrCard              int64
IsActiveMember         int64
EstimatedSalary      float64
Exited                 int64
Geography_France       uint8
Geography_Germany      uint8
Geography_Spain        uint8
Gender_Female          uint8
Gender_Male            uint8
dtype: object

In [None]:
Numeric_data['Exited'].value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

In [None]:
target = Numeric_data['Exited']

features = Numeric_data.drop(['Exited'], axis=1)

# set aside 20% of train and test data for evaluation
features_train, features_test, target_train, target_test = train_test_split(features, target,
    test_size=0.2, shuffle = True, random_state = 12345)

features_train, features_valid, target_train, target_valid = train_test_split(

    features, target, test_size=0.25, random_state=12345
)

print(features_train.shape)
print(target_train.shape)
print(features_valid.shape)
print(target_valid.shape)
print(features_test.shape)
print(target_test.shape)

(7500, 14)
(7500,)
(2500, 14)
(2500,)
(2000, 14)
(2000,)


In [None]:
model = RandomForestClassifier(random_state=12345, n_estimators=10)
        
model.fit(features_train,target_train)

print('Validation set:',model.score(features_valid, target_valid))
print('Training set:',model.score(features_train,target_train))
print('Test set:',model.score(features_test,target_test))

Validation set: 0.8392
Training set: 0.984
Test set: 0.836


In [None]:
model = DecisionTreeClassifier(random_state=12345)
        
model.fit(features_train,target_train)

print('Validation set:',model.score(features_valid, target_valid))
print('Training set:',model.score(features_train,target_train))
print('Test set:',model.score(features_test,target_test))

Validation set: 0.774
Training set: 1.0
Test set: 0.774


In [None]:
model = LogisticRegression(random_state=12345, class_weight='balanced', solver='liblinear')
        
model.fit(features_train,target_train)

print('Validation set:',model.score(features_valid, target_valid))
print('Training set:',model.score(features_train,target_train))
print('Test set:',model.score(features_test,target_test))

Validation set: 0.7072
Training set: 0.7118666666666666
Test set: 0.707


In [None]:
predictions_valid = model.predict(features_valid)
print('accuracy_score: ',accuracy_score(target_valid, predictions_valid))
print(' f1 score: ',f1_score(target_valid, predictions_valid)*100,'\n')

accuracy_score:  0.7072
 f1 score:  50.47361299052774 



In [None]:
#Upsampling

def upsample(features, target, repeat):
    features_zeros = features[target == 0]
    features_ones = features[target == 1]
    target_zeros = target[target == 0]
    target_ones = target[target == 1]

    features_upsampled = pd.concat([features_zeros] + [features_ones] * repeat)
    target_upsampled = pd.concat([target_zeros] + [target_ones] * repeat)

    features_upsampled, target_upsampled = shuffle(
        features_upsampled, target_upsampled, random_state=12345
    )

    return features_upsampled, target_upsampled


features_upsampled, target_upsampled = upsample(
    features_train, target_train, 10
)

model = LogisticRegression(random_state=12345, solver='liblinear')
model.fit(features_train, target_train)
predicted_valid = model.predict(features_valid)

print('F1:', f1_score(target_valid, predicted_valid))

F1: 0.0


In [None]:
#Downsampling

def downsample(features, target, fraction):
    features_zeros = features[target == 0]
    features_ones = features[target == 1]
    target_zeros = target[target == 0]
    target_ones = target[target == 1]

    features_downsampled = pd.concat(
        [features_zeros.sample(frac=fraction, random_state=12345)]
        + [features_ones]
    )
    target_downsampled = pd.concat(
        [target_zeros.sample(frac=fraction, random_state=12345)]
        + [target_ones]
    )

    features_downsampled, target_downsampled = shuffle(
        features_downsampled, target_downsampled, random_state=12345
    )

    return features_downsampled, target_downsampled


features_downsampled, target_downsampled = downsample(
    features_train, target_train, 0.1
)

model = LogisticRegression(solver='liblinear')
model.fit(features_downsampled, target_downsampled)
predicted_valid = model.predict(features_valid)

print('F1:', f1_score(target_valid, predicted_valid))

F1: 0.3525535420098847


In [None]:
#auc_roc
probabilities_valid = model.predict_proba(features_valid)
probabilities_one_valid = probabilities_valid[:, 1]

auc_roc = roc_auc_score(target_valid,probabilities_one_valid)

print(auc_roc)

0.5582525980357186
