## Sampling Methods and Checking Accuracy and F1-score using Logistic Regression

In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
import numpy as np

In [2]:
df = pd .read_csv("DataSets/Customer_Churn_Cleaned.csv")
df.drop(columns = ['Unnamed: 0'],inplace =True)

In [3]:
X = df.drop(columns = ['Churn'])
y = df.Churn
class_0 = df[df['Churn'] == 0]
class_1 = df[df['Churn'] == 1]

In [4]:
def DownSampling(y0,y1,df):
        
        class_0_count,class_1_count = df.Churn.value_counts()
        low_count = min(class_0_count,class_1_count)
        
        if low_count == class_0_count:
            y1_DS = y1.sample(low_count)
#             Here we down sample y1
            df_set1 = pd.concat([y1_DS,y0])
    
        if low_count == class_1_count:
            y0_DS = y0.sample(low_count)
#             Here we down sample y1
            df_set1 = pd.concat([y0_DS,y1])
    
        X = df_set1.drop(columns = ['Churn'])
        y = df_set1.Churn 
        
        X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = 10,stratify = y)
        
        model = LogisticRegression()
        model.fit(X_train,y_train)
        predict = model.predict(X_test)
        predict = np.round(predict)
        print(classification_report(y_test,predict))
        

In [5]:
def OverSampling(y0,y1,df):
        
        class_0_count,class_1_count = df.Churn.value_counts()
        high_count = max(class_0_count,class_1_count)
        
        if high_count == class_0_count:
            y1_DS = y1.sample(high_count,replace = True)
#             Here we down sample y1
            df_set1 = pd.concat([y1_DS,y0])
    
        if high_count == class_1_count:
            y0_DS = y0.sample(high_count,replace = True)
#             Here we down sample y1
            df_set1 = pd.concat([y0_DS,y1])
    
        X = df_set1.drop(columns = ['Churn'])
        y = df_set1.Churn 
        
        X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = 10,stratify = y)
        
        model = LogisticRegression()
        model.fit(X_train,y_train)
        predict = model.predict(X_test)
        predict = np.round(predict)
        print(classification_report(y_test,predict))

In [6]:
def SmtSampling(X,y,df):
        
        smt = SMOTE(sampling_strategy='minority')
        X,y = smt.fit_resample(X,y)
        
        X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = 10)
        
        model = LogisticRegression()
        model.fit(X_train,y_train)
        predict = model.predict(X_test)
        predict = np.round(predict)
        print(classification_report(y_test,predict))

In [7]:
print(DownSampling(class_0,class_1,df))
print(OverSampling(class_0,class_1,df))
print(SmtSampling(X,y,df))

              precision    recall  f1-score   support

           0       0.77      0.74      0.76       374
           1       0.75      0.78      0.77       374

    accuracy                           0.76       748
   macro avg       0.76      0.76      0.76       748
weighted avg       0.76      0.76      0.76       748

None
              precision    recall  f1-score   support

           0       0.80      0.74      0.77      1035
           1       0.76      0.81      0.78      1035

    accuracy                           0.77      2070
   macro avg       0.78      0.77      0.77      2070
weighted avg       0.78      0.77      0.77      2070

None
              precision    recall  f1-score   support

           0       0.83      0.79      0.81      1057
           1       0.79      0.84      0.81      1013

    accuracy                           0.81      2070
   macro avg       0.81      0.81      0.81      2070
weighted avg       0.81      0.81      0.81      2070

None


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
# Here in each case accuracy and f1-score is increased due to sampling of imbalanced data