In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, KBinsDiscretizer
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import precision_recall_fscore_support

In [2]:
# Report how many whitespace-only strings each column holds. This helper only
# reports; converting those entries to null values is left to the caller.
def get_whitespace_count(df):
    """Print and return the number of whitespace-only entries per column.

    Args:
        df: DataFrame to inspect.

    Returns:
        dict mapping each column name to the number of entries that consist
        solely of whitespace, or to -1 when the column's dtype is not
        ``object`` (i.e. the check was not applicable).
    """
    # Named `counts` rather than `dict` so the builtin is not shadowed.
    counts = {}
    for col in df.columns:
        if df[col].dtype == 'object':
            counts[col] = df[col].str.isspace().sum()
        else:
            counts[col] = -1

    print(counts)
    return counts

In [3]:
def get_binary_col_count(df, columns):
    """Return the names in ``columns`` whose data has exactly two counted values.

    Despite the name, the result is a list of column names rather than a
    number. Distinct values are counted with ``value_counts()``.
    """
    two_valued = []
    for name in columns:
        tally = df[name].value_counts()
        if len(tally) == 2:
            two_valued.append(name)
    return two_valued

In [3]:
def get_all_cols(df):
    """Classify the columns of ``df`` by kind.

    Args:
        df: DataFrame to inspect.

    Returns:
        A 7-tuple ``(columns, columns_with_nan, num_cols, cat_cols,
        cat_cols_with_nan, num_cols_with_nan, binary_cols)``.
        ``cat_cols_with_nan`` and ``num_cols_with_nan`` are sets; the other
        five entries are lists in the DataFrame's column order.
    """
    columns = list(df.columns)
    columns_with_nan = df.columns[df.isna().any()].tolist()
    # NOTE(review): _get_numeric_data is a private pandas helper; kept so the
    # numeric/categorical split stays exactly as before.
    num_cols = list(df._get_numeric_data().columns)
    numeric = set(num_cols)

    # Keep the DataFrame's column order. Building this list through a set made
    # the order (and everything derived from it downstream) vary between
    # kernel runs because of string hash randomisation.
    cat_cols = [col for col in columns if col not in numeric]

    cat_cols_with_nan = set(columns_with_nan) - numeric
    num_cols_with_nan = set(columns_with_nan) - set(cat_cols)
    binary_cols = get_binary_col_count(df, columns)

    return columns, columns_with_nan, num_cols, cat_cols, cat_cols_with_nan, num_cols_with_nan, binary_cols

In [5]:
def get_value_counts(df, num_cols):
    """Print the ``value_counts()`` table of every column named in ``num_cols``."""
    tables = (df[name].value_counts() for name in num_cols)
    for table in tables:
        print(table)


def transformStandardScaler(df, transformable_columns, label):
    """Return a copy of ``df`` with the given columns standard-scaled.

    Args:
        df: input DataFrame; it is not modified.
        transformable_columns: names of the columns to scale; it is not
            modified.
        label: target column name; never scaled, even if listed.

    Returns:
        A new DataFrame in which every listed column except ``label`` has
        been replaced by the output of ``StandardScaler().fit_transform``.
    """
    test = df.copy()

    # Build a fresh list rather than calling .remove() on the argument, which
    # silently altered the caller's list.
    scale_cols = [col for col in transformable_columns if col != label]

    # Nothing to scale: fitting a scaler on zero columns would raise.
    if not scale_cols:
        return test

    test[scale_cols] = StandardScaler().fit_transform(test[scale_cols])

    return test


def minMaxScaler(df, transformable_columns, label):
    """Return a copy of ``df`` with the given columns min-max scaled.

    Args:
        df: input DataFrame; it is not modified.
        transformable_columns: names of the columns to scale; it is not
            modified.
        label: target column name; never scaled, even if listed.

    Returns:
        A new DataFrame in which every listed column except ``label`` has
        been replaced by the output of a ``MinMaxScaler`` fitted on that
        column alone.
    """
    test = df.copy()

    # Build a fresh list rather than calling .remove() on the argument, which
    # silently altered the caller's list.
    scale_cols = [col for col in transformable_columns if col != label]

    for col in scale_cols:
        test[col] = MinMaxScaler().fit_transform(test[[col]])

    return test


def transformKBinsDiscretizer(df, transformable_columns, label, bins):
    """Return a copy of ``df`` with the given columns discretised into bins.

    Args:
        df: input DataFrame; it is not modified.
        transformable_columns: names of the columns to discretise; it is not
            modified.
        label: target column name; never discretised, even if listed.
        bins: value passed as ``n_bins`` to ``KBinsDiscretizer``.

    Returns:
        A new DataFrame in which every listed column except ``label`` has
        been replaced by the ordinal codes produced by a uniform-strategy
        ``KBinsDiscretizer`` fitted on that column alone.
    """
    test = df.copy()

    # Build a fresh list rather than calling .remove() on the argument, which
    # silently altered the caller's list.
    bin_cols = [col for col in transformable_columns if col != label]

    for col in bin_cols:
        est = KBinsDiscretizer(n_bins=bins, encode='ordinal', strategy='uniform')
        test[col] = est.fit_transform(test[[col]])

    return test

In [112]:
class MetricCalculator:
    """Binary-classification metrics computed from true and predicted 0/1 labels.

    Call ``calculate_cf_matrix_fields()`` (or ``calculate_all_metric()``, which
    does so itself) before the individual ``calculate_*`` methods; until then
    every confusion-matrix count is 0.
    """

    def __init__(self, y_real, y_pred) -> None:
        """Store the label sequences; the counts start at zero.

        Args:
            y_real: true labels (0 or 1); any array-like, 1-D or column-shaped.
            y_pred: predicted labels (0 or 1), same number of entries.
        """
        self.TP = 0
        self.TN = 0
        self.FP = 0
        self.FN = 0
        self.y_real = y_real
        self.y_pred = y_pred
        self.num_rows = len(y_pred)

    def calculate_cf_matrix_fields(self):
        """(Re)compute TP, TN, FP and FN from the stored labels.

        The counts are assigned rather than incremented, so calling this
        method more than once no longer inflates them.

        Raises:
            ValueError: if the two label sequences differ in size.
        """
        # Flatten to plain arrays so that lists, (n, 1) arrays and pandas
        # Series with a non-default index are all handled positionally.
        real = np.asarray(self.y_real).ravel()
        pred = np.asarray(self.y_pred).ravel()
        if real.shape != pred.shape:
            raise ValueError(
                f'y_real and y_pred must have the same number of entries, '
                f'got {real.size} and {pred.size}'
            )

        self.TP = int(np.count_nonzero((real == 1) & (pred == 1)))
        self.FP = int(np.count_nonzero((real == 0) & (pred == 1)))
        self.TN = int(np.count_nonzero((real == 0) & (pred == 0)))
        self.FN = int(np.count_nonzero((real == 1) & (pred == 0)))

    def get_cf_field(self):
        """Return the counts as the tuple ``(TP, TN, FP, FN)``."""
        return self.TP, self.TN, self.FP, self.FN

    def calculate_all_metric(self):
        """Compute the confusion-matrix counts and print every metric."""
        self.calculate_cf_matrix_fields()
        print(f'TP: {self.TP}')
        print(f'TN: {self.TN}')
        print(f'FP: {self.FP}')
        print(f'FN: {self.FN}')

        print(f'Accuracy: {self.calculate_accuracy()}')
        print(f'Recall: {self.calculate_recall()}')
        print(f'Specificity: {self.calculate_specificity()}')
        print(f'Precision: {self.calculate_precision()}')
        print(f'False Discovery Rate: {self.calculate_false_discovery_rate()}')
        print(f'F1 score: {self.calculate_f1_score()}')

    @staticmethod
    def _ratio(numerator, denominator):
        """Return ``numerator / denominator``, or 0.0 when the denominator is 0."""
        return numerator / denominator if denominator else 0.0

    def calculate_accuracy(self):
        """(TP + TN) / all samples; 0.0 when there are no samples."""
        return self._ratio(self.TP + self.TN, self.TP + self.TN + self.FP + self.FN)

    def calculate_precision(self):
        """TP / (TP + FP); 0.0 when nothing was predicted positive."""
        return self._ratio(self.TP, self.TP + self.FP)

    def calculate_recall(self):
        """TP / (TP + FN); 0.0 when there are no real positives."""
        return self._ratio(self.TP, self.TP + self.FN)

    def calculate_specificity(self):
        """TN / (TN + FP); 0.0 when there are no real negatives."""
        return self._ratio(self.TN, self.TN + self.FP)

    def calculate_false_discovery_rate(self):
        """FP / (FP + TP); 0.0 when nothing was predicted positive."""
        return self._ratio(self.FP, self.FP + self.TP)

    def calculate_f1_score(self):
        """2*TP / (2*TP + FP + FN); 0.0 when that denominator is 0."""
        return self._ratio(2 * self.TP, 2 * self.TP + self.FP + self.FN)

In [101]:
class InformationGain:
    """Rank feature columns by information gain against a binary (0/1) label.

    The DataFrame handed to the constructor is modified in place: continuous
    columns are overwritten with quartile bins. Pass a copy if the original
    values are still needed.
    """

    def __init__(self, df, num_cols, label):
        """
        Args:
            df: DataFrame holding the features and the label (mutated later).
            num_cols: names of the numeric columns that may be binned.
            label: name of the 0/1 target column.
        """
        self.df = df
        self.num_cols = num_cols
        self.label = label
        self.original_columns = self.df.columns

    def get_final_column_list(self, num_of_features):
        """Return the columns to keep: the label plus the top-gain features.

        Args:
            num_of_features: number of feature columns to retain. Values
                larger than the number of features keep every column.

        Returns:
            List of column names in their original order, with the
            lowest-gain features removed.
        """
        self.cont_to_bins_pipeline()  # careful here: this mutates self.df

        # Iterate in column order (not set order) so that ties between equal
        # gains are broken the same way on every run.
        candidates = [col for col in self.df.columns if col != self.label]
        gain_dict = {col: self.calculate_gain(col) for col in candidates}
        sorted_gain_dict = dict(sorted(gain_dict.items(), key=lambda item: item[1]))
        print('Sorted gain dict:', sorted_gain_dict)

        # Clamp at 0: a negative slice end would drop columns from the front
        # when more features are requested than exist.
        num_to_drop = max(0, len(sorted_gain_dict) - num_of_features)
        cols_to_drop = set(list(sorted_gain_dict)[:num_to_drop])
        final_cols = [col for col in self.original_columns if col not in cols_to_drop]

        return final_cols

    def calculate_gain(self, attribute):
        """Return the information gain of ``attribute``, or None for the label."""
        if self.label != attribute:
            p = len(self.df[self.df[self.label] == 1])
            df_size = len(self.df)  # p + n
            data_entropy = self.calculate_entropy(p / df_size)

            attribute_remainder = self.calculate_remainder(attribute)
            return data_entropy - attribute_remainder

    def calculate_remainder(self, attribute):
        """Return the weighted label entropy left after splitting on ``attribute``."""
        column = self.df[attribute]
        labels = self.df[self.label]
        total_rows = len(self.df)

        remainder_sum = 0
        for attrib_val in column.dropna().unique():
            # Select the labels of the rows holding this value with a boolean
            # mask. The previous where(...).dropna() also threw away rows that
            # had NaN in any unrelated column.
            subset_labels = labels[column == attrib_val].dropna()

            pk = int((subset_labels == 1).sum())
            nk = int((subset_labels == 0).sum())

            # sanity check: the label must be strictly 0/1
            assert pk + nk == len(subset_labels), 'label column must contain only 0 and 1'

            if pk + nk == 0:
                continue

            prob = (pk + nk) / total_rows
            attr_entropy = self.calculate_entropy(pk / (pk + nk))
            remainder_sum += prob * attr_entropy

        return remainder_sum

    @staticmethod
    def calculate_entropy(q):
        """Binary entropy, in bits, of a positive-class probability ``q``.

        Returns 0 for ``q <= 0`` and for ``q >= 1``; evaluating the formula at
        ``q == 1`` would give NaN (0 * log2(0)).
        """
        if 0 < q < 1:
            return -1 * (q * np.log2(q) + (1 - q) * np.log2(1 - q))
        return 0

    def convert_cont_to_bins(self, old_col_name):
        """Overwrite a numeric column of ``self.df`` with quartile labels q1..q4.

        Columns whose minimum equals their median are left untouched.
        """
        col_min = self.df[old_col_name].min()
        col_median = self.df[old_col_name].median()

        if col_min != col_median:
            # Assign in place of the old add/drop/rename sequence; the column
            # keeps its name and no temporary '<name>_' column is needed.
            self.df[old_col_name] = pd.qcut(
                self.df[old_col_name], q=4, labels=['q1', 'q2', 'q3', 'q4']
            )

    def cont_to_bins_pipeline(self):
        """Bin every column in ``num_cols`` except the label."""
        # `set(self.label)` would be the set of the label's characters, so the
        # label column itself was never excluded; compare names instead.
        # dict.fromkeys removes duplicates while keeping order.
        for col in dict.fromkeys(self.num_cols):
            if col != self.label:
                self.convert_cont_to_bins(col)


In [102]:
def preprocess_churn_data(df, label, num_of_features):
    """Clean, optionally feature-select, encode and scale the churn data.

    Steps, in order: drop rows with a missing label, drop ``customerID``,
    label-encode ``label``, turn blank ``TotalCharges`` strings into NaN and
    mean-impute them, optionally keep only the ``num_of_features`` features
    with the highest information gain, impute any remaining NaNs (mode for
    categorical columns, mean for numeric ones), one-hot encode categorical
    columns with more than two values, label-encode two-valued columns and
    standard-scale the remaining numeric columns.

    Args:
        df: raw DataFrame containing ``customerID``, ``TotalCharges`` and
            ``label``. It is not modified.
        label: name of the target column.
        num_of_features: number of feature columns to keep, or -1 to skip
            feature selection.

    Returns:
        A new, preprocessed DataFrame with a fresh 0..n-1 index.
    """
    print(len(df))

    # dropna returns a new frame. The result used to be discarded, so rows
    # with a missing label were never removed.
    df = df.dropna(axis=0, subset=[label])

    print(len(df))

    # drop the customer ID column (non-inplace, so the caller's frame is untouched)
    df = df.drop('customerID', axis=1)

    # converting the labels(y) to numeric labels
    print(df.head())
    label_encoder = preprocessing.LabelEncoder()
    df[label] = label_encoder.fit_transform(df[label])
    print(df.head())

    print("\nMissing values :  ", df.isnull().sum().values.sum())

    # report the whitespace-only counts, blank them out, then report again
    get_whitespace_count(df)

    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0
    df['TotalCharges'] = df['TotalCharges'].replace(r'^\s*$', np.nan, regex=True)

    get_whitespace_count(df)

    print("\nMissing values :  ", df.isnull().sum())

    # converting a single column to float
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], downcast="float", errors='coerce')

    print("\nBefore Missing values :  ", df.isnull().sum())

    # replacing the missing values with the mean for total charges; assigned
    # back because fillna(inplace=True) on a selected column does not reach
    # the frame under pandas copy-on-write
    df['TotalCharges'] = df['TotalCharges'].fillna(value=df['TotalCharges'].mean())

    print("\nAfter Missing values :  ", df.isnull().sum())

    if num_of_features != -1:
        # get all columns
        columns, columns_with_nan, num_cols, cat_cols, cat_cols_with_nan, num_cols_with_nan, binary_cols = get_all_cols(df)

        # gain computations run on a copy because InformationGain bins its frame in place
        temp_df = df.copy()
        gainFilter = InformationGain(temp_df, num_cols, label)
        final_cols = gainFilter.get_final_column_list(num_of_features=num_of_features)

        # .copy() so the assignments below write to an independent frame
        df = df[final_cols].copy()

    # get all columns
    columns, columns_with_nan, num_cols, cat_cols, cat_cols_with_nan, num_cols_with_nan, binary_cols = get_all_cols(df)

    # filling missing values in categorical columns with the mode (most frequent value)
    for cat_col in cat_cols_with_nan:
        df[cat_col] = df[cat_col].fillna(value=df[cat_col].mode()[0])

    print(df.isnull().sum())

    # filling missing values in numerical columns with the mean
    for num_col in num_cols_with_nan:
        df[num_col] = df[num_col].fillna(value=df[num_col].mean())

    print(df.isnull().sum())

    # one hot encoding the non-binary categorical cols, in column order so the
    # layout of the encoded frame is the same on every run
    multi_cat_cols = [col for col in df.columns if col in cat_cols and col not in binary_cols]
    df = pd.get_dummies(df, columns=multi_cat_cols)

    # label encoding the binary cols; the label is skipped by name
    # (`set(label)` is the set of its characters and excluded nothing)
    for col in [col for col in binary_cols if col != label]:
        print('Label Encoding: ', col)
        label_encoder = preprocessing.LabelEncoder()
        df[col] = label_encoder.fit_transform(df[col])

    scale_cols = [col for col in num_cols if col not in binary_cols]
    if scale_cols:
        df = transformStandardScaler(df, scale_cols, label)

    df = df.reset_index(drop=True)

    return df


In [103]:
# Load the raw churn CSV and preprocess it with 'Churn' as the label and
# num_of_features=5. `df` is the frame the model cells below are called with.
df = pd.read_csv('./data/cust_churn.csv')
df = preprocess_churn_data(df, 'Churn', 5)
df.head()

7043
7043
   gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  Female              0     Yes         No       1           No   
1    Male              0      No         No      34          Yes   
2    Male              0      No         No       2          Yes   
3    Male              0      No         No      45           No   
4  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity OnlineBackup  \
0  No phone service             DSL             No          Yes   
1                No             DSL            Yes           No   
2                No             DSL            Yes          Yes   
3  No phone service             DSL            Yes           No   
4                No     Fiber optic             No           No   

  DeviceProtection TechSupport StreamingTV StreamingMovies        Contract  \
0               No          No          No              No  Month-to-month   
1              Yes    

Unnamed: 0,tenure,Churn,TechSupport_No,TechSupport_No internet service,TechSupport_Yes,InternetService_DSL,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year
0,-1.277445,0,1,0,0,1,0,0,1,0,0,1,0,0
1,0.066327,0,1,0,0,1,0,0,0,0,1,0,1,0
2,-1.236724,1,1,0,0,1,0,0,0,0,1,1,0,0
3,0.514251,0,0,0,1,1,0,0,0,0,1,0,1,0
4,-1.236724,1,1,0,0,0,1,0,1,0,0,1,0,0


In [121]:
def logr(df, label, max_iter=100):
    """Fit a logistic regression on an 80/20 split of ``df`` and print its metrics.

    Args:
        df: preprocessed DataFrame containing the features and ``label``.
        label: name of the target column.
        max_iter: iteration cap passed to ``LogisticRegression``.
    """
    target = df[label]
    features = df.drop(label, axis=1)

    assert features.shape[0] == target.shape[0]

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=1
    )

    model = LogisticRegression(random_state=0, max_iter=max_iter)
    model = model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    print(model.score(X_test, y_test))

    # MetricCalculator is given column-shaped numpy arrays.
    pred_col = np.array(predictions).reshape(predictions.shape[0], 1)
    real_col = np.array(y_test).reshape(y_test.shape[0], 1)
    MetricCalculator(real_col, pred_col).calculate_all_metric()

In [122]:
def adaboost(df, label):
    """Fit a 100-estimator AdaBoost classifier on an 80/20 split and print its metrics.

    Args:
        df: preprocessed DataFrame containing the features and ``label``.
        label: name of the target column.
    """
    target = df[label]
    features = df.drop(label, axis=1)

    assert features.shape[0] == target.shape[0]

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=1
    )

    model = AdaBoostClassifier(n_estimators=100, random_state=0)
    model = model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(model)
    print(model.score(X_test, y_test))

    # MetricCalculator is given column-shaped numpy arrays.
    pred_col = np.array(predictions).reshape(predictions.shape[0], 1)
    real_col = np.array(y_test).reshape(y_test.shape[0], 1)
    MetricCalculator(real_col, pred_col).calculate_all_metric()

In [123]:
def dt(df, label):
    """Fit an entropy-criterion decision tree on an 80/20 split and print its metrics.

    Prints sklearn's confusion-matrix counts (tp, tn, fp, fn), the accuracy,
    the macro-averaged precision/recall/F-score tuple, and finally the
    MetricCalculator report.

    Args:
        df: preprocessed DataFrame containing the features and ``label``.
        label: name of the target column.
    """
    target = df[label]
    features = df.drop(label, axis=1)

    assert features.shape[0] == target.shape[0]

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=1
    )

    model = DecisionTreeClassifier(criterion='entropy', random_state=0)
    model = model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
    for count in (tp, tn, fp, fn):
        print(count)

    macro_scores = precision_recall_fscore_support(y_test, predictions, average='macro')
    print(model.score(X_test, y_test))
    print(macro_scores)

    # MetricCalculator is given column-shaped numpy arrays.
    pred_col = np.array(predictions).reshape(predictions.shape[0], 1)
    real_col = np.array(y_test).reshape(y_test.shape[0], 1)
    MetricCalculator(real_col, pred_col).calculate_all_metric()


In [124]:
# Logistic regression on the preprocessed frame, with the iteration cap raised to 1000.
logr(df, 'Churn', max_iter=1000)

0.7977288857345636
TP: 182
TN: 942
FP: 119
FN: 166
Accuracy: 0.7977288857345636
Recall: 0.5229885057471264
Specificity: 0.8878416588124411
Precision: 0.6046511627906976
False Discovery Rate: 0.3953488372093023
F1 score: 0.5608628659476117


In [120]:
# AdaBoost on the same preprocessed frame and the same train/test split settings.
adaboost(df, 'Churn')

AdaBoostClassifier(n_estimators=100, random_state=0)
0.7934705464868701
TP: 179
TN: 939
FP: 122
FN: 169
Accuracy: 0.7934705464868701
Recall: 0.514367816091954
Specificity: 0.8850141376060321
Precision: 0.5946843853820598
False Discovery Rate: 0.4053156146179402
F1 score: 0.551617873651772


In [113]:
# Decision tree on the same preprocessed frame and the same train/test split settings.
dt(df, 'Churn')

193
904
157
155
0.7785663591199432
(0.7025320383110751, 0.7033120456736759, 0.702919392333892, None)
TP: 193
TN: 904
FP: 157
FN: 155
Accuracy: 0.7785663591199432
Recall: 0.5545977011494253
Specificity: 0.8520263901979265
Precision: 0.5514285714285714
False Discovery Rate: 0.44857142857142857
F1 score: 0.5530085959885387
