In [1]:
"""# Import Libraries"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

In [2]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas / scikit-learn
# deprecation and chained-assignment warnings raised by later cells;
# consider narrowing it to specific categories.
import warnings
warnings.filterwarnings('ignore') 

In [3]:
"""# Import Datasets"""

# Read the raw training data; the path is relative to the notebook's
# working directory.
df_train = pd.read_csv('Train_Dataset.csv')

In [4]:
def set_outliers_to_nan(p , feature_boundries):
  """Return a copy of ``p`` with out-of-range values replaced by NaN.

  ``feature_boundries`` maps a column name to a ``[lower, upper]`` pair.
  Values strictly below ``lower`` or strictly above ``upper`` become NaN;
  values equal to a bound are kept. The column name and its bounds are
  printed for every processed column. ``p`` itself is not modified.
  """
  cleaned = p.copy()

  for feature, bounds in feature_boundries.items():
    lower_bound, upper_bound = bounds[0], bounds[1]
    print(feature , lower_bound , upper_bound)
    column = cleaned[feature]
    # One mask covers both tails; comparisons against NaN are False, so
    # already-missing entries are left untouched.
    out_of_range = (column < lower_bound) | (column > upper_bound)
    cleaned.loc[out_of_range, feature] = np.nan
  return cleaned

def isNaN(num):
    """Return True when ``num`` is not a finite number.

    ``num`` is coerced with ``float()``. NaN as well as positive and
    negative infinity yield True; every finite value yields False.
    """
    value = float(num)
    return not (float('-inf') < value < float('inf'))

def linear_regression_imputation_to_nan(p, corr_features):
    """Impute missing values in pairs of linearly related columns.

    For every ``[f1, f2]`` pair in ``corr_features``:

    * rows where both columns are missing receive each column's median;
    * rows where exactly one column is missing receive the value predicted
      by a ``LinearRegression`` fitted on the rows where both columns are
      present, with the other column as the single predictor.

    Parameters
    ----------
    p : pandas.DataFrame
        Input frame; it is not modified.
    corr_features : iterable of two-item sequences
        Column-name pairs to impute against each other.

    Returns
    -------
    pandas.DataFrame
        A copy of ``p`` with the listed columns imputed.
    """
    df = p.copy()
    for c_f in corr_features:
        f1, f2 = c_f[0], c_f[1]

        # Take all masks from the untouched columns. Recomputing the
        # "both missing" mask after f1 was filled would leave it empty,
        # so f2 would never receive its median.
        f1_null = df[f1].isnull()
        f2_null = df[f2].isnull()
        both_missing = f1_null & f2_null
        only_f1_missing = f1_null & ~f2_null
        only_f2_missing = f2_null & ~f1_null

        df.loc[both_missing, f1] = df[f1].median()
        df.loc[both_missing, f2] = df[f2].median()

        if not (only_f1_missing.any() or only_f2_missing.any()):
            continue  # nothing left to predict for this pair

        # Fit only on rows that were fully observed, so the median-filled
        # rows above do not influence the regression.
        complete = df.loc[~f1_null & ~f2_null, [f1, f2]]
        x_f1 = complete[[f1]].to_numpy()
        x_f2 = complete[[f2]].to_numpy()
        f2_from_f1 = LinearRegression().fit(x_f1, x_f2)
        f1_from_f2 = LinearRegression().fit(x_f2, x_f1)

        # Write through .loc: chained `df[col][i] = value` assignment is not
        # guaranteed to reach `df` (and never does under Copy-on-Write).
        if only_f1_missing.any():
            known_f2 = df.loc[only_f1_missing, [f2]].to_numpy()
            df.loc[only_f1_missing, f1] = f1_from_f2.predict(known_f2).ravel()
        if only_f2_missing.any():
            known_f1 = df.loc[only_f2_missing, [f1]].to_numpy()
            df.loc[only_f2_missing, f2] = f2_from_f1.predict(known_f1).ravel()

    return df

def categorical_imputation_by_most_frequent_value(p, features):
    """Fill missing values of each listed column with its most frequent value.

    The most frequent value is taken from the frame that was passed in,
    not from any notebook-level global, so the function works on any
    DataFrame. A column with no observed values is left unchanged.

    Parameters
    ----------
    p : pandas.DataFrame
        Input frame; it is not modified.
    features : iterable of str
        Names of the columns to impute.

    Returns
    -------
    pandas.DataFrame
        A copy of ``p`` with the listed columns imputed.
    """
    df = p.copy()
    for f in features:
        # value_counts() already ignores NaN.
        counts = df[f].value_counts()
        if counts.empty:
            continue  # nothing observed, so there is no mode to fill with
        df[f] = df[f].fillna(counts.idxmax())
    return df

def nan_imputation_by_median(p, features):
    """Return a copy of ``p`` whose listed columns have NaN replaced by the column median.

    The median is computed per column from that column's values in the
    copy. Columns not listed in ``features`` are left as they are.
    """
    filled = p.copy()
    for column_name in features:
        median_value = filled[column_name].median()
        missing_rows = filled[column_name].isnull()
        filled.loc[missing_rows, column_name] = median_value
    return filled

def show_correlations(dataframe, show_chart = True):
    """Compute the correlation matrix of ``dataframe`` and optionally plot it.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are correlated with ``DataFrame.corr()``.
    show_chart : bool, default True
        When truthy, draw the matrix as an annotated seaborn heatmap on a
        new 20x10 figure. When falsy, no figure is created at all.

    Returns
    -------
    pandas.DataFrame
        The correlation matrix.
    """
    corr = dataframe.corr()
    if show_chart:
        # Create the figure only when it is used, so show_chart=False does
        # not leave an empty figure behind.
        plt.figure(figsize = (20,10))
        sns.heatmap(corr,
                    xticklabels=corr.columns.values,
                    yticklabels=corr.columns.values,
                    annot=True)
    return corr

def kdeplot(feature , df):
    """Draw the kernel density estimate of ``feature`` for each churn class.

    Rows with ``Churn == 0`` are drawn in navy and rows with ``Churn == 1``
    in orange, both on one 9x4 figure, which is then shown.
    """
    plt.figure(figsize=(9, 4))
    plt.title("KDE for {}".format(feature))
    churn_styles = ((0, 'navy', 'Churn: No'), (1, 'orange', 'Churn: Yes'))
    for churn_value, line_color, line_label in churn_styles:
        values = df[df['Churn'] == churn_value][feature]
        sns.kdeplot(values, color=line_color, label=line_label)
    plt.show()

In [5]:
"""# Define Variables"""

# Categorical columns, including the target `Churn`.
cat_col = ['location_code' , 'intertiol_plan' , 'voice_mail_plan' , 'Churn']

# Numeric columns of the raw dataset.
num_col = [
    'account_length',
    'number_vm_messages',
    'total_day_min',
    'total_day_calls',
    'total_day_charge',
    'total_eve_min',
    'total_eve_calls',
    'total_eve_charge',
    'total_night_minutes',
    'total_night_calls',
    'total_night_charge',
    'total_intl_minutes',
    'total_intl_calls',
    'total_intl_charge',
    'customer_service_calls',
]

# Accepted [lower, upper] range per numeric column; values outside it are
# treated as outliers by set_outliers_to_nan.
feature_boundries = {
    'account_length': [0, 210],
    'number_vm_messages': [0, 50],
    'total_day_min': [0, 350],
    'total_day_calls': [0, 160],
    'total_day_charge': [0, 60],
    'total_eve_min': [0, 400],
    'total_eve_calls': [40, 170],
    'total_eve_charge': [3, 31],
    'total_night_minutes': [23, 400],
    'total_night_calls': [30, 175],
    'total_night_charge': [0, 20],
    'total_intl_minutes': [0, 20],
    'total_intl_calls': [0, 18],
    'total_intl_charge': [0, 5],
    'customer_service_calls': [0, 9],
}

# (charge, minutes) column pairs that are imputed from one another.
correlated_feature_couple = [
    ['total_eve_charge', 'total_eve_min'],
    ['total_night_charge', 'total_night_minutes'],
    ['total_intl_charge', 'total_intl_minutes'],
    ['total_day_charge', 'total_day_min'],
]

# Categorical input features: cat_col without the target.
cat_col1 = [name for name in cat_col if name != 'Churn']

# Numeric columns that belong to no correlated pair, in num_col order.
non_corr_numer_col = [
    name for name in num_col
    if not any(name in couple for couple in correlated_feature_couple)
]

# Names given to the one-hot encoded location columns.
onehot_col = ['location_code_' + code for code in ('452', '445', '547')]

In [6]:
"""# Drop Unwanted Columns"""

# Drop the 'Unnamed: 20' column when it exists; errors='ignore' keeps this
# cell a no-op when the column is absent.
df_train = df_train.drop(columns=['Unnamed: 20'] , errors='ignore')

In [7]:
# Count rows that are exact duplicates once `customer_id` is excluded from
# the comparison.
df1 = df_train.drop(columns=['customer_id'])

df1.duplicated().sum()

4

In [8]:
"""# Drop Duplicates"""

# Rows are compared with `customer_id` excluded.
df1 = df_train.drop(columns=['customer_id'])

# Boolean mask: True for every repeat of an earlier row.
duplicates = df1.duplicated()

# Read the duplicate labels from the mask instead of scanning a hard-coded
# 0..2320 range, so the cell works for any row count and any index.
dup_index = duplicates[duplicates].index.tolist()

df_train = df_train.drop(labels=dup_index, axis=0)

In [9]:
"""# Handle Invalid Data"""

# Treat negative values in the numeric columns as invalid and replace them
# with NaN so the later imputation steps handle them.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
for col in num_col:
  df_train[col] = np.where(df_train[col] < 0, np.nan , df_train[col])

In [10]:
"""# Handle Outliers"""

# Replace values outside the per-column ranges in `feature_boundries` with
# NaN; the helper prints each column with its bounds.
df_train = set_outliers_to_nan(df_train , feature_boundries)

account_length 0 210
number_vm_messages 0 50
total_day_min 0 350
total_day_calls 0 160
total_day_charge 0 60
total_eve_min 0 400
total_eve_calls 40 170
total_eve_charge 3 31
total_night_minutes 23 400
total_night_calls 30 175
total_night_charge 0 20
total_intl_minutes 0 20
total_intl_calls 0 18
total_intl_charge 0 5
customer_service_calls 0 9


In [11]:
"""# Handle Missing Values"""

# 1. Correlated (charge, minutes) pairs are imputed from each other.
df_train = linear_regression_imputation_to_nan(df_train , correlated_feature_couple)

# 2. Categorical inputs get their most frequent value.
df_train = categorical_imputation_by_most_frequent_value(df_train , cat_col1)

# 3. The remaining numeric columns get their median.
df_train = nan_imputation_by_median(df_train , non_corr_numer_col)

# 4. Rows without a target label are dropped rather than imputed.
df_train = df_train.dropna(subset=['Churn'])

In [12]:
# Work on a copy so the encoded columns can be assembled without touching
# df_train.
df_train_onehot = df_train.copy()

In [13]:
# One-hot encode `location_code`. The result is a dense frame with integer
# column labels 0..k-1 and a fresh 0..n-1 row index (not df_train's index).
enc = OneHotEncoder(handle_unknown='ignore')
enc_df_train = pd.DataFrame(enc.fit_transform(df_train_onehot[['location_code']]).toarray())

In [14]:
# Insert the encoded columns at positions 2-4. `.values` copies by position,
# so the differing row indexes of the two frames do not matter.
# NOTE(review): per the scikit-learn docs, OneHotEncoder orders its output
# columns by sorted category. If the codes are 445 / 452 / 547, output
# column 0 would be 445, not 452 — confirm these labels against
# `enc.categories_[0]` before relying on the column names.
df_train_onehot.insert(2 , 'location_code_452' , enc_df_train[0].values , False)
df_train_onehot.insert(3 , 'location_code_445' , enc_df_train[1].values , False)
df_train_onehot.insert(4 , 'location_code_547' , enc_df_train[2].values , False)

In [15]:
# Copy the encoded columns back into df_train (appended as new columns).
df_train[onehot_col] = df_train_onehot[onehot_col]

In [16]:
# The raw categorical column is redundant once it has been one-hot encoded.
df_train = df_train.drop(columns=['location_code'])

In [17]:
# Encode yes/no as 1/0. Series.map turns any value not in the mapping
# (different casing, stray whitespace, ...) into NaN.
df_train.voice_mail_plan = df_train.voice_mail_plan.map(dict(yes=1, no=0))

In [18]:
# Encode yes/no as 1/0; values outside the mapping become NaN.
# (`intertiol_plan` is spelled as the column is named in this frame.)
df_train.intertiol_plan = df_train.intertiol_plan.map(dict(yes=1, no=0))

In [19]:
# Encode the target as 1/0. Unlike the plan columns, the keys here are
# capitalised ('Yes'/'No'); values outside the mapping become NaN.
df_train.Churn = df_train.Churn.map(dict(Yes=1, No=0))

In [20]:
# new features
# Total charge over the international, night, evening and day columns.
df_train['total_charge'] = (
    df_train['total_intl_charge']
    + df_train['total_night_charge']
    + df_train['total_eve_charge']
    + df_train['total_day_charge']
)

In [21]:
# Total number of calls over the international, night, evening and day columns.
df_train['total_calls'] = (
    df_train['total_intl_calls']
    + df_train['total_night_calls']
    + df_train['total_eve_calls']
    + df_train['total_day_calls']
)

In [22]:
# Total minutes over the international, night, evening and day columns.
df_train['total_min'] = (
    df_train['total_intl_minutes']
    + df_train['total_night_minutes']
    + df_train['total_eve_min']
    + df_train['total_day_min']
)

In [23]:
# Number of subscribed plans (0, 1 or 2): the sum of the two 0/1 plan flags
# encoded in the cells above.
df_train["no_of_plans"] = df_train['intertiol_plan'] + df_train['voice_mail_plan']

In [24]:
# Modelling frame: everything except the `customer_id` column.
df = df_train.drop(columns=['customer_id'])

In [25]:
# Sanity check: count of missing values per column after imputation/encoding.
df.isnull().sum()

account_length            0
intertiol_plan            0
voice_mail_plan           0
number_vm_messages        0
total_day_min             0
total_day_calls           0
total_day_charge          0
total_eve_min             0
total_eve_calls           0
total_eve_charge          0
total_night_minutes       0
total_night_calls         0
total_night_charge        0
total_intl_minutes        0
total_intl_calls          0
total_intl_charge         0
customer_service_calls    0
Churn                     0
location_code_452         0
location_code_445         0
location_code_547         0
total_charge              0
total_calls               0
total_min                 0
no_of_plans               0
dtype: int64

In [26]:
# Preview the first ten rows of the modelling frame.
df.head(10)

Unnamed: 0,account_length,intertiol_plan,voice_mail_plan,number_vm_messages,total_day_min,total_day_calls,total_day_charge,total_eve_min,total_eve_calls,total_eve_charge,...,total_intl_charge,customer_service_calls,Churn,location_code_452,location_code_445,location_code_547,total_charge,total_calls,total_min,no_of_plans
0,15.0,0,0,0.0,121.1,130.0,20.59,216.0,86.0,18.36,...,4.35,2.0,0,0.0,1.0,0.0,53.88,254.0,588.3,0
1,105.0,0,0,0.0,259.3,96.0,44.08,175.2,97.0,14.89,...,3.24,3.0,0,0.0,1.0,0.0,72.22,234.0,668.9,0
2,97.0,0,1,32.0,183.4,94.0,31.18,269.1,120.0,22.87,...,1.81,5.0,0,0.0,1.0,0.0,65.02,256.0,662.7,1
3,121.0,0,1,35.0,68.7,95.0,11.68,209.2,69.0,17.78,...,3.08,1.0,0,1.0,0.0,0.0,41.42,210.0,486.7,1
4,98.0,0,1,22.0,278.3,89.0,47.31,93.4,143.0,7.94,...,2.62,0.0,0,0.0,1.0,0.0,62.71,279.0,489.0,1
5,151.0,0,1,17.0,214.7,97.0,36.5,138.5,90.0,11.77,...,2.32,1.0,0,1.0,0.0,0.0,58.2,235.0,530.9,1
6,170.0,0,1,42.0,185.5,111.0,34.92,145.0,94.0,11.48,...,2.94,4.0,1,0.0,1.0,0.0,57.65,252.0,517.0,1
7,39.0,0,0,0.0,60.4,158.0,10.27,306.2,120.0,26.03,...,3.35,1.0,0,1.0,0.0,0.0,45.23,327.0,502.9,0
8,94.0,0,0,0.0,262.2,105.0,48.1,216.6,102.0,18.22,...,3.11,3.0,1,0.0,0.0,1.0,79.4,257.0,725.9,0
9,93.0,1,0,0.0,168.4,114.0,28.63,276.0,127.0,23.46,...,3.08,1.0,0,0.0,0.0,1.0,64.0,292.0,652.0,1


In [27]:
# Separate the target (`Churn`) from the feature matrix.
y = df['Churn']
X = df.drop(columns = ['Churn'])

In [44]:
# Summary statistics per column; reset_index() turns the statistic names
# into an ordinary `index` column.
# NOTE(review): this cell's execution count (44) is out of sequence with its
# neighbours — re-run the notebook top to bottom before sharing.
df.describe().reset_index()

Unnamed: 0,index,account_length,intertiol_plan,voice_mail_plan,number_vm_messages,total_day_min,total_day_calls,total_day_charge,total_eve_min,total_eve_calls,...,total_intl_charge,customer_service_calls,Churn,location_code_452,location_code_445,location_code_547,total_charge,total_calls,total_min,no_of_plans
0,count,2312.0,2312.0,2312.0,2312.0,2312.0,2312.0,2312.0,2312.0,2312.0,...,2312.0,2312.0,2312.0,2312.0,2312.0,2312.0,2312.0,2312.0,2312.0,2312.0
1,mean,100.853374,0.12154,0.257353,7.642301,182.366453,100.734429,31.000186,201.517063,100.387543,...,2.773699,1.650952,0.248702,0.259083,0.495242,0.245675,59.953998,293.176038,595.185777,0.378893
2,std,39.1934,0.326824,0.43727,13.552296,57.567948,20.232476,9.745887,50.928594,19.530854,...,0.732983,1.429327,0.432354,0.438226,0.500086,0.43058,11.171265,31.292975,95.217311,0.546453
3,min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,42.0,...,0.0,0.0,0.0,0.0,0.0,0.0,22.93,191.0,284.3,0.0
4,25%,74.0,0.0,0.0,0.0,144.2,87.0,24.5075,165.975,87.0,...,2.32,1.0,0.0,0.0,0.0,0.0,52.4175,272.0,530.875,0.0
5,50%,101.0,0.0,0.0,0.0,180.45,102.0,30.6,202.5,101.0,...,2.78,1.0,0.0,0.0,0.0,0.0,59.53,294.0,595.5,0.0
6,75%,126.0,0.0,1.0,14.0,221.0,115.0,37.605,236.4,114.0,...,3.24,2.0,0.0,1.0,1.0,0.0,67.655,314.0,659.9,1.0
7,max,210.0,1.0,1.0,50.0,350.979402,160.0,59.64,358.5,170.0,...,4.94,9.0,1.0,1.0,1.0,1.0,96.15,386.0,885.0,2.0


In [28]:
# Hold out 30% of the rows for evaluation; random_state fixes the shuffle so
# the split is reproducible.
# NOTE(review): these imports would be easier to find in the first cell.
from sklearn.model_selection import train_test_split
from sklearn import metrics
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [29]:
# AdaBoost baseline with library defaults, scored by accuracy on the
# held-out split.
from sklearn.ensemble import AdaBoostClassifier
model2 = AdaBoostClassifier()
# Defaults noted by the original author: n_estimators = 50 and a
# DecisionTreeClassifier base estimator.
# NOTE(review): no random_state is set, so the score may vary between runs.
model2.fit(X_train,y_train)
preds = model2.predict(X_test)
metrics.accuracy_score(y_test, preds)

0.8746397694524496

In [30]:
# XGBoost with library defaults, scored by accuracy on the held-out split.
# `metrics` is re-imported here although an earlier cell already imports it.
from xgboost import XGBClassifier
from sklearn import metrics
model3 = XGBClassifier()
model3.fit(X_train, y_train)
preds = model3.predict(X_test)
metrics.accuracy_score(y_test, preds)



0.9697406340057637

In [31]:
# CatBoost with 100 boosting iterations and learning rate 0.05, scored by
# accuracy on the held-out split.
from catboost import CatBoostClassifier


model4 = CatBoostClassifier(
    iterations=100, 
    learning_rate=0.05, 
    #loss_function='CrossEntropy'
)


# fit() logs one line per iteration (see the cell output).
model4.fit(X_train, y_train)

preds = model4.predict(X_test)
metrics.accuracy_score(y_test, preds)

0:	learn: 0.6370330	total: 72.2ms	remaining: 7.15s
1:	learn: 0.5920734	total: 78.4ms	remaining: 3.84s
2:	learn: 0.5461218	total: 84.3ms	remaining: 2.73s
3:	learn: 0.5077606	total: 90.6ms	remaining: 2.17s
4:	learn: 0.4827896	total: 97.1ms	remaining: 1.84s
5:	learn: 0.4599814	total: 104ms	remaining: 1.63s
6:	learn: 0.4317297	total: 110ms	remaining: 1.46s
7:	learn: 0.4086906	total: 117ms	remaining: 1.35s
8:	learn: 0.3817432	total: 123ms	remaining: 1.25s
9:	learn: 0.3646632	total: 130ms	remaining: 1.17s
10:	learn: 0.3439774	total: 136ms	remaining: 1.1s
11:	learn: 0.3306979	total: 142ms	remaining: 1.04s
12:	learn: 0.3140945	total: 149ms	remaining: 999ms
13:	learn: 0.3043804	total: 155ms	remaining: 954ms
14:	learn: 0.2923676	total: 162ms	remaining: 916ms
15:	learn: 0.2849960	total: 169ms	remaining: 886ms
16:	learn: 0.2790119	total: 175ms	remaining: 856ms
17:	learn: 0.2688333	total: 183ms	remaining: 834ms
18:	learn: 0.2642538	total: 190ms	remaining: 810ms
19:	learn: 0.2572765	total: 197ms	rem

0.962536023054755

In [32]:
# Final XGBoost model refitted on ALL rows (train + test). It is the one
# saved to disk below, and the accuracy reported earlier came from a
# different instance (model3) fitted on the training split only.
model_xg = XGBClassifier()
model_xg.fit(X, y)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [48]:
# Feature importances of the final XGBoost model, in X.columns order.
importances = model_xg.feature_importances_
# Only the last expression of a cell is displayed, so this value is discarded.
importances.size
X.columns

Index(['account_length', 'intertiol_plan', 'voice_mail_plan',
       'number_vm_messages', 'total_day_min', 'total_day_calls',
       'total_day_charge', 'total_eve_min', 'total_eve_calls',
       'total_eve_charge', 'total_night_minutes', 'total_night_calls',
       'total_night_charge', 'total_intl_minutes', 'total_intl_calls',
       'total_intl_charge', 'customer_service_calls', 'location_code_452',
       'location_code_445', 'location_code_547', 'total_charge', 'total_calls',
       'total_min', 'no_of_plans'],
      dtype='object')

In [33]:
# Final CatBoost model refitted on ALL rows, with the same hyper-parameters
# as the evaluated model4.
model_cat = CatBoostClassifier(iterations=100, 
    learning_rate=0.05,)
model_cat.fit(X, y)

0:	learn: 0.6325926	total: 5.92ms	remaining: 587ms
1:	learn: 0.5740583	total: 13ms	remaining: 639ms
2:	learn: 0.5273390	total: 18.1ms	remaining: 586ms
3:	learn: 0.4904729	total: 23.2ms	remaining: 558ms
4:	learn: 0.4570426	total: 28.8ms	remaining: 548ms
5:	learn: 0.4229577	total: 34.8ms	remaining: 545ms
6:	learn: 0.3946142	total: 40.6ms	remaining: 539ms
7:	learn: 0.3708370	total: 46.8ms	remaining: 539ms
8:	learn: 0.3496078	total: 51.8ms	remaining: 523ms
9:	learn: 0.3369240	total: 56.9ms	remaining: 512ms
10:	learn: 0.3194634	total: 62.6ms	remaining: 506ms
11:	learn: 0.3066971	total: 66.5ms	remaining: 488ms
12:	learn: 0.2960215	total: 70.5ms	remaining: 472ms
13:	learn: 0.2862050	total: 76.3ms	remaining: 469ms
14:	learn: 0.2726924	total: 81.5ms	remaining: 462ms
15:	learn: 0.2650457	total: 87.4ms	remaining: 459ms
16:	learn: 0.2578498	total: 93.3ms	remaining: 456ms
17:	learn: 0.2520021	total: 101ms	remaining: 458ms
18:	learn: 0.2470717	total: 149ms	remaining: 635ms
19:	learn: 0.2408646	total

<catboost.core.CatBoostClassifier at 0x1fd8cc002e0>

In [34]:
import joblib

In [35]:
# Persist the final XGBoost model to the working directory.
# NOTE: joblib files are pickle-based — only load them from trusted sources.
filename = 'model_xg.sav'
joblib.dump(model_xg, filename)

['model_xg.sav']

In [36]:
# Persist the final CatBoost model; `filename` is reused and now points at
# this file.
filename = 'model_cat.sav'
joblib.dump(model_cat, filename)

['model_cat.sav']

In [49]:
# Export the feature matrix (without the target and without the row index).
X.to_csv('model_data.csv' , index = False)

In [37]:
# loaded_model = joblib.load(filename)

In [38]:
# One-column table of per-column means, indexed by column name.
column_means = df.mean().reindex(df.columns)
table = column_means.to_frame(name='value')
table

Unnamed: 0,value
account_length,100.853374
intertiol_plan,0.12154
voice_mail_plan,0.257353
number_vm_messages,7.642301
total_day_min,182.366453
total_day_calls,100.734429
total_day_charge,31.000186
total_eve_min,201.517063
total_eve_calls,100.387543
total_eve_charge,17.135285


In [39]:
# For the binary / count features a mean is not a valid value, so store the
# most frequent value (mode) instead.
# - `.loc` writes into `table` itself; chained `table['value'][name] = ...`
#   assignment is not guaranteed to (and never does under Copy-on-Write).
# - `.mode()` returns a Series, so take its first entry explicitly; this also
#   works when several values tie.
for mode_col in ['intertiol_plan', 'voice_mail_plan', 'location_code_452',
                 'location_code_445', 'location_code_547', 'no_of_plans']:
    table.loc[mode_col, 'value'] = int(df[mode_col].mode().iloc[0])

In [40]:
# Display the table after the mode overrides.
table

Unnamed: 0,value
account_length,100.853374
intertiol_plan,0.0
voice_mail_plan,0.0
number_vm_messages,7.642301
total_day_min,182.366453
total_day_calls,100.734429
total_day_charge,31.000186
total_eve_min,201.517063
total_eve_calls,100.387543
total_eve_charge,17.135285


In [41]:
# table.to_pickle("model/table.pkl")

In [42]:
# Number of columns in the modelling frame (features plus the target).
len(df.columns)

25

In [50]:
# Feature names in the order in which the models were fitted.
X.columns

Index(['account_length', 'intertiol_plan', 'voice_mail_plan',
       'number_vm_messages', 'total_day_min', 'total_day_calls',
       'total_day_charge', 'total_eve_min', 'total_eve_calls',
       'total_eve_charge', 'total_night_minutes', 'total_night_calls',
       'total_night_charge', 'total_intl_minutes', 'total_intl_calls',
       'total_intl_charge', 'customer_service_calls', 'location_code_452',
       'location_code_445', 'location_code_547', 'total_charge', 'total_calls',
       'total_min', 'no_of_plans'],
      dtype='object')