In [None]:
"""# Import Libraries"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

In [None]:
# Silence every Python warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides pandas/sklearn deprecation and
# chained-assignment warnings raised by later cells — consider narrowing it.
import warnings
warnings.filterwarnings('ignore') 

In [None]:
"""# Import Datasets"""
# Read the training and test CSVs from the Kaggle input directory.
df_train = pd.read_csv('/kaggle/input/cs-3110-mini-project/train.csv')
df_test = pd.read_csv('/kaggle/input/cs-3110-mini-project/test.csv')

In [None]:

"""# Define Functions"""

def set_outliers_to_nan(p , feature_boundries):
    """Return a copy of ``p`` with out-of-range values replaced by NaN.

    Parameters
    ----------
    p : pandas.DataFrame
        Input frame; it is copied and never modified.
    feature_boundries : dict
        Maps a column name to a ``[lower, upper]`` pair. Values strictly below
        ``lower`` or strictly above ``upper`` become NaN; the bounds themselves
        are kept.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.

    Side effects
    ------------
    Prints each feature name with its two bounds.
    """
    cleaned = p.copy()

    for feature, bounds in feature_boundries.items():
        lower_bound, upper_bound = bounds[0], bounds[1]
        print(feature , lower_bound , upper_bound)

        below_range = cleaned[feature] < lower_bound
        cleaned.loc[below_range, feature] = np.nan

        above_range = cleaned[feature] > upper_bound
        cleaned.loc[above_range, feature] = np.nan

    return cleaned

def isNaN(num):
    """Return True when ``num`` is not a finite number.

    ``num`` is converted with ``float()`` first, so numeric strings are
    accepted. NaN yields True; note that positive and negative infinity also
    yield True because they fail the strict range check.
    """
    value = float(num)
    is_finite = float('-inf') < value < float('inf')
    return not is_finite

def linear_regression_imputation_to_nan(p, corr_features):
    """Fill missing values in pairs of linearly related columns.

    For every pair ``(f1, f2)`` in ``corr_features``:

    1. Rows where both values are missing receive the column medians
       (computed before any filling).
    2. Rows where only one value is missing receive a prediction from a
       one-feature ``LinearRegression`` fitted on the rows where both values
       were originally observed.

    Parameters
    ----------
    p : pandas.DataFrame
        Input frame; it is copied and never modified.
    corr_features : iterable of sequences
        Each element holds two column names, ``[f1, f2]``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``p`` with the listed columns imputed. A pair with no fully
        observed row is left with whatever the median step produced, because
        no regression can be fitted.
    """
    df = p.copy()
    for c_f in corr_features:
        f1 = c_f[0]
        f2 = c_f[1]

        # Training data for both regressions: rows observed in both columns,
        # taken before the median fill so synthetic points never enter the fit.
        observed = df.dropna(subset=[f1, f2])

        # Compute the mask and both medians once. Re-evaluating the mask after
        # f1 was filled made it empty, so f2 never received its median.
        both_missing = df[f1].isnull() & df[f2].isnull()
        if both_missing.any():
            f1_median = df[f1].median()
            f2_median = df[f2].median()
            df.loc[both_missing, f1] = f1_median
            df.loc[both_missing, f2] = f2_median

        if observed.empty:
            # Nothing to fit on; fitting would raise on an empty sample.
            continue

        # Vectorised .loc writes replace the per-row chained assignment
        # (df[col][i] = ...), which may write to a temporary copy.
        f1_missing = df[f1].isnull()
        if f1_missing.any():
            f1_from_f2 = LinearRegression().fit(observed[[f2]], observed[f1])
            df.loc[f1_missing, f1] = f1_from_f2.predict(df.loc[f1_missing, [f2]])

        f2_missing = df[f2].isnull()
        if f2_missing.any():
            f2_from_f1 = LinearRegression().fit(observed[[f1]], observed[f2])
            df.loc[f2_missing, f2] = f2_from_f1.predict(df.loc[f2_missing, [f1]])

    return df

def categorical_imputation_by_most_frequent_value(p, features):
    """Fill missing categorical values with the most frequent training value.

    Parameters
    ----------
    p : pandas.DataFrame
        Input frame; it is copied and never modified.
    features : iterable of str
        Columns to fill.

    Returns
    -------
    pandas.DataFrame
        Copy of ``p`` with NaNs in ``features`` replaced.

    Notes
    -----
    The replacement value is always taken from the module-level ``df_train``
    (as it exists at call time), not from ``p``.
    """
    filled = p.copy()
    for f in features:
        observed = df_train.loc[df_train[f].notnull(), f]
        most_common = observed.value_counts().idxmax()
        filled[f] = filled[f].fillna(most_common)
    return filled

def nan_imputation_by_median(p, features):
    """Return a copy of ``p`` whose NaNs in ``features`` are set to the column median.

    The median of each column is computed on the copy itself (pandas skips
    NaN when computing it); ``p`` is left untouched.
    """
    imputed = p.copy()
    for feature in features:
        column = imputed[feature]
        missing_rows = column.isnull()
        imputed.loc[missing_rows, feature] = column.median()
    return imputed

def show_correlations(dataframe, show_chart = True):
    """Compute the pairwise correlation matrix of ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are correlated via ``DataFrame.corr()``.
    show_chart : bool, default True
        When true, also draw the matrix as an annotated seaborn heatmap on a
        new 20x10 inch figure.

    Returns
    -------
    pandas.DataFrame
        The correlation matrix.
    """
    corr = dataframe.corr()
    if show_chart:
        # Create the figure only when something is drawn; previously an empty
        # figure was opened even when show_chart was False.
        _, ax = plt.subplots(figsize = (20,10))
        sns.heatmap(corr,
                    xticklabels=corr.columns.values,
                    yticklabels=corr.columns.values,
                    annot=True,
                    ax=ax)
    return corr


In [None]:
"""# Define Variables"""

# Categorical columns including the 'Churn' column.
# NOTE(review): cat_col does not appear to be used by any later cell shown here.
cat_col = ['location_code' , 'intertiol_plan' , 'voice_mail_plan' , 'Churn']

# Numeric columns; negative values in these are replaced by NaN in the
# "Handle Invalid Data" cell.
num_col = ['account_length',
             'number_vm_messages',
             'total_day_min',
             'total_day_calls',
             'total_day_charge',
             'total_eve_min',
             'total_eve_calls',
             'total_eve_charge',
             'total_night_minutes',
             'total_night_calls',
             'total_night_charge',
             'total_intl_minutes',
             'total_intl_calls',
             'total_intl_charge',
             'customer_service_calls']

# Accepted [lower, upper] range per numeric column; set_outliers_to_nan turns
# values outside the range into NaN (the bounds themselves are kept).
feature_boundries = {'account_length': [0, 210],
                    'number_vm_messages': [0, 50],
                    'total_day_min': [0, 350],
                    'total_day_calls': [0, 160],
                    'total_day_charge': [0, 60],
                    'total_eve_min': [0, 400],
                    'total_eve_calls': [40, 170],
                    'total_eve_charge': [3, 31],
                    'total_night_minutes': [23, 400],
                    'total_night_calls': [30, 175],
                    'total_night_charge': [0, 20],
                    'total_intl_minutes': [0, 20],
                    'total_intl_calls': [0, 18],
                    'total_intl_charge': [0, 5],
                    'customer_service_calls': [0, 9]}

# Column pairs imputed from each other by linear_regression_imputation_to_nan.
correlated_feature_couple = [['total_eve_charge', 'total_eve_min'],
                   ['total_night_charge', 'total_night_minutes'],
                   ['total_intl_charge', 'total_intl_minutes'],
                   ['total_day_charge', 'total_day_min']]

# Categorical feature columns (cat_col without 'Churn'); filled with the most
# frequent value.
cat_col1 = ['location_code' , 'intertiol_plan' , 'voice_mail_plan']

# Numeric columns that are not part of a correlated pair; filled with the median.
non_corr_numer_col = ['account_length',
             'number_vm_messages',
             'total_day_calls',
             'total_eve_calls',
             'total_night_calls',
             'total_intl_calls',
             'customer_service_calls']

In [None]:
"""# Drop Unwanted Columns"""

# Remove the 'Unnamed: N' columns from both frames.
# errors='ignore' makes the drop a no-op when a column is absent.
df_train = df_train.drop(columns=['Unnamed: 20'] , errors='ignore')

df_test = df_test.drop(columns=['Unnamed: 20' , 'Unnamed: 19'] , errors='ignore')

In [None]:
"""# Drop Duplicates"""

# Look for repeated rows while ignoring customer_id, which would otherwise
# make every row unique.
df1 = df_train.drop(columns=['customer_id'])

# True for every row that repeats an earlier one (the first occurrence is kept).
duplicates = df1.duplicated()

# Take the index labels straight from the mask. The previous hard-coded
# 0..2320 scan raised KeyError on a shorter frame and silently missed
# duplicates on a longer one.
dup_index = duplicates[duplicates].index.tolist()

df_train = df_train.drop(labels=dup_index, axis=0)

In [None]:
"""# Handle Invalid Data"""

# Negative values are invalid for these numeric columns: replace them with NaN
# so the imputation steps can fill them. np.nan is used because the np.NaN
# alias no longer exists in NumPy 2.0. One loop covers both frames.
for frame in (df_train, df_test):
  for column in num_col:
    frame[column] = np.where(frame[column] < 0, np.nan , frame[column])

In [None]:
"""# Handle Outliers"""

# Replace values outside the configured [lower, upper] ranges with NaN in both
# frames; the NaNs are filled by the imputation cell.
df_train = set_outliers_to_nan(df_train , feature_boundries)

df_test = set_outliers_to_nan(df_test , feature_boundries)

In [None]:
"""# Handle Missing Values"""

# Training set: regression-based fill for the correlated pairs, most frequent
# value for categorical columns, median for the remaining numeric columns.
df_train = linear_regression_imputation_to_nan(df_train , correlated_feature_couple)

df_train = categorical_imputation_by_most_frequent_value(df_train , cat_col1)

df_train = nan_imputation_by_median(df_train , non_corr_numer_col)

# Rows with a missing Churn value are dropped, not imputed.
df_train = df_train[~df_train['Churn'].isnull()]

# Test set: same three steps. The regression and median steps use statistics
# of df_test itself, while the categorical step reads its fill value from the
# global df_train inside the helper.
df_test = linear_regression_imputation_to_nan(df_test , correlated_feature_couple)

df_test = categorical_imputation_by_most_frequent_value(df_test , cat_col1)

df_test = nan_imputation_by_median(df_test , non_corr_numer_col)


In [None]:
# Work on copies while building the one-hot columns for location_code.
df_train_onehot = df_train.copy()
df_test_onehot = df_test.copy()

In [None]:
# Fit the encoder on the training location codes and keep the dense result.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros.
# The resulting frame has a fresh 0..n-1 index and integer column labels.
enc = OneHotEncoder(handle_unknown='ignore')
enc_df_train = pd.DataFrame(enc.fit_transform(df_train_onehot[['location_code']]).toarray())

In [None]:
# Encode the test set with the encoder already fitted on the training data.
# Calling fit_transform here would refit on the test set, so the column order
# (and even the number of columns) could differ from the training encoding.
enc_df_test = pd.DataFrame(enc.transform(df_test_onehot[['location_code']]).toarray())

In [None]:
# Names given to encoder output columns 0, 1 and 2 (in that order).
# NOTE(review): OneHotEncoder orders its output by sorted category, so these
# names may not match the actual categories — verify against enc.categories_.
onehot_col = ['location_code_452' , 'location_code_445' ,'location_code_547']

In [None]:
# Insert the three encoded columns at positions 2-4 of the training copy.
# .values passes raw arrays, so rows are matched by position, not by index
# label. The trailing False is insert()'s allow_duplicates argument.
df_train_onehot.insert(2 , 'location_code_452' , enc_df_train[0].values , False)
df_train_onehot.insert(3 , 'location_code_445' , enc_df_train[1].values , False)
df_train_onehot.insert(4 , 'location_code_547' , enc_df_train[2].values , False)

In [None]:
# Insert the three encoded columns at positions 2-4 of the test copy.
# .values passes raw arrays, so rows are matched by position, not by index
# label. The trailing False is insert()'s allow_duplicates argument.
df_test_onehot.insert(2 , 'location_code_452' , enc_df_test[0].values , False)
df_test_onehot.insert(3 , 'location_code_445' , enc_df_test[1].values , False)
df_test_onehot.insert(4 , 'location_code_547' , enc_df_test[2].values , False)

In [None]:
# Copy the one-hot columns back into df_train (they are appended as new columns).
df_train[onehot_col] = df_train_onehot[onehot_col]

In [None]:
# Copy the one-hot columns back into df_test (they are appended as new columns).
df_test[onehot_col] = df_test_onehot[onehot_col]

In [None]:
# The raw location_code column is replaced by its one-hot columns.
df_train = df_train.drop(columns=['location_code'])

In [None]:
# The raw location_code column is replaced by its one-hot columns.
df_test = df_test.drop(columns=['location_code'])

In [None]:
# Encode the two plan flags and the target as 1/0. Any value that is not a key
# of the mapping becomes NaN.
df_train['voice_mail_plan'] = df_train['voice_mail_plan'].map({'yes': 1, 'no': 0})
df_train['intertiol_plan'] = df_train['intertiol_plan'].map({'yes': 1, 'no': 0})
df_train['Churn'] = df_train['Churn'].map({'Yes': 1, 'No': 0})

In [None]:
# Encode the two plan flags of the test set as 1/0. Any value that is not a key
# of the mapping becomes NaN.
df_test['voice_mail_plan'] = df_test['voice_mail_plan'].map({'yes': 1, 'no': 0})
df_test['intertiol_plan'] = df_test['intertiol_plan'].map({'yes': 1, 'no': 0})

In [None]:
# Feature columns: every training column except the identifier and the target.
col = df_train.columns.tolist()
for excluded in ('customer_id', 'Churn'):
    col.remove(excluded)
col

In [None]:
# Correlation heatmap of the feature columns only (identifier and target excluded).
correlation_df = show_correlations(df_train[col],show_chart=True)

In [None]:
# from sklearn.decomposition import PCA
# pca = PCA(n_components=1)

# pca.fit(df_train[['number_vm_messages' , 'voice_mail_plan']])
# X_pca5 = pca.transform(df_train[['number_vm_messages' , 'voice_mail_plan']])
# df_5p = pd.DataFrame(data = X_pca5 , columns=['PCA1'] , index=df_train.index)

# pca.fit(df_test[['number_vm_messages' , 'voice_mail_plan']])
# X_pca55 = pca.transform(df_test[['number_vm_messages' , 'voice_mail_plan']])
# df_55p = pd.DataFrame(data = X_pca55 , columns=['PCA1'] , index=df_test.index)

# df_train['PC1'] = df_5p['PCA1']
# df_test['PC1'] = df_55p['PCA1']

In [None]:
# df_train['total_intl_charge_per_min'] = df_train['total_intl_charge'] / df_train['total_intl_minutes']
# df_test['total_intl_charge_per_min'] = df_test['total_intl_charge'] / df_test['total_intl_minutes']

In [None]:
# df_train['total_night_charge_per_min']= df_train['total_night_charge'] / df_train['total_night_minutes']
# df_test['total_night_charge_per_min'] = df_test['total_night_charge'] / df_test['total_night_minutes']

In [None]:
# df_train['total_eve_charge_per_min'] = df_train['total_eve_charge'] / df_train['total_eve_min']
# df_test['total_eve_charge_per_min'] = df_test['total_eve_charge'] / df_test['total_eve_min']

In [None]:
# df_train['total_day_charge_per_min'] = df_train['total_day_charge'] / df_train['total_day_min']
# df_test['total_day_charge_per_min'] = df_test['total_day_charge'] / df_test['total_day_min']

In [None]:
# df_train['total_intl_charge_per_min'] = df_train['total_intl_charge_per_min'].fillna(0)
# df_test['total_intl_charge_per_min'] = df_test['total_intl_charge_per_min'].fillna(0)

# df_train['total_night_charge_per_min'] = df_train['total_night_charge_per_min'].fillna(0)
# df_test['total_night_charge_per_min'] = df_test['total_night_charge_per_min'].fillna(0)

# df_train['total_eve_charge_per_min']  = df_train['total_eve_charge_per_min'].fillna(0)
# df_test['total_eve_charge_per_min'] = df_test['total_eve_charge_per_min'].fillna(0)

# df_train['total_day_charge_per_min'] = df_train['total_day_charge_per_min'].fillna(0)
# df_test['total_day_charge_per_min'] = df_test['total_day_charge_per_min'].fillna(0)

In [None]:
# df_train = df_train.drop(columns=['total_intl_charge' , 'total_intl_minutes' , 'total_night_charge' , 'total_night_minutes' ,'total_eve_charge' , 'total_eve_min' , 'total_day_charge' , 'total_day_min' , 'number_vm_messages'])

In [None]:
# df_test = df_test.drop(columns=['total_intl_charge' , 'total_intl_minutes' , 'total_night_charge' , 'total_night_minutes' ,'total_eve_charge' , 'total_eve_min' , 'total_day_charge' , 'total_day_min' , 'number_vm_messages'])

In [None]:
# Aggregate features for the training set: totals across the international,
# night, evening and day periods, plus the number of subscribed plans.
df_train['total_charge'] = (
    df_train['total_intl_charge']
    .add(df_train['total_night_charge'])
    .add(df_train['total_eve_charge'])
    .add(df_train['total_day_charge'])
)

df_train['total_calls'] = (
    df_train['total_intl_calls']
    .add(df_train['total_night_calls'])
    .add(df_train['total_eve_calls'])
    .add(df_train['total_day_calls'])
)

df_train['total_min'] = (
    df_train['total_intl_minutes']
    .add(df_train['total_night_minutes'])
    .add(df_train['total_eve_min'])
    .add(df_train['total_day_min'])
)

df_train["no_of_plans"] = df_train['intertiol_plan'].add(df_train['voice_mail_plan'])

In [None]:
# Aggregate features for the test set: totals across the international,
# night, evening and day periods, plus the number of subscribed plans.
df_test['total_charge'] = (
    df_test['total_intl_charge']
    .add(df_test['total_night_charge'])
    .add(df_test['total_eve_charge'])
    .add(df_test['total_day_charge'])
)

df_test['total_calls'] = (
    df_test['total_intl_calls']
    .add(df_test['total_night_calls'])
    .add(df_test['total_eve_calls'])
    .add(df_test['total_day_calls'])
)

df_test['total_min'] = (
    df_test['total_intl_minutes']
    .add(df_test['total_night_minutes'])
    .add(df_test['total_eve_min'])
    .add(df_test['total_day_min'])
)

df_test["no_of_plans"] = df_test['intertiol_plan'].add(df_test['voice_mail_plan'])

In [None]:
# customer_id is an identifier, not a feature; remove it from both frames.
# The submission code re-reads it from the test CSV.
df_train = df_train.drop(columns=['customer_id'])
df_test = df_test.drop(columns=['customer_id'])

In [None]:
df_train.isnull().sum()

In [None]:
df_test.isnull().sum()

In [None]:
df_test

In [None]:
# Correlation heatmap of the full training frame, now including Churn and the
# aggregate features.
correlation_df = show_correlations(df_train,show_chart=True)

In [None]:
df_train.columns

In [None]:
# df_train = df_train.drop(columns=['total_day_min' ,'number_vm_messages' , 'total_eve_min' ,  'total_night_minutes' , 'total_intl_minutes'] , errors='ignore')
# df_test = df_test.drop(columns=['total_day_min' ,'number_vm_messages' , 'total_eve_min' ,  'total_night_minutes' , 'total_intl_minutes'] , errors='ignore')

In [None]:
# Models

In [None]:
# y = df_train['Churn']
# X = df_train.drop(columns = ['Churn'])

In [None]:
# Copy used for oversampling so df_train itself stays unchanged.
df_over = df_train.copy()

In [None]:
# Count each class by its label. value_counts() orders by frequency, so
# unpacking it positionally only matched these names while class 0 happened to
# be the majority.
count_class_0 = int((df_over['Churn'] == 0).sum())
count_class_1 = int((df_over['Churn'] == 1).sum())

# Divide by class
df_class_0 = df_over[df_over['Churn'] == 0]
df_class_1 = df_over[df_over['Churn'] == 1]

In [None]:
# Up-sample class 1 with replacement to the size of class 0. A fixed seed keeps
# the resampled frame, and every score computed from it, reproducible.
# NOTE(review): the train/validation split is taken after this step, so copies
# of the same row can land on both sides and inflate validation scores.
df_class_1_over = df_class_1.sample(count_class_0, replace=True, random_state=101)
df_train_over = pd.concat([df_class_0, df_class_1_over], axis=0)

print('Random over-sampling:')
print(df_train_over.Churn.value_counts())

In [None]:
# Features and target taken from the oversampled training frame.
y = df_train_over['Churn']
X = df_train_over.drop(columns=['Churn'])

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
# Hold out 30% of the oversampled rows for validation; the seed fixes the split.
# NOTE(review): X already contains resampled duplicates, so validation rows may
# also be present in the training part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [None]:
# Cast both splits to float32 and replace any remaining NaN with 0
# (np.nan_to_num default).
X_train_new = np.nan_to_num(X_train.astype(np.float32))
X_test_new = np.nan_to_num(X_test.astype(np.float32))

In [None]:
## Random Forest model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# max_features="sqrt" is what the "auto" alias meant for classifiers; "auto"
# itself is rejected by current scikit-learn releases.
model_rf = RandomForestClassifier(n_estimators=1000 , oob_score = True, n_jobs = -1,
                                  random_state =50, max_features = "sqrt",
                                  max_leaf_nodes = 30)
# Recomputed here so the cell can be run on its own: float32 arrays with NaN -> 0.
X_train_new = np.nan_to_num(X_train.astype(np.float32))
X_test_new = np.nan_to_num(X_test.astype(np.float32))

model_rf.fit(X_train_new, y_train)

# Make predictions on the validation split and report accuracy
prediction_test = model_rf.predict(X_test_new)
print (metrics.accuracy_score(y_test, prediction_test))

In [None]:
# cm = confusion_matrix(y_test, prediction_test)
# plt.figure(figsize=(7,5))
# sns.heatmap(cm, annot=True)
# plt.xlabel('Predicted')
# plt.ylabel('Truth')

In [None]:
## XG Boost

In [None]:
from xgboost import XGBClassifier
# XGBoost with default hyper-parameters, scored by accuracy on the validation split.
model3 = XGBClassifier()
model3.fit(X_train_new, y_train)
preds = model3.predict(X_test_new)
metrics.accuracy_score(y_test, preds)

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
## CatBoost

In [None]:
from catboost import CatBoostClassifier


# CatBoost trained on the training split and evaluated on the validation split.
model4 = CatBoostClassifier(
    iterations=1000, 
    learning_rate=0.05, 
    #loss_function='CrossEntropy'
)


model4.fit(X_train_new, y_train)

preds = model4.predict(X_test_new)
#metrics.accuracy_score(y_test, preds)

# NOTE(review): for 0/1 labels the mean squared error equals the fraction of
# misclassified rows (1 - accuracy), so lower is better here.
mean_squared_error(y_test, preds)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix of the validation labels against the current `preds`.
cf_matrix = confusion_matrix(y_test, preds)

In [None]:
# Captions for the four cells, in the row-major order of the flattened matrix.
group_names = ['True Neg','False Pos','False Neg','True Pos']

In [None]:
# Raw count of every confusion-matrix cell, rendered without decimals.
group_counts = [format(cell, "0.0f") for cell in cf_matrix.flatten()]


In [None]:
# Share of all validation rows that falls in each cell, as a percentage string.
group_percentages = [format(share, ".2%") for share in cf_matrix.flatten() / np.sum(cf_matrix)]


In [None]:
# One three-line annotation per cell: caption, count, percentage.
labels = ["\n".join(parts)
          for parts in zip(group_names, group_counts, group_percentages)]


In [None]:
# Arrange the four annotations in the same 2x2 layout as the matrix.
labels = np.asarray(labels).reshape(2,2)


In [None]:
# fmt='' is required because the annotations are pre-formatted strings.
sns.heatmap(cf_matrix, annot=labels, fmt='', cmap='Blues')

In [None]:
## CatBoost is selected as final model

In [None]:
# Full oversampled feature matrix and the competition test matrix as float32
# arrays with NaN replaced by 0.
# NOTE(review): this rebinds X_test, which until now held the validation
# features from train_test_split; any later code that scores X_test against
# y_test would be comparing different row sets.
# NOTE(review): arrays carry no column names — assumes df_test has the same
# column order as X; verify with X.columns.
X_new = np.nan_to_num(X.astype(np.float32))
X_test = np.nan_to_num(df_test.astype(np.float32))

In [None]:
X.columns

In [None]:
def submission(y_preds , name, test_csv='/kaggle/input/cs-3110-mini-project/test.csv'):
    """Write a submission CSV with ``customer_id`` and a Yes/No ``Churn`` column.

    Parameters
    ----------
    y_preds : array-like
        Predicted labels, one per row of the test CSV, in file order. ``1`` is
        written as 'Yes' and ``0`` as 'No'; any other value is written as is.
    name : str
        Suffix of the output file: ``"sample_submission" + name + ".csv"``,
        created in the current working directory.
    test_csv : str, optional
        CSV that supplies the ``customer_id`` column. Defaults to the Kaggle
        test file.

    Returns
    -------
    None
    """
    churn_labels = {1: 'Yes', 0: 'No'}

    # Read the ids once. Only the Churn column is relabelled, so customer_id
    # is never overwritten and does not have to be read back a second time.
    df_sub = pd.read_csv(test_csv)[['customer_id']]
    df_sub['Churn'] = y_preds
    df_sub['Churn'] = df_sub['Churn'].map(lambda value: churn_labels.get(value, value))

    file_name = "sample_submission" + name + ".csv"

    df_sub.to_csv(file_name , index=False)

In [None]:
from catboost import CatBoostClassifier


# Final CatBoost model: trained on the full oversampled matrix (X_new, y) and
# used to predict the competition test matrix held in X_test.
model4 = CatBoostClassifier(
    iterations=2500, 
    learning_rate=0.05, 
    depth=9
    #loss_function='CrossEntropy'
)


model4.fit(X_new, y)

preds = model4.predict(X_test)

In [None]:
# Start the submission frame from the customer ids of the test CSV.
df_sub = pd.read_csv('/kaggle/input/cs-3110-mini-project/test.csv')[['customer_id']]

In [None]:
# Attach the predictions; rows are matched by position with the test CSV.
df_sub['Churn'] = preds

In [None]:
# Turn the 1/0 predictions into 'Yes'/'No'. Only the Churn column is touched:
# assigning to df_sub[mask] wrote the label into every column of the matching
# rows, customer_id included. Other values are left unchanged.
df_sub['Churn'] = df_sub['Churn'].map(lambda value: {1: 'Yes', 0: 'No'}.get(value, value))

In [None]:
# Reload customer_id from the test CSV so the column holds the original ids.
df_sub['customer_id'] = pd.read_csv('/kaggle/input/cs-3110-mini-project/test.csv')[['customer_id']]['customer_id']

In [None]:
# Write the submission file without the index column.
df_sub.to_csv('sample_submission4.csv' , index=False)

In [None]:
df_sub

In [None]:
# Hyper-parameter grid for the search: 3 depths x 3 iteration counts x
# 3 learning rates = 27 CatBoost configurations.
model_params = {
     'catboost': {
        'model': CatBoostClassifier(),
        'params' : {
            'depth':[8 , 9 , 10],
            'iterations':[2000 , 3000 , 4000],
            'learning_rate':[0.01 , 0.05 , 0.1]
        }
    }
}

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Run a 3-fold grid search for every entry of model_params and collect the
# best score and parameters of each.
# NOTE(review): every grid point is fitted once per fold on X_new, so this cell
# is expensive; X_new also contains oversampled duplicates, which can leak
# across folds.
scores = []

for model_name, mp in model_params.items():
    print(model_name)
    clf =  GridSearchCV(mp['model'], mp['params'], cv=3, return_train_score=False)
    clf.fit(X_new,y)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    print('done ' , model_name)
    
# One row per tuned model.
df_tun = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df_tun

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Rebind X and y to the original (not oversampled) training frame.
y = df_train['Churn']
X = df_train.drop(columns=['Churn'])

In [None]:
# Gradient boosting fitted on the current X, y (taken from df_train, i.e. not
# oversampled despite the variable name) and used to predict df_test.
clf_over = GradientBoostingClassifier(n_estimators=400, learning_rate=0.01,max_depth=13)

clf_over.fit(X, y)

y_preds = clf_over.predict(df_test)
# Writes sample_submission_over3.csv
submission(y_preds , '_over3')

In [None]:
# NOTE(review): both names are reassigned to the scaler output in a later cell
# before these copies are read, so the copies appear to be unused.
df_scale_train = df_train.copy()
df_scale_test = df_test.copy()

In [None]:
#feature Scaling  
from sklearn.preprocessing import StandardScaler 

In [None]:
# Scaler that is fitted on the training features in the next cell.
st_x= StandardScaler()   

In [None]:
# Fit the scaler on the training features only, then apply the same
# transformation to the test features.
df_scale_train= st_x.fit_transform(X)    
df_scale_test= st_x.transform(df_test)

In [None]:
# Same gradient-boosting configuration, trained on the standardised features.
clf_over = GradientBoostingClassifier(n_estimators=400, learning_rate=0.01,max_depth=13)

clf_over.fit(df_scale_train, y)

y_preds = clf_over.predict(df_scale_test)
# Writes sample_submission_scale1.csv
submission(y_preds , '_scale1')

In [None]:
from catboost import CatBoostClassifier


# CatBoost trained on the standardised features and used to predict the
# standardised test features.
model4 = CatBoostClassifier(
    iterations=1500, 
    learning_rate=0.01, 
    depth=13
    #loss_function='CrossEntropy'
)


model4.fit(df_scale_train, y)

preds = model4.predict(df_scale_test)
# Writes sample_submission_scale6.csv
submission(preds , '_scale6')

In [None]:
from sklearn.neighbors import KNeighborsClassifier  
# 5-nearest-neighbours with Euclidean distance (Minkowski, p=2).
classifier= KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2 )  
# Use the float32 arrays of the train/validation split. X_test can no longer be
# used here: it was rebound to the competition test matrix, whose rows do not
# correspond to y_test.
classifier.fit(X_train_new, y_train)

#Predicting the validation set result
y_preds= classifier.predict(X_test_new)
#submission(preds , '_scale3')
# For 0/1 labels this is the fraction of misclassified validation rows.
mean_squared_error(y_test, y_preds)

In [None]:
from sklearn.svm import SVC # "Support vector classifier"  
classifier = SVC(kernel='linear', random_state=0)  
# Use the float32 arrays of the train/validation split. X_test can no longer be
# used here: it was rebound to the competition test matrix, whose rows do not
# correspond to y_test.
classifier.fit(X_train_new, y_train)

y_preds= classifier.predict(X_test_new)
#submission(preds , '_scale4')
# For 0/1 labels this is the fraction of misclassified validation rows.
mean_squared_error(y_test, y_preds)