In [57]:
#Importing Libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
from sklearn.preprocessing import LabelEncoder

In [58]:
# Set the working directory so the relative CSV paths used by later cells resolve.
# NOTE(review): this absolute Windows path is machine-specific; os.chdir raises
# if the directory does not exist on the machine running the notebook.
os.chdir("D:\\edWisor\\Project-I-Final")

In [59]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from matplotlib.colors import ListedColormap
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB

def DataSetPreprocessing(df):
    """Drop the unused columns and label-encode the categorical ones.

    The columns listed in ``drop_columns`` are removed, then
    'international plan', 'voice mail plan' and 'Churn' are each passed
    through ``LabelEncoding``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It must contain every column that is dropped or encoded;
        ``drop`` raises ``KeyError`` otherwise.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``drop`` (a new object, not ``df`` itself) with
        the three columns encoded.
    """
    drop_columns = ['phone number','area code','total eve minutes','total day minutes','total night minutes','total intl minutes','state']
    # `columns=` already selects the axis. The former extra `axis = 0` was
    # ignored by pandas and only suggested, wrongly, that rows were dropped.
    df = df.drop(columns=drop_columns)
    # NOTE(review): each call fits its own encoder, so train and test frames get
    # the same codes only if they contain the same set of values - confirm.
    for encoded_column in ('international plan', 'voice mail plan', 'Churn'):
        df = LabelEncoding(df=df, col_name=encoded_column)
    return df

def OneHotEncoding(df,col_name):
    """Replace ``col_name`` with one indicator column per distinct value.

    The indicator columns are named ``"<col_name> <value>"`` and are appended
    after the remaining columns of ``df``. A new frame is returned.
    """
    indicators = pd.get_dummies(df[col_name])
    indicators.columns = [col_name + " " + str(level) for level in indicators.columns]
    remaining = df.drop(col_name, axis=1)
    return pd.concat([remaining, indicators], axis=1)

def LabelEncoding(df,col_name):
    """Overwrite ``df[col_name]`` with the output of a freshly fitted LabelEncoder.

    The column is assigned on the frame that was passed in, and that same
    frame object is returned.
    """
    encoder = LabelEncoder()
    encoded_values = encoder.fit_transform(df[col_name])
    df[col_name] = encoded_values
    return df

def ModelReturnFunc(modelName,X_train,y_train):
    """Train and return the classifier selected by ``modelName``.

    Parameters
    ----------
    modelName : str
        One of 'LogisticRegression', 'KNN', 'NaiveBayes',
        'DecisionTreeClassifier' or 'RandomForestClassifier'.
    X_train, y_train
        Training features and labels, forwarded unchanged to the matching
        ``Return...Model`` builder.

    Returns
    -------
    The fitted classifier returned by the selected builder.

    Raises
    ------
    ValueError
        If ``modelName`` is not one of the supported names.
    """
    if modelName == 'LogisticRegression':
        return ReturnLogisticRegressionModel(X_train, y_train)
    if modelName == 'KNN':
        return ReturnKNNModel(X_train, y_train)
    if modelName == 'NaiveBayes':
        return ReturnNaiveBayesModel(X_train, y_train)
    if modelName == 'DecisionTreeClassifier':
        return ReturnDecisionTreeClassifierModel(X_train, y_train)
    if modelName == 'RandomForestClassifier':
        return ReturnRandomForestClassifierModel(X_train, y_train)
    # An unrecognised name used to fall through every branch and fail with an
    # UnboundLocalError on `return classifier`; fail with a clear message instead.
    raise ValueError(
        "Unknown modelName %r; expected one of 'LogisticRegression', 'KNN', "
        "'NaiveBayes', 'DecisionTreeClassifier', 'RandomForestClassifier'" % (modelName,)
    )

def ReturnScaledData(X):
    """Fit a StandardScaler on ``X`` and return the fitted scaler.

    Despite the name, no data is transformed here: callers apply
    ``.transform`` on the returned object themselves.
    """
    return StandardScaler().fit(X)

def ReturnLogisticRegressionModel(X_train,y_train):
    """Fit and return a LogisticRegression classifier built with ``random_state=0``."""
    return LogisticRegression(random_state=0).fit(X_train, y_train)

def ReturnKNNModel(X_train,y_train):
    """Fit and return a KNeighborsClassifier (5 neighbours, 'minkowski' metric, p=2)."""
    knn_settings = {'n_neighbors': 5, 'metric': 'minkowski', 'p': 2}
    return KNeighborsClassifier(**knn_settings).fit(X_train, y_train)

def ReturnDecisionTreeClassifierModel(X_train,y_train):
    """Fit and return a DecisionTreeClassifier using the 'entropy' criterion and ``random_state=0``."""
    tree_settings = {'criterion': 'entropy', 'random_state': 0}
    return DecisionTreeClassifier(**tree_settings).fit(X_train, y_train)

def ReturnRandomForestClassifierModel(X_train,y_train):
    """Fit and return a RandomForestClassifier (500 trees, 'entropy' criterion, ``random_state=0``)."""
    forest_settings = {'n_estimators': 500, 'criterion': 'entropy', 'random_state': 0}
    return RandomForestClassifier(**forest_settings).fit(X_train, y_train)

def ReturnNaiveBayesModel(X_train,y_train):
    """Fit and return a GaussianNB classifier constructed with its default arguments."""
    return GaussianNB().fit(X_train, y_train)

def PredictFromModel(classifier,X_test):
    """Return ``classifier.predict(X_test)`` unchanged."""
    return classifier.predict(X_test)

def ReturnConfusionMatrix(y_pred,y_test):
    """Return ``confusion_matrix`` for the given labels.

    This function takes the predictions first, but ``confusion_matrix`` is
    called with ``y_test`` as its first argument and ``y_pred`` as its second.
    """
    return confusion_matrix(y_true=y_test, y_pred=y_pred)

def ReturnPerformanceDataFrame(df,modelName, cm):
    """Add one row of metrics for ``modelName`` to the performance table.

    ``cm`` is read as a 2x2 layout ``[[TN, FP], [FN, TP]]``; anything that
    supports ``cm[i][j]`` indexing (nested lists, a NumPy array) works.

    Parameters
    ----------
    df : pandas.DataFrame
        Table collected so far; may be an empty ``pd.DataFrame()``.
    modelName : str
        Value stored in the "ModelName" column. If a row with this name is
        already present, no row is added.
    cm : 2x2 indexable
        Confusion-matrix counts.

    Returns
    -------
    pandas.DataFrame
        The table with a fresh 0..n-1 index and the columns "ModelName",
        "Accuracy", "True Positive Rate", "Precision" and "Prevalence".
        A metric whose denominator is zero is reported as NaN.
    """
    def _ratio(numerator, denominator):
        # An empty denominator means the metric is undefined. Report NaN
        # instead of raising ZeroDivisionError on plain Python ints.
        return numerator / denominator if denominator != 0 else float('nan')

    TN, FP = cm[0][0], cm[0][1]
    FN, TP = cm[1][0], cm[1][1]
    Total = TN + TP + FP + FN

    new_row = pd.DataFrame(
        [[
            modelName,
            _ratio(TP + TN, Total),   # accuracy
            _ratio(TP, TP + FN),      # sensitivity / true positive rate
            _ratio(TP, TP + FP),      # precision
            _ratio(TP + FN, Total),   # prevalence
        ]],
        columns=["ModelName","Accuracy","True Positive Rate","Precision","Prevalence"],
    )

    # Vectorised duplicate check; also tolerates a frame without the column,
    # where the former row-by-row scan raised KeyError.
    already_recorded = 'ModelName' in df.columns and bool((df['ModelName'] == modelName).any())
    if not already_recorded:
        # DataFrame.append no longer exists in pandas 2.x, so use pd.concat.
        # An empty table is replaced outright so it cannot affect column dtypes.
        df = new_row if df.empty else pd.concat([df, new_row])
    return df.reset_index(drop=True)

In [60]:
# Import dataset: read the training and test CSVs (paths are relative to the
# current working directory).
dataset = pd.read_csv("Train_data.csv")
dataset_test = pd.read_csv("Test_data.csv")
# Start with an empty table; ReturnPerformanceDataFrame adds one metrics row per model.
performanceDF = pd.DataFrame()

In [61]:
# Drop the unused columns and label-encode the categorical ones in both splits.
# NOTE: this rebinds `dataset` / `dataset_test`, so re-running this cell without
# re-running the loading cell raises KeyError (the dropped columns are already gone).
dataset = DataSetPreprocessing(dataset)
dataset_test = DataSetPreprocessing(dataset_test)
# Select features and target by column name, so the feature matrix can never
# contain the label even if 'Churn' is not the last column of the CSV.
X_train = dataset.drop(columns='Churn').values
y_train = dataset['Churn'].values
X_test = dataset_test.drop(columns='Churn').values
y_test = dataset_test['Churn'].values

In [8]:
#from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(X, y,
#                                                    stratify=y, 
#                                                    test_size=0.25)

# Logistic Regression

In [49]:
# Fit the scaler on the training features only, then apply that same transform
# to both the training and the test features.
scaledObject = ReturnScaledData(X_train) 
X_train_Scaled =scaledObject.transform(X_train) 
X_test_Scaled = scaledObject.transform(X_test) 
# Train logistic regression on the scaled features and predict the test split.
classifier = ModelReturnFunc('LogisticRegression',X_train_Scaled,y_train) 
y_pred = PredictFromModel(classifier,X_test_Scaled) 
# Note the argument order: ReturnConfusionMatrix takes the predictions first.
confusionMatrix = ReturnConfusionMatrix(y_pred,y_test)
# Record this model's metrics; a model name that is already present is not added again.
performanceDF = ReturnPerformanceDataFrame(performanceDF,'LogisticRegression',confusionMatrix)

In [50]:
# Display the metrics table collected so far.
performanceDF

Unnamed: 0,ModelName,Accuracy,True Positive Rate,Precision,Prevalence
0,LogisticRegression,0.871026,0.191964,0.558442,0.134373


# KNN

In [51]:
# Re-fit the scaler on the training features and apply it to both splits
# (the same steps as the logistic-regression cell, so this cell runs on its own).
scaledObject = ReturnScaledData(X_train) 
X_train_Scaled =scaledObject.transform(X_train) 
X_test_Scaled = scaledObject.transform(X_test) 
# Train KNN on the scaled features and predict the test split.
classifier = ModelReturnFunc('KNN',X_train_Scaled,y_train) 
y_pred = PredictFromModel(classifier,X_test_Scaled) 
confusionMatrix = ReturnConfusionMatrix(y_pred,y_test)
# Record this model's metrics in the shared table.
performanceDF = ReturnPerformanceDataFrame(performanceDF,'KNN',confusionMatrix)

# Naive Bayes

In [52]:
# Naive Bayes is trained and evaluated on the unscaled arrays (no scaler step in this cell).
classifier = ModelReturnFunc('NaiveBayes',X_train,y_train) 
y_pred = PredictFromModel(classifier,X_test) 
confusionMatrix = ReturnConfusionMatrix(y_pred,y_test)
# Record this model's metrics in the shared table.
performanceDF = ReturnPerformanceDataFrame(performanceDF,'NaiveBayes',confusionMatrix)

# Decision Tree Classifier

In [53]:
# The decision tree is trained and evaluated on the unscaled arrays.
classifier = ModelReturnFunc('DecisionTreeClassifier',X_train,y_train) 
y_pred = PredictFromModel(classifier,X_test) 
confusionMatrix = ReturnConfusionMatrix(y_pred,y_test)
# Record this model's metrics in the shared table.
performanceDF = ReturnPerformanceDataFrame(performanceDF,'DecisionTreeClassifier',confusionMatrix)

# Random Forest Classifier

In [63]:
# The random forest is trained and evaluated on the unscaled arrays.
classifier = ModelReturnFunc('RandomForestClassifier',X_train,y_train) 
y_pred = PredictFromModel(classifier,X_test)
# Keep these predictions under a separate name; a later cell exports them to CSV.
Churn_predicted = y_pred
confusionMatrix = ReturnConfusionMatrix(y_pred,y_test)
performanceDF = ReturnPerformanceDataFrame(performanceDF,'RandomForestClassifier',confusionMatrix)
# reset_index returns a new frame that is not assigned, so this line only
# displays the table as the cell output.
performanceDF.reset_index(drop=True)

Unnamed: 0,ModelName,Accuracy,True Positive Rate,Precision,Prevalence
0,RandomForestClassifier,0.961008,0.727679,0.976048,0.134373


In [74]:
# Convert the 0/1 predictions into the ' False.' / ' True.' strings and export them.
Churn_predicted = pd.DataFrame(Churn_predicted)
Churn_predicted.columns = ['Churn']
# Map both codes in a single, non-chained step. The former
# `Churn_predicted.Churn[mask] = ...` chained assignment triggers
# SettingWithCopyWarning and does not update the frame under pandas copy-on-write.
# Values other than 0 and 1 are left untouched, as before.
Churn_predicted['Churn'] = Churn_predicted['Churn'].replace({0: ' False.', 1: ' True.'})
Churn_predicted.to_csv("Churn_Predicted (Python).csv",index=False)

In [56]:
# Write the per-model metrics table to CSV without the index column.
performanceDF.to_csv("Performance of models in the given Test_data (Python).csv",index=False)