In [None]:
import pandas as pd
import numpy as np
import collections as cl
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
#from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_predict

from functools import partial

%matplotlib inline

In [None]:
# Load the original dataset from the Excel workbook "Churn_original.xlsx".
# The path is relative to the notebook's working directory.
churn_original = pd.read_excel("Churn_original.xlsx")

In [None]:
# Drop the rows whose TotalCharges cell is a single blank string, then make the
# column numeric. The explicit .copy() detaches the filtered frame from the one
# it was sliced from, so the column assignment below does not raise pandas'
# SettingWithCopyWarning.
churn_original = churn_original[churn_original["TotalCharges"] != " "].copy()
churn_original["TotalCharges"] = churn_original["TotalCharges"].astype(float)

In [None]:
# Load the dataset with categorical columns from "Churn.xlsx".
churn = pd.read_excel("Churn.xlsx")
# Drop the rows whose TotalCharges cell is a single blank string. The explicit
# .copy() detaches the filtered frame from the one it was sliced from, so the
# column assignment below does not raise pandas' SettingWithCopyWarning.
churn = churn[churn["TotalCharges"] != " "].copy()
churn["TotalCharges"] = churn["TotalCharges"].astype(float)

# Data Preprocessing

In [None]:
# Standard deviation and mean of each continuous column; these are the
# statistics used to standardise the columns in the next cell.
sigmaTotalCharges = churn.TotalCharges.std()
# Fixed: this previously read churn.TotalCharges.std(), so MonthlyCharges was
# scaled by the spread of a different column.
sigmaMonthlyCharges = churn.MonthlyCharges.std()
sigmaTenure = churn.tenure.std()
meanTotalCharges = churn.TotalCharges.mean()
meanMonthlyCharges = churn.MonthlyCharges.mean()
meanTenure = churn.tenure.mean()

In [None]:
# Standardise the continuous columns: (value - mean) / standard deviation.
# NOTE: this cell overwrites the columns, so running it twice rescales the
# already-scaled values with the stored statistics.
churn["TotalCharges"] = (churn["TotalCharges"] - meanTotalCharges) / sigmaTotalCharges
churn["MonthlyCharges"] = (churn["MonthlyCharges"] - meanMonthlyCharges) / sigmaMonthlyCharges
# Fixed: tenure was divided by its mean instead of its standard deviation.
churn["tenure"] = (churn["tenure"] - meanTenure) / sigmaTenure

In [None]:
# One notched boxplot per continuous column, drawn from churn_original.
names = ["tenure", "MonthlyCharges", "TotalCharges"]
fig, ax = plt.subplots(1, 3, figsize=(15,5))
for panel, column_name in zip(ax, names):
    panel.set_title(column_name)
    column_values = np.array(churn_original[column_name])
    panel.boxplot(column_values, notch=1, showfliers=True)
plt.tight_layout()

##### Correlations

In [None]:
# Pairwise correlations between the feature columns.
# customerID is dropped BEFORE calling corr(): it is an identifier, and (assuming
# it is stored as text) pandas >= 2.0 raises when corr() meets a non-numeric column.
corr_input = churn.drop("customerID", axis=1)
churn_corr = corr_input.corr()

# One [correlation, column_a, column_b] entry per unordered pair of distinct
# columns, with column_a appearing before column_b in the frame.
feature_names = list(corr_input.columns)
corr = []
for position, c1 in enumerate(feature_names):
    for c2 in feature_names[position + 1:]:
        corr.append([round(churn_corr[c1].loc[c2], 4), c1, c2])

# Strongest positive correlations first.
corr_sorted = sorted(corr, reverse=True)
corr_sorted

*We observe that all the "No service" columns contain redundant information. Indeed, the columns flagging "No internet service" carry the same information as the column "Internet service". The same holds for the columns "Multiple lines" and "Phone service".
Thus, we can delete the columns 'TechSupportNoService', 'StreamingTVNoService', 'StreamingMoviesNoService', 'OnlineSecurityNoService', 'OnlineBackupNoService', 'DeviceProtectionnoService' and 'MultipleLinesNoService'.*

In [None]:
#Deletion of redundant columns
# The drop is done in place, so re-running this cell raises a KeyError because
# the columns are already gone.
redundant_columns = [
    'TechSupportNoService',
    'StreamingTVNoService',
    'StreamingMoviesNoService',
    'OnlineSecurityNoService',
    'OnlineBackupNoService',
    'DeviceProtectionnoService',
    'MultipleLinesNoService',
]
churn.drop(redundant_columns, axis=1, inplace=True)

In [None]:
# Pairwise correlations between the feature columns, recomputed on the current
# state of `churn`.
# customerID is dropped BEFORE calling corr(): it is an identifier, and (assuming
# it is stored as text) pandas >= 2.0 raises when corr() meets a non-numeric column.
corr_input = churn.drop("customerID", axis=1)
churn_corr = corr_input.corr()

# One [correlation, column_a, column_b] entry per unordered pair of distinct
# columns, with column_a appearing before column_b in the frame.
feature_names = list(corr_input.columns)
corr = []
for position, c1 in enumerate(feature_names):
    for c2 in feature_names[position + 1:]:
        corr.append([round(churn_corr[c1].loc[c2], 4), c1, c2])

# Strongest positive correlations first.
corr_sorted = sorted(corr, reverse=True)
corr_sorted

##### Note about charges

In [None]:
# Correlation between the element-wise product tenure * MonthlyCharges and TotalCharges.
tenure_times_monthly = churn.tenure.multiply(churn.MonthlyCharges)
print(tenure_times_monthly.corr(churn.TotalCharges))

*We observe that the column "Total Charges" is almost equal to the product of the columns "Monthly Charges" and "Tenure". So these 3 columns also carry redundant information.*

# Observation of the values in the dataset

In [None]:
# Print how often each value occurs, column by column. The selection skips
# "tenure", the first column and the last three columns of churn_original.
inspected_columns = churn_original.drop("tenure", axis=1).columns[1:-3]
for column_name in inspected_columns:
    occurrences = churn_original[str(column_name)].value_counts()
    print(occurrences)
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
    print('\r')

# First analysis of the importance of features

### Analysis of categorical variables

In this part, we plot histograms showing, for each value of each feature, the proportion of clients who churned. This gives us a first estimate of the importance of each feature.

In [None]:
# One panel per categorical feature. In each panel the coloured bar of a category
# is the share of customers in that category who churned; the light grey bar
# behind it always reaches 1 and serves as the "100 %" reference.
print("Proportion of churns for each feature: (the light gray area is the proportion of people who didn't churn)")

columns = churn_original.drop("tenure", axis=1).columns[1:-3]
churned = churn_original[churn_original.Churn == "Yes"]

color = ["grey", "blue", "orange", "purple", "pink"]

fig, ax = plt.subplots(6, 3, figsize=(20,25))
panels = ax.ravel()  # row-major, so panels[i] is ax[i // 3][i % 3]

for i, column in enumerate(columns):
    column_name = str(column)
    panel = panels[i]
    churned_values = churned[column_name]

    # Counted once per feature instead of once per category.
    total_counts = churn_original[column_name].value_counts()
    churned_counts = churned_values.value_counts()

    # Per-row histogram weights: 1 / (category size) makes the bar heights sum to
    # "churned in category / all in category"; 1 / (churned in category) sums to 1.
    weights = np.ones(churned.shape[0])
    weights1 = np.ones(churned.shape[0])
    for value in churned_counts.index:
        in_category = (churned_values == value).to_numpy()
        weights[in_category] = 1 / total_counts.loc[value]
        weights1[in_category] = 1 / churned_counts.loc[value]

    panel.hist(churned_values, color="#f2f2f2", weights=weights1)
    panel.hist(churned_values, color=color[i % 5], weights=weights)

    panel.set_title(column_name)
    panel.set_xlabel("Values of the feature")
    panel.set_ylabel("Proportion of churns")

# Hide the grid cells that have no feature to show.
for spare_panel in panels[len(columns):]:
    spare_panel.set_visible(False)

plt.tight_layout()


*This first analysis reveals several things.<br>
First, some features seem to have very little influence on the churn, such as the gender, the multiple lines and the phone service.
Other features seem to be very important, like the tech support, the online security and the contract.<br><br>
Three of the payment methods look similar, but the electronic check leads to many more churns than the others.*

### Analysis of the tenure

In [None]:
# Independent copy of the two columns needed for the tenure analysis.
tenure_analysis_columns = ["tenure", "Churn"]
tenure = churn_original[tenure_analysis_columns].copy()

In [None]:
# Churn labels split into five tenure ranges: <=6, (6, 20], (20, 40], (40, 60], >60.
tenure_length = tenure.tenure
tenure1 = tenure[tenure_length <= 6].Churn
tenure2 = tenure[(tenure_length > 6) & (tenure_length <= 20)].Churn
tenure3 = tenure[(tenure_length > 20) & (tenure_length <= 40)].Churn
tenure4 = tenure[(tenure_length > 40) & (tenure_length <= 60)].Churn
tenure5 = tenure[tenure_length > 60].Churn

tenures = [group.values for group in (tenure1, tenure2, tenure3, tenure4, tenure5)]

In [None]:
# Boolean membership flag for each tenure range, stored as columns of `tenure`.
tenure_length = tenure.tenure
tenure['tenure6'] = tenure_length <= 6
tenure['tenure6_20'] = (tenure_length > 6) & (tenure_length <= 20)
tenure['tenure20_40'] = (tenure_length > 20) & (tenure_length <= 40)
tenure['tenure40_60'] = (tenure_length > 40) & (tenure_length <= 60)
tenure['tenure60'] = tenure_length > 60

# The ranges are mutually exclusive, so this weighted sum is the range number 1..5.
tenure['tenure_step'] = (
    tenure['tenure6']
    + 2 * tenure['tenure6_20']
    + 3 * tenure['tenure20_40']
    + 4 * tenure['tenure40_60']
    + 5 * tenure['tenure60']
)

# Range numbers of the churned and of the retained customers, for the histogram.
has_churned = tenure.Churn == 'Yes'
has_stayed = tenure.Churn == 'No'
tenure_step_churn = np.array(tenure.loc[has_churned, 'tenure_step'])
tenure_step_not_churn = np.array(tenure.loc[has_stayed, 'tenure_step'])
churn_graph = [tenure_step_churn, tenure_step_not_churn]

In [None]:
# Grouped bar histogram: number of churned / retained clients per tenure range.
fig, ax = plt.subplots(figsize=(10,5))

colors = ['blue', 'orange']
range_labels = ["Tenure <= 6","6 < tenure <= 20","20 < Tenure <= 40","40 < tenure <= 60","Tenure > 60"]
# Centres of the 5 equal-width bins that hist() builds over the values 1..5.
bin_centres = [1.4,2.2,3,3.8,4.6]

ax.hist(churn_graph, 5, histtype='bar', label=["churn", "no churn"],  color=colors)
ax.legend()
ax.set_xticks(bin_centres)
ax.set_xticklabels(range_labels)
ax.set_ylabel("Number of clients")

*We observe that people who have been clients for a long time are less likely to churn.*

# Machine learning

Now, further analysis...

In [None]:
# Simple positional hold-out split: the first 6000 rows train the model and
# rows 6000-6999 evaluate it. The test slice used to start at 6001, which left
# row 6000 in neither set. Rows from position 7000 onwards are still unused,
# as before.
churn_features = churn.drop("customerID", axis=1)
train = churn_features.iloc[:6000]
test = churn_features.iloc[6000:7000]

In [None]:
def mean_score(x):
    return np.array(x).T[0].sum() / len(x)

# 8-fold cross-validation of the three models.
# NOTE: this cell calls predict_random_forest, predict_random_forest_proba and
# predict_log_reg, which are defined in cells further down the notebook; those
# cells must be executed before this one.

# DataFrame.sample returns a NEW frame. The original call discarded the result,
# so the folds were built from unshuffled rows. A fixed seed keeps the folds
# identical between runs.
churn_shuffled = churn.sample(frac=1, random_state=42).drop("customerID", axis=1)

n = 8
chunk_size = len(churn_shuffled) // n
random_forest = []
random_forest_proba = []
log_reg = []
for i in range(n):
    test_low = i * chunk_size
    test_high = (i + 1) * chunk_size
    # Fold i is the test set; every other row (including the remainder rows past
    # n * chunk_size) is training data.
    test = churn_shuffled.iloc[test_low:test_high]
    train = pd.concat([churn_shuffled.iloc[:test_low], churn_shuffled.iloc[test_high:]])
    random_forest.append(predict_random_forest(train, test))
    random_forest_proba.append(predict_random_forest_proba(train, test))
    log_reg.append(predict_log_reg(train, test))

# Mean accuracy per model over the folds.
(mean_score(random_forest), mean_score(random_forest_proba), mean_score(log_reg))

In [None]:
# Take the logistic-regression result of the first fold and apply its predictor
# to the features of the current `test` frame.
score, predictor = log_reg[0]
test_features = test.drop("Churn", axis=1)
predictor(test_features)

In [None]:
def predict_random_forest(train, test, random_state=None):
    """Fit a random forest on ``train`` and score it on ``test``.

    Parameters
    ----------
    train, test : DataFrame
        Feature columns plus the target column "Churn".
    random_state : optional
        Forwarded to ``RandomForestClassifier``; the default ``None`` keeps the
        previous unseeded behaviour.

    Returns
    -------
    (score, predict) : tuple
        ``classifier.score`` on the test set and the fitted classifier's
        ``predict`` method.
    """
    classifier = RandomForestClassifier(random_state=random_state)
    classifier.fit(train.drop("Churn", axis=1), train.Churn)
    # The former extra classifier.predict(...) call was removed: its result was
    # never used, score() already predicts internally.
    classifier_score = classifier.score(test.drop("Churn", axis=1), test.Churn)
    return (classifier_score, classifier.predict)

In [None]:
def random_forest_proba_classifier(x, classifier, split_value):
    """Label samples by thresholding the classifier's churn probability.

    Parameters
    ----------
    x : array-like
        Feature rows, passed unchanged to ``classifier.predict_proba``.
    classifier : object
        Fitted model exposing ``predict_proba``.
    split_value : float
        Probabilities strictly above this value are labelled 1, all others 0.

    Returns
    -------
    numpy.ndarray
        One float label (0.0 or 1.0) per row of ``x``.

    Notes
    -----
    ``predict_proba`` returns one column per class. The previous version
    thresholded the whole matrix and therefore returned a two-column array.
    Only column 1 is used here, the same column predict_random_forest_proba
    uses to derive ``split_value``.
    """
    churn_probability = np.asarray(classifier.predict_proba(x))[:, 1]
    return np.where(churn_probability > split_value, 1.0, 0.0)

def predict_random_forest_proba(train, test, random_state=None):
    """Fit a random forest and pick a probability cutoff from the class balance.

    The cutoff is chosen so that the share of test rows labelled 1 is about the
    share of rows with ``Churn == 1`` in ``train``.

    Parameters
    ----------
    train, test : DataFrame
        Feature columns plus the numeric target column "Churn".
    random_state : optional
        Forwarded to ``RandomForestClassifier``; the default ``None`` keeps the
        previous unseeded behaviour.

    Returns
    -------
    (score, predictor) : tuple
        Accuracy of the thresholded labels on ``test`` and a callable
        ``predictor(x)`` applying the same cutoff with the fitted classifier.

    Notes
    -----
    Changes from the previous version:
    * the scan over the sorted probabilities skipped the first bucket and could
      index past the last one;
    * the churn share was used as the cutoff for the rows labelled 0, so about
      the non-churn share of rows ended up labelled 1;
    * the class balance was read from the global ``churn`` frame (test rows
      included); it is now taken from ``train``.
    """
    classifier = RandomForestClassifier(random_state=random_state)
    classifier.fit(train.drop("Churn", axis=1), train.Churn)

    # Column 1 of predict_proba is used as the churn probability.
    churn_probability = classifier.predict_proba(test.drop("Churn", axis=1))[:, 1]

    # Distinct probability values in ascending order and the number of test
    # rows carrying each of them.
    keys, values = np.unique(churn_probability, return_counts=True)

    # Number of test rows that should be labelled 0 to reproduce the class balance.
    churn_rate = (train.Churn == 1).mean()
    expected_non_churn = (1 - churn_rate) * len(churn_probability)

    # First bucket at which the running row count reaches that number.
    cumulative = np.cumsum(values)
    index = min(int(np.searchsorted(cumulative, expected_non_churn)), len(keys) - 1)
    split_value = keys[index]

    prediction = np.where(churn_probability > split_value, 1.0, 0.0)
    score = (prediction == np.array(test.Churn)).sum() / len(test)
    return (score, partial(random_forest_proba_classifier, classifier=classifier, split_value=split_value))

In [None]:
def predict_log_reg(train, test):
    """Fit a logistic regression on ``train`` and measure its accuracy on ``test``.

    Both frames hold the feature columns plus the target column "Churn".

    Returns a tuple ``(score, predict)``: the accuracy on ``test`` and the
    fitted model's ``predict`` method.
    """
    model = LogisticRegression()
    model.fit(train.drop("Churn", axis=1), train.Churn)

    predicted = model.predict(test.drop("Churn", axis=1))
    accuracy = accuracy_score(test.Churn, predicted)
    return (accuracy, model.predict)

In [None]:
# Target vector and feature matrix (the identifier and the target are excluded
# from the features).
y = churn['Churn']
X = churn.drop(columns=['Churn', 'customerID'])

In [None]:
# 80 % / 20 % random split. The seed is fixed so that the split, and every
# score computed from it below, is the same on each run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Logistic regression with default settings, fitted on the training split.
logisticRegr = LogisticRegression()
logisticRegr.fit(X_train,y_train)

In [None]:
# Predicted labels for the held-out split.
pred = logisticRegr.predict(X_test)

In [None]:
# Accuracy of the predicted labels against the held-out targets.
accuracy_score(y_test, pred)

In [None]:
# Per-class predicted probabilities for the held-out split.
pred_proba = logisticRegr.predict_proba(X_test)