In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

In [None]:
# Read the original churn workbook into a DataFrame; later cells use it for the
# per-feature value counts and the churn-proportion histograms.
churn_original = pd.read_excel(io="Churn_original.xlsx")

In [None]:
# Load dataset with categorical columns.
# NOTE(review): presumably this is the numerically encoded version of the
# original workbook (it is fed straight into the classifier later) - confirm.
churn_raw = pd.read_excel("Churn.xlsx")

# Some rows store TotalCharges as the single-space string " " instead of a
# number; those rows are dropped so the column can be cast to float.
has_total_charges = churn_raw["TotalCharges"] != " "

# Build the cleaned frame in one expression: `.assign` returns a new DataFrame,
# so the cast is never written into a filtered slice of the loaded frame (which
# is what triggers pandas' SettingWithCopyWarning) and `churn_raw` stays intact.
churn = (
    churn_raw.loc[has_total_charges]
    .assign(TotalCharges=lambda frame: frame["TotalCharges"].astype(float))
)

# dataset: study outliers, etc.

# Observation of the values in the dataset

In [None]:
# Print how often each value occurs in every inspected column.
# The selection drops "tenure", then skips the first column and the last three
# columns by position.
inspected_columns = churn_original.drop("tenure", axis=1).columns[1:-3]
separator = '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'

for inspected in inspected_columns:
    counts = churn_original[str(inspected)].value_counts()
    print(counts)
    print(separator)
    print('\r')

# First analysis of the importance of features

### Analysis of categorical variables

In this part, we plot, for each categorical feature, the proportion of clients who churned among the clients sharing each value of that feature. This gives a first estimate of how strongly each feature is related to churn.

In [None]:
# Churn proportion per value of every inspected feature, one subplot per feature.
# In each subplot a light-grey bar of total height 1 is drawn for every value
# seen among churned clients; the coloured bar drawn over it has height
# (churned clients with that value) / (all clients with that value).
print("Proportion of churns for each feature: (the light gray aera is the proportion of people who didn't churn)")

# Drop "tenure", then skip the first column and the last three by position.
columns = churn_original.drop("tenure", axis=1).columns[1:-3]
churned = churn_original[churn_original.Churn == "Yes"]

color = ["grey", "blue", "orange", "purple", "pink"]

# Size the grid from the number of features instead of hard-coding 6 x 3, so a
# different column count neither overflows the grid nor requires manual edits.
# The figure height keeps the original 25-inch height for 6 rows.
n_cols = 3
n_rows = max(1, int(np.ceil(len(columns) / float(n_cols))))
fig, ax = plt.subplots(n_rows, n_cols, figsize=(20, 25.0 * n_rows / 6), squeeze=False)
axes = ax.ravel()  # flat view: axes[i] is the subplot of the i-th feature

for i, column in enumerate(columns):
    column_name = str(column)
    values = churned[column_name]

    # Count each value once per feature (not once per value inside a loop).
    total_counts = churn_original[column_name].value_counts()
    churned_counts = values.value_counts()

    # Each churned row contributes 1/count of its value, so the rows sharing a
    # value sum to the churn proportion (weights) or to exactly 1 (weights1).
    # Rows whose value is missing are not counted by value_counts and keep a
    # weight of 1.
    weights = (1 / values.map(total_counts)).fillna(1).values
    weights1 = (1 / values.map(churned_counts)).fillna(1).values

    current_ax = axes[i]
    current_ax.hist(values, color="#f2f2f2", weights=weights1)
    current_ax.hist(values, color=color[i % len(color)], weights=weights)

    current_ax.set_title(column_name)
    current_ax.set_xlabel("Values of the feature")
    current_ax.set_ylabel("Proportion of churns")

# Hide grid cells that have no feature to display.
for unused_ax in axes[len(columns):]:
    unused_ax.set_visible(False)

plt.tight_layout()


*We observe several things in this first analysis.<br>
First, some features seem to have very little influence on churn, such as gender, multiple lines, and phone service.
Other features seem to be very important, such as tech support, online security, and the contract type.<br><br>
Three of the payment methods look similar, but the electronic check leads to many more churns than the others.*

### Analysis of the tenure

In [None]:
# Independent copy of just the two columns needed for the tenure analysis.
tenure = churn.loc[:, ["tenure", "Churn"]].copy()

In [None]:
# Split the Churn column into five groups by tenure range:
# <= 6, (6, 20], (20, 40], (40, 60] and > 60.
months = tenure["tenure"]
churn_flag = tenure["Churn"]

tenure1 = churn_flag[months <= 6]
tenure2 = churn_flag[(months > 6) & (months <= 20)]
tenure3 = churn_flag[(months > 20) & (months <= 40)]
tenure4 = churn_flag[(months > 40) & (months <= 60)]
tenure5 = churn_flag[months > 60]

# Plain arrays, one per tenure group, in increasing-tenure order.
tenures = [group.values for group in (tenure1, tenure2, tenure3, tenure4, tenure5)]

In [None]:
# Grouped histogram of the Churn values, one series per tenure group.
tenure_labels = ["Tenure <= 6","6 < tenure <= 20","20 < Tenure <= 40","40 < tenure <= 60","Tenure > 60"]

fig, ax = plt.subplots(figsize=(14, 5))

ax.hist(tenures, label=tenure_labels)
ax.legend()

# Only the two ends of the x axis are labelled.
# NOTE(review): assumes Churn is encoded as 0 / 1 in this dataset - confirm.
ax.set_xticks([0, 1])
ax.set_xticklabels(["Didn't churn", "Churned"])
ax.set_ylabel("Number of clients")

*We observe that people who have been clients for a long time are less likely to churn.*

# Machine learning

Now, further analysis...

In [None]:
# Positional hold-out split: the first TRAIN_SIZE rows train the model and every
# remaining row is used for evaluation. Both slices share the same boundary
# index, so no row is skipped between them (the previous 0:6000 / 6001:7000
# slices dropped row 6000) and no row past position 7000 is silently discarded.
# NOTE(review): rows are taken in file order, not shuffled; if the file is
# sorted in any meaningful way the test set may not be representative.
TRAIN_SIZE = 6000

# customerID is removed from both parts before modelling.
model_data = churn.drop("customerID", axis=1)
train = model_data.iloc[:TRAIN_SIZE]
test = model_data.iloc[TRAIN_SIZE:]

# Every cleaned row must land in exactly one of the two parts.
assert len(train) + len(test) == len(churn)

In [None]:
# Fit a random forest on the training rows and predict churn for the test rows.
# A fixed random_state makes the bootstrap samples and feature sub-sampling of
# the forest repeatable, so the accuracy reported below is the same on every
# "Restart & Run All".
RANDOM_STATE = 42

# Every column except the target is used as a feature.
train_features = train.drop("Churn", axis=1)
test_features = test.drop("Churn", axis=1)

classifier = RandomForestClassifier(random_state=RANDOM_STATE)
classifier.fit(train_features, train.Churn)

predicted_churn = classifier.predict(test_features)

In [None]:
# Element-wise comparison: True where the predicted label equals the actual one.
matched = test["Churn"] == predicted_churn

In [None]:
# Share of test rows whose prediction equals the true label (overall accuracy).
accuracy = matched.mean()

# Two explicit bins centred on 0 (wrong) and 1 (correct) instead of the default
# ten bins over a boolean input, with labels so the figure stands on its own.
accuracy_fig, accuracy_ax = plt.subplots()
accuracy_ax.hist(matched.astype(int), bins=[-0.5, 0.5, 1.5], rwidth=0.8)
accuracy_ax.set_xticks([0, 1])
accuracy_ax.set_xticklabels(["Wrong prediction", "Correct prediction"])
accuracy_ax.set_ylabel("Number of test clients")
accuracy_ax.set_title("Random forest predictions (accuracy = {:.1%})".format(accuracy))
plt.show()

Using all the available features, this random forest model correctly predicts the churn status of about 80% of the test clients (this is the overall accuracy, not the share of churners detected).

> See what features matter the most