<center><img src='churn.png' alt='' /></center>

## Loading libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mutual_info_score
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

## Data Reading

In [None]:
# Load the Telco customer churn data set from a CSV file in the working directory.
data = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [None]:
# Preview the first rows of the data set.
data.head()

In [None]:
# List the column names.
data.columns

In [None]:
# Summarise column dtypes and non-null counts.
data.info()

In [None]:
# Show the dtype and the distinct values of every object-typed column.
for col in data.columns:
    col_dtype = data.dtypes[col]
    if col_dtype != 'object':
        continue
    print(f'Col: {col} - type {col_dtype} - unique values: {data[col].unique()}')

- **We observe that the column TotalCharges was wrongly detected as an object column. It represents the total amount charged to the customer, so we convert it to a numeric variable.**

In [None]:
# Parse TotalCharges as numbers; entries that cannot be parsed become NaN.
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')

In [None]:
# Count missing values per column.
data.isnull().sum()

- **We can now observe that the column TotalCharges has 11 missing values.**

In [None]:
# Show the rows where TotalCharges is missing.
data[data.TotalCharges.isnull()]

- **These observations also have a tenure of 0, even though MonthlyCharges is not null for these entries. This information appears to be contradictory, and therefore we decide to remove those observations from the data set.**

In [None]:
# Drop every row that contains at least one missing value (modifies data in place).
data.dropna(inplace=True)

In [None]:
# Re-count missing values per column after dropping the incomplete rows.
data.isnull().sum()

In [None]:
# Remove the customerID column in place.
data.drop(columns='customerID',inplace=True)

In [None]:
def categoriesFeatures(data):
    """Print the distinct values of each object-typed column of ``data``.

    Args:
        data: DataFrame whose columns are inspected.
    """
    for name, dtype in data.dtypes.items():
        if dtype != 'object':
            continue
        print(f'Col: {name} - Values: {data[name].unique()}')

In [None]:
# Print the distinct values of every object-typed column.
categoriesFeatures(data)

In [None]:
# Remove the ' (automatic)' text from the payment-method labels.
data['PaymentMethod'] = data['PaymentMethod'].str.replace(' (automatic)', '', regex=False)

In [None]:
# Collapse both "no service" answers into a plain 'No' across the whole frame.
data.replace(['No internet service', 'No phone service'], 'No', inplace=True)

In [None]:
# Print the distinct values again to check the result of the replacements.
categoriesFeatures(data)

## Data Visualization

In [None]:
# Count plot of the response variable, with each bar labelled by its share
# of all observations.
plt.figure(figsize=(12,6))
ax = sns.countplot(data=data, x='Churn', palette='pastel')
plt.title('Proportion of observation of the response variable')
plt.xlabel('Churn')
# The y axis of a count plot shows counts; the proportions are in the labels.
plt.ylabel('Number of observations')
for p in ax.patches:
    share = 100 * p.get_height() / len(data)
    # Percent sign goes after the number; the label is centred above the bar.
    ax.annotate('{:.1f}%'.format(share),
                (p.get_x() + p.get_width() / 2, p.get_height()),
                ha='center', va='bottom')

- **As shown above, this is an imbalanced data set: the two classes are not equally distributed among the observations, with `No` being the majority class (73.4%).**

In [None]:
def countplot_of_a_feature(col):
    """Draw a count plot of ``col`` split by churn, annotated with percentages.

    Reads the module-level ``data`` frame. Each bar is labelled with the share
    of all observations that it represents.

    Args:
        col: Name of the column of ``data`` to plot on the x axis.
    """
    plt.figure(figsize=(12,6))
    ax = sns.countplot(data=data, x=col, hue='Churn', palette='pastel')
    plt.title('Proportion of observation of the ' + col + ' variable')
    plt.xlabel(col)
    # The y axis of a count plot shows counts; the proportions are in the labels.
    plt.ylabel('Number of observations')
    total = len(data)
    for p in ax.patches:
        height = p.get_height()
        # Skip zero-height or NaN patches (empty categories or placeholder
        # artists) so that no stray "0.0%" label is drawn.
        if not height > 0:
            continue
        # Percent sign goes after the number; the label is centred above the bar.
        ax.annotate('{:.1f}%'.format(100 * height / total),
                    (p.get_x() + p.get_width() / 2, height),
                    ha='center', va='bottom')

In [None]:
# Demographic columns explored with churn-split count plots.
demographic_attributes = ['Partner', 'Dependents', 'SeniorCitizen', 'gender']

In [None]:
# Draw one churn-split count plot per demographic column.
for col in demographic_attributes:
    countplot_of_a_feature(col)

**We can extract the following conclusions by analyzing demographic attributes:**
- **Customers with a partner churn less than customers with no partner.**
- **Customers with dependents churn less than customers with no dependents.**
- **The churn rate of senior citizens is almost double that of non-senior customers.**
- **A similar percentage of churn is shown both when a customer is a man or a woman.**

In [None]:
def pieplot_of_a_Feature(col,labels,title):
    """Draw a pie chart of the percentage of observations per value of ``col``.

    Reads the module-level ``data`` frame.

    Args:
        col: Name of the column of ``data`` to summarise.
        labels: Wedge labels, in the order of ``data[col].value_counts()``.
            Pass ``None`` to label the wedges with the category values.
        title: Title of the figure.
    """
    shares = data[col].value_counts() * 100 / len(data)
    if labels is None:
        labels = shares.index
    plt.figure(figsize=(12,6))
    # One colour per wedge: a palette fixed at two colours would repeat
    # colours for columns with more than two categories.
    colors = sns.color_palette('pastel', n_colors=len(shares))
    plt.pie(shares, labels=labels, colors=colors, autopct='%.1f%%')
    plt.title(title)
    plt.show()

In [None]:
# Service-subscription columns explored with churn-split count plots.
services = [
    'PhoneService', 'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies',
]

In [None]:
# Draw one churn-split count plot per service column.
for service in services:
    countplot_of_a_feature(service)

**We can extract the following conclusions by evaluating services attributes:**
- **Customers with no phone service churn less than customers with phone service.**
- **The percentage of churn for customers with MultipleLines is nearly the same as for those without it.**
- **Customers with DSL or with no internet service churn less than those with fibre optic.**
- **Clients with online security, online backup and DeviceProtection churn less than those without it.**
- **Customers with no tech support tend to churn more often than those with tech support.**
- **The percentage of churn for customers with StreamingTV or StreamingMovies is nearly the same as for those without them.**

In [None]:
# Categorical account columns, each drawn as a churn-split count plot.
categories_account_attributes = ['Contract', 'PaperlessBilling','PaymentMethod']
for col in categories_account_attributes:
    countplot_of_a_feature(col)

**We can extract the following conclusions by analyzing customer account attributes:**

- **Customers with month-to-month contracts have higher churn rates compared to clients with yearly contracts.**
- **Customers subscribed to paperless billing churn more than those who are not subscribed.**
- **Customers who opted for an electronic check as their payment method are more likely to leave the company.**

In [None]:
def hist_plot(col):
    """Plot the distribution of ``col``, coloured by churn, and show it.

    Args:
        col: Name of the column of the module-level ``data`` frame to plot.
    """
    sns.displot(data=data, x=col, hue='Churn')
    plt.xlabel(col)
    plt.ylabel('Number of customers')
    plt.title(f'Distribution of {col} by churn')
    plt.show()

In [None]:
# Numeric account columns, each drawn as a distribution plot coloured by churn.
numericals_account_attributes = ['tenure', 'MonthlyCharges', 'TotalCharges']
for col in numericals_account_attributes:
    hist_plot(col)

**We can extract the following conclusions by analyzing the histograms above:**
- **New customers (low tenure) are more likely to churn.**
- **The churn rate tends to be larger when monthly charges are high.**
- **Clients with high total charges are less likely to leave the company.**

## Feature importance

In [None]:
def compute_mutual_information(categorical_serie):
    """Return the mutual information score between a column and ``data.Churn``.

    Args:
        categorical_serie: Series of categorical values, row-aligned with the
            module-level ``data`` frame.

    Returns:
        The value computed by ``sklearn.metrics.mutual_info_score``.
    """
    target = data.Churn
    return mutual_info_score(categorical_serie, target)

# Score every object-typed predictor (the target itself is excluded) and
# rank the predictors from the highest score to the lowest.
categorical_variables = data.select_dtypes(include=object).drop(columns='Churn')
mutual_information = categorical_variables.apply(compute_mutual_information)
feature_importance = mutual_information.sort_values(ascending=False)
print(feature_importance)

In [None]:
# Horizontal bar chart of the mutual information scores, one bar per feature.
plt.figure(figsize=(15,5))
plt.title("Feature importance")
ax = sns.barplot(y=feature_importance.index, x=feature_importance.values, palette="Blues_d", orient='h')

- **As shown above, gender, PhoneService, and MultipleLines have a mutual information score really close to 0, meaning those variables do not have a strong relationship with the target.**

## Feature Engineering

In [None]:
# Work on a copy so the cleaned frame stays available unchanged.
data_transformed = data.copy()

# Two-level columns, encoded below as 0/1 integers.
label_encoding_columns = [
    'gender',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'PaperlessBilling',
    'Churn',
]
# Columns with more than two levels, expanded below into indicator columns.
one_hot_encoding_columns = ['InternetService', 'Contract', 'PaymentMethod']

gender_codes = {'Female': 1, 'Male': 0}
yes_no_codes = {'Yes': 1, 'No': 0}

for column in label_encoding_columns:
    # gender is the only two-level column that is not a Yes/No answer
    codes = gender_codes if column == 'gender' else yes_no_codes
    data_transformed[column] = data_transformed[column].map(codes)

# encode categorical variables with more than two levels using one-hot encoding
data_transformed = pd.get_dummies(data_transformed, columns=one_hot_encoding_columns)

In [None]:
# min-max normalization
# min-max normalization: rescale each numeric column to the [0, 1] range
# NOTE(review): the minimum and maximum are taken over the full data set,
# before the train/test split made later, so test rows influence the scaling.
min_max_columns = ['tenure', 'MonthlyCharges', 'TotalCharges']

for column in min_max_columns:
    values = data_transformed[column]
    min_column, max_column = values.min(), values.max()
    data_transformed[column] = (values - min_column) / (max_column - min_column)


## Machine Learning Modeling

In [None]:
# Separate the target (y) from the predictors (X).
y = data_transformed['Churn']
X = data_transformed.drop(columns=['Churn'])

In [None]:
# List the predictor columns that go into the models.
print(X.columns)

In [None]:
# Hold out 25% of the rows for testing; random_state fixes the shuffle.
# NOTE(review): the split is not stratified on y -- consider stratify=y if the
# class balance should be the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=101)

In [None]:
# Shared random state for every estimator that accepts one.
seed = 2
# (name, estimator) pairs to compare, in evaluation order; the dummy
# classifier always predicts the most frequent class.
models = list({
    'dummy_classifier': DummyClassifier(random_state=seed, strategy='most_frequent'),
    'k_nearest_neighbors': KNeighborsClassifier(),
    'logistic_regression': LogisticRegression(random_state=seed),
    'support_vector_machines': SVC(random_state=seed),
    'random_forest': RandomForestClassifier(random_state=seed),
    'gradient_boosting': GradientBoostingClassifier(random_state=seed),
}.items())

In [None]:
# Fit every candidate model on the training data and record its accuracy on
# the testing data; results[i] is the accuracy of the model named names[i].
results = []
names = []
for name, model in models:
    # fit the model with the training data
    model.fit(X_train, y_train)
    # make predictions with the testing data
    predictions = model.predict(X_test)
    # calculate accuracy
    accuracy = accuracy_score(y_test, predictions)
    # append the model name and the accuracy to the lists
    results.append(accuracy)
    names.append(name)
    # print classifier accuracy (the stray closing parenthesis was removed)
    print('Classifier: {}, Accuracy: {}'.format(name, accuracy))

In [None]:
# define the parameter grid
# define the parameter grid
# 'auto' is left out of max_features: scikit-learn deprecated and later removed
# it for this estimator, where it selected the same setting as 'sqrt'.
grid_parameters = {'n_estimators': [80, 90, 100, 110, 115, 120],
                   'max_depth': [3, 4, 5, 6],
                   'max_features': [None, 'sqrt', 'log2'],
                   'min_samples_split': [2, 3, 4, 5]}


# define the RandomizedSearchCV class for trying different parameter combinations
# Both the estimator and the sampling of combinations are seeded with the
# notebook's `seed` so that repeated runs select the same model.
random_search = RandomizedSearchCV(estimator=GradientBoostingClassifier(random_state=seed),
                                   param_distributions=grid_parameters,
                                   cv=5,
                                   n_iter=150,
                                   n_jobs=-1,
                                   random_state=seed)

# fitting the model for random search
random_search.fit(X_train, y_train)

# print best parameter after tuning
print(random_search.best_params_)

In [None]:
# make the predictions
# make the predictions on the testing data with the fitted search object
random_search_predictions = random_search.predict(X_test)

# construct the confusion matrix (true labels first, predicted labels second)
cf_matrix = confusion_matrix(y_test, random_search_predictions)

# print the confusion matrix
print(cf_matrix )

In [None]:
# Draw the confusion matrix as a heatmap with the integer count in each cell.
plt.figure(figsize=(4,3))
sns.heatmap(cf_matrix,annot=True,fmt = "d",linecolor="k",linewidths=3)

- **As shown above, 1442 observations of the testing data were correctly classified by the model (1215 true negatives and 227 true positives). On the contrary, we can observe 316 misclassifications (114 false positives and 202 false negatives).**

In [None]:
# Print the classification report for the tuned model on the testing data.
print(classification_report(y_test, random_search_predictions))

In [None]:
# Accuracy of the tuned model on the testing data.
accuracy_score(y_test, random_search_predictions)