In [47]:
# import the modules
import pandas as pd
import numpy as np
import os 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score, roc_auc_score, accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler

### Load the dataset

- Load the train data and using all your knowledge try to explore the different statistical properties of the dataset.

In [48]:
# Code starts here
train = pd.read_csv('train.csv')
print(train.shape)
train.head()

(5634, 22)


Unnamed: 0,Id,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,1370,7596-IIWYC,Female,0,No,No,27,Yes,No,No,...,No internet service,No internet service,No internet service,No internet service,Two year,Yes,Bank transfer (automatic),20.25,538.2,No
1,5676,9103-CXVOK,Male,0,Yes,Yes,1,Yes,No,No,...,No internet service,No internet service,No internet service,No internet service,Month-to-month,No,Electronic check,19.75,19.75,No
2,5800,7129-CAKJW,Female,0,No,No,17,Yes,Yes,Fiber optic,...,Yes,No,No,No,Month-to-month,No,Bank transfer (automatic),80.05,1345.65,No
3,1645,9490-DFPMD,Female,1,No,No,42,Yes,Yes,Fiber optic,...,No,No,Yes,No,Month-to-month,Yes,Electronic check,84.65,3541.35,Yes
4,366,9069-LGEUL,Male,0,Yes,No,23,Yes,No,DSL,...,No,No,No,Yes,Month-to-month,Yes,Bank transfer (automatic),59.95,1406.0,No


In [49]:
print(train.nunique())

Id                  5634
customerID          5634
gender                 2
SeniorCitizen          2
Partner                2
Dependents             2
tenure                73
PhoneService           2
MultipleLines          3
InternetService        3
OnlineSecurity         3
OnlineBackup           3
DeviceProtection       3
TechSupport            3
StreamingTV            3
StreamingMovies        3
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      1482
TotalCharges        5275
Churn                  2
dtype: int64


In [50]:
train['gender'] = train['gender'].map({'Male': 1, 'Female': 0})
train['Partner'] = train['Partner'].map({'Yes': 1, 'No': 0})
train['Dependents'] = train['Dependents'].map({'Yes': 1, 'No': 0})
train['PhoneService'] = train['PhoneService'].map({'Yes': 1, 'No': 0})
train['MultipleLines'] = train['MultipleLines'].map({'Yes': 2, 'No': 1, 'No phone service':0})
train['InternetService'] = train['InternetService'].map({'Fiber optic': 2, 'DSL': 1, 'No':0})
train['OnlineSecurity'] = train['OnlineSecurity'].map({'Yes': 2, 'No': 1, 'No internet service':0})
train['OnlineBackup'] = train['OnlineBackup'].map({'Yes': 2, 'No': 1, 'No internet service':0})
train['DeviceProtection'] = train['DeviceProtection'].map({'Yes': 2, 'No': 1, 'No internet service':0})
train['TechSupport'] = train['TechSupport'].map({'Yes': 2, 'No': 1, 'No internet service':0})
train['StreamingTV'] = train['StreamingTV'].map({'Yes': 2, 'No': 1, 'No internet service':0})
train['StreamingMovies'] = train['StreamingMovies'].map({'Yes': 2, 'No': 1, 'No internet service':0})
train['Contract'] = train['Contract'].map({'Two year': 2, 'One year': 1, 'Month-to-month':0})
train['PaperlessBilling'] = train['PaperlessBilling'].map({'Yes': 1, 'No':0})
train['PaymentMethod'] = train['PaymentMethod'].map({'Bank transfer (automatic)': 3, 'Electronic check': 2, 'Credit card (automatic)':1, 'Mailed check':0})

In [51]:
train.drop(['Id','customerID'],1,inplace=True)
print(train.info())
train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5634 entries, 0 to 5633
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            5634 non-null   int64  
 1   SeniorCitizen     5634 non-null   int64  
 2   Partner           5634 non-null   int64  
 3   Dependents        5634 non-null   int64  
 4   tenure            5634 non-null   int64  
 5   PhoneService      5634 non-null   int64  
 6   MultipleLines     5634 non-null   int64  
 7   InternetService   5634 non-null   int64  
 8   OnlineSecurity    5634 non-null   int64  
 9   OnlineBackup      5634 non-null   int64  
 10  DeviceProtection  5634 non-null   int64  
 11  TechSupport       5634 non-null   int64  
 12  StreamingTV       5634 non-null   int64  
 13  StreamingMovies   5634 non-null   int64  
 14  Contract          5634 non-null   int64  
 15  PaperlessBilling  5634 non-null   int64  
 16  PaymentMethod     5634 non-null   int64  


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,0,0,0,27,1,1,0,0,0,0,0,0,0,2,1,3,20.25,538.2,No
1,1,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,2,19.75,19.75,No
2,0,0,0,0,17,1,2,2,1,1,2,1,1,1,0,0,3,80.05,1345.65,No
3,0,1,0,0,42,1,2,2,1,1,1,1,2,1,0,1,2,84.65,3541.35,Yes
4,1,0,1,0,23,1,1,1,2,1,1,1,1,2,0,1,3,59.95,1406.0,No


### Visualize the data

- Replace the missing values and modify some column values(as required by you).
- Check out the best plots for plotting between categorical target and continuous features and try making some inferences from these plots.
- Clean the data, apply some data preprocessing and engineering techniques.

In [52]:
train['TotalCharges'] = pd.to_numeric(train['TotalCharges'], errors='coerce')

In [53]:
train= train.replace(np.nan, 0, regex=True)

In [56]:
# Code starts here
corr_matrix= train.corr()
for column in corr_matrix.columns.tolist():
    for row in corr_matrix.index.tolist():
        if((abs(corr_matrix.loc[row,column])>.80) & (abs(corr_matrix.loc[row,column])<1)):
            print("{} and {}- {}".format(row,column,corr_matrix.loc[row,column]))

TotalCharges and tenure- 0.8244028030570243
MonthlyCharges and InternetService- 0.9069715817038673
StreamingMovies and StreamingTV- 0.807485505004253
MonthlyCharges and StreamingTV- 0.8172542239306935
StreamingTV and StreamingMovies- 0.807485505004253
MonthlyCharges and StreamingMovies- 0.822137993502303
InternetService and MonthlyCharges- 0.9069715817038673
StreamingTV and MonthlyCharges- 0.8172542239306935
StreamingMovies and MonthlyCharges- 0.822137993502303
tenure and TotalCharges- 0.8244028030570243


### Model building

- Try to predict the churning of customers using AdaBoost
- Try and implement XGBoost for our customer churn problem and see how it performs in comparision to AdaBoost. Use different techniques you have learned to imporove the performance of the model.
- Try improving upon the `accuracy_score` ([Accuracy Score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html))

In [57]:
# Code Starts here
X = train.drop('Churn',1)
y = train['Churn']

X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 0, test_size = 0.3)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

clf = RandomForestClassifier(min_samples_split = 12,random_state = 0, criterion = 'entropy')

clf.fit(X_train, y_train)
predicted = clf.predict(X_test)
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

# Code ends here

0.9010905401978189
0.7936132465996452


In [58]:
pd.DataFrame(predicted)[0].value_counts()

No     1309
Yes     382
Name: 0, dtype: int64

In [59]:
accuracy_score(y_test, predicted)

0.7936132465996452

### Prediction on the test data and creating the sample submission file.

- Load the test data and store the `Id` column in a separate variable.
- Perform the same operations on the test data that you have performed on the train data.
- Create the submission file as a `csv` file consisting of the `Id` column from the test data and your prediction as the second column.

In [60]:
test = pd.read_csv('test.csv')
Id = test['Id'].copy()
test.drop(['Id','customerID'], 1, inplace = True)

In [61]:
test['gender'] = test['gender'].map({'Male': 1, 'Female': 0})
test['Partner'] = test['Partner'].map({'Yes': 1, 'No': 0})
test['Dependents'] = test['Dependents'].map({'Yes': 1, 'No': 0})
test['PhoneService'] = test['PhoneService'].map({'Yes': 1, 'No': 0})
test['MultipleLines'] = test['MultipleLines'].map({'Yes': 2, 'No': 1, 'No phone service':0})
test['InternetService'] = test['InternetService'].map({'Fiber optic': 2, 'DSL': 1, 'No':0})
test['OnlineSecurity'] = test['OnlineSecurity'].map({'Yes': 2, 'No': 1, 'No internet service':0})
test['OnlineBackup'] = test['OnlineBackup'].map({'Yes': 2, 'No': 1, 'No internet service':0})
test['DeviceProtection'] = test['DeviceProtection'].map({'Yes': 2, 'No': 1, 'No internet service':0})
test['TechSupport'] = test['TechSupport'].map({'Yes': 2, 'No': 1, 'No internet service':0})
test['StreamingTV'] = test['StreamingTV'].map({'Yes': 2, 'No': 1, 'No internet service':0})
test['StreamingMovies'] = test['StreamingMovies'].map({'Yes': 2, 'No': 1, 'No internet service':0})
test['Contract'] = test['Contract'].map({'Two year': 2, 'One year': 1, 'Month-to-month':0})
test['PaperlessBilling'] = test['PaperlessBilling'].map({'Yes': 1, 'No':0})
test['PaymentMethod'] = test['PaymentMethod'].map({'Bank transfer (automatic)': 3, 'Electronic check': 2, 'Credit card (automatic)':1, 'Mailed check':0})

In [63]:
print(test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1409 entries, 0 to 1408
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            1409 non-null   int64  
 1   SeniorCitizen     1409 non-null   int64  
 2   Partner           1409 non-null   int64  
 3   Dependents        1409 non-null   int64  
 4   tenure            1409 non-null   int64  
 5   PhoneService      1409 non-null   int64  
 6   MultipleLines     1409 non-null   int64  
 7   InternetService   1409 non-null   int64  
 8   OnlineSecurity    1409 non-null   int64  
 9   OnlineBackup      1409 non-null   int64  
 10  DeviceProtection  1409 non-null   int64  
 11  TechSupport       1409 non-null   int64  
 12  StreamingTV       1409 non-null   int64  
 13  StreamingMovies   1409 non-null   int64  
 14  Contract          1409 non-null   int64  
 15  PaperlessBilling  1409 non-null   int64  
 16  PaymentMethod     1409 non-null   int64  


In [64]:
test['TotalCharges'] = pd.to_numeric(test['TotalCharges'], errors='coerce')
test= test.replace(np.nan, 0, regex=True)

In [65]:
test = scaler.transform(test)
pred = clf.predict(test)

In [67]:
submission = pd.DataFrame({'Id': Id, 'Churn': pred})
submission

Unnamed: 0,Id,Churn
0,4539,No
1,1802,No
2,1380,No
3,5305,No
4,1960,No
...,...,...
1404,1416,No
1405,2115,No
1406,1078,No
1407,756,No


In [68]:
submission.to_csv('first_submission.csv', index = False)