In [1]:
import pandas as pd
from pandas_profiling import ProfileReport
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
import xgboost as xgb
#from xgboost import cv
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import chart_studio.plotly as py
from plotly import tools
import plotly.figure_factory as ff
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
init_notebook_mode(connected=True)

In [2]:
df = pd.read_csv('data.csv')

In [3]:
#profile = ProfileReport(df, title='Pandas Profiling Report', html={'style':{'full_width':True}})

In [4]:
#profile

In [5]:
#profile.to_notebook_iframe()

In [6]:
#Remove columns that are not needed
df= df.drop(columns=['custID'])

#Cleaning the TotalFees Column so the model can read it
df['TotalFees'].dtypes
df['TotalFees'] = df['TotalFees'].str.replace('Â£', '')
df['TotalFees'] = df['TotalFees'].str.replace('£', '')
df.TotalFees = df.TotalFees.str.strip()
df['TotalFees'] = pd.to_numeric(df['TotalFees'])

In [7]:
#replace 'No internet service' to No for the following columns
replace_cols = [ 'HasAntivirusSubscription', 'HasCloudBackUp', 'HasInsurance',
                'HasHelpdeskPrivileges','HasTVPackage', 'HasMoviePackage']

for i in replace_cols : 
    df[i]  = df[i].replace({'No internet service' : 'No'})
    




In [8]:
df['TotalMonthsInContract'] = pd.qcut(df['TotalMonthsInContract'], q=10).astype(str)
df['MonthlyBill'] = pd.qcut(df['MonthlyBill'], q=10).astype(str)
df['TotalFees'] = pd.qcut(df['TotalFees'], q=20).astype(str)



In [9]:
#Let's visualize the retention
def bar_plot(col,data,barmode='group',width=800,height=600,color1='lightblue',color2='blue'):
    values = list(data[col].value_counts().keys())
    if values ==[0,1]:
        data[col].replace(0,'No',inplace=True)
        data[col].replace(1,'Yes',inplace=True)
        values = list(data[col].value_counts().keys())
    tr1 = data[data[col]==values[0]]['Retention'].value_counts().to_dict()
    tr2 = data[data[col]==values[1]]['Retention'].value_counts().to_dict()
    xx = ['Male', 'Female']
    trace1 = go.Bar(y=[tr1['No'], tr2['No']], name="Not Retained", x=values, marker=dict(color=color1))
    trace2 = go.Bar(y=[tr1['Yes'], tr2['Yes']], name="Retained", x=values, marker=dict(color=color2))
    data = [trace1, trace2]
    layout = go.Layout(
        barmode=barmode,xaxis = dict(title=col),yaxis=dict(title='Count'),
    title='Effect of '+ col + ' on Retention',width=width,height=height)
    fig = go.Figure(data=data, layout=layout)
    iplot(fig)
bar_plot('HasHelpdeskPrivileges',df)

In [10]:
bar_plot('IsRetired',df,barmode='stack',width=600,height=400,color1='lightblue',color2='blue')


In [11]:
df_last = df.iloc[:,18:]
df = df.iloc[:, :-1]
df_num = pd.get_dummies(df)

        
df_last['Retention'] = pd.Categorical(df_last['Retention'])
df_last['Retention'] = df_last['Retention'].cat.codes
df_last = df_last.replace({0:1, 1:0})
df_last


Unnamed: 0,Retention
0,0
1,0
2,1
3,0
4,1
...,...
7038,0
7039,0
7040,0
7041,1


In [12]:
df_num['Retention'] = df_last


In [13]:
X = df_num.values[:,0:75]
Y = df_num.values[:,75:]
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.3, shuffle=True)


In [14]:
xgbC = XGBClassifier()
rfe = RFE(xgbC, 20)
rfe = rfe.fit(X_train, y_train)
print(rfe.support_)
print(rfe.ranking_)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



KeyError: 'weight'

In [None]:
#Creating Lists for Columns and their Rankings
cols = list(df_num.columns.values)
cols.remove('Retention')
rfe_rank = list(rfe.ranking_)

#Creating a Dataframe with the Columns and their rankings
df_rank = pd.DataFrame({'Columns': cols , 'Ranking': rfe_rank})
df_rank = df_rank.sort_values(by='Ranking')
new_df = df_rank.head(20)
columns = list(new_df['Columns'])
columns 

In [None]:
df = df_num[columns]
df['Retention'] = df_last
sns.set(rc={'axes.facecolor':'white', 'figure.facecolor':'white'})
fig, ax = plt.subplots(figsize=(10,10))  
sns.heatmap(df.corr(),square=True,cmap="YlGnBu")

In [None]:
X_fin = df.values[:,0:20]
Y_fin = df.values[:,20:]
X_train, X_test, y_train, y_test = train_test_split(X_fin,Y_fin,test_size=0.3, shuffle=True)
cv = KFold(n_splits=5, random_state=7)
for train_index, test_index in cv.split(X):
    print("Train Index: ", train_index, "\n")
    print("Test Index: ", test_index)

    X_train, X_test, y_train, y_test = X[train_index], X[test_index], Y[train_index], Y[test_index]


In [None]:
xgb_c = XGBClassifier()

xgb_c.fit(X_train,y_train)

In [None]:
y_pred = xgb_c.predict(X_test)

In [None]:
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print('Precision:', precision)
print('Recall:', recall)
print('Accuracy:', acc)
print('Confusion Matrix:', cm)

In [None]:
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    import itertools

    accuracy = np.trace(cm) / float(np.sum(cm))
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]


    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        if normalize:
            plt.text(j, i, "{:0.4f}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        else:
            plt.text(j, i, "{:,}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")


    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    plt.show()
    
plot_confusion_matrix(cm, normalize=False, target_names = ['No','Yes'], title='Confusion matrix')

In [None]:
xgb_we = XGBClassifier(scale_pos_weight=4)
xgb_we.fit(X_train,y_train.ravel())

In [None]:
y_pred_w = xgb_we.predict(X_test)
precision_w = precision_score(y_test, y_pred_w)
recall_w = recall_score(y_test, y_pred_w)
acc_w = accuracy_score(y_test, y_pred_w)
cm_w = confusion_matrix(y_test, y_pred_w)
print('Precision:', precision_w)
print('Recall:', recall_w)
print('Accuracy:', acc_w)
print('Confusion Matrix:', cm_w)

In [None]:
plot_confusion_matrix(cm_w, normalize=False, target_names = ['No','Yes'], title='Confusion matrix')

In [None]:
import scikitplot as skplt
# Deriving Class probabilities
predicted_probabilities = xgb_we.predict_proba(X_test)
# Creating the plot
skplt.metrics.plot_cumulative_gain(y_test, predicted_probabilities)

In [15]:
skplt.metrics.plot_lift_curve(y_test, predicted_probabilities)


NameError: name 'skplt' is not defined