In [227]:
!pip install scikit-learn==1.5.1



In [228]:
!pip install kaleido==1.0.0
!pip install plotly==6.1.1



In [229]:
import numpy as np
import pandas as pd
import os
import io
import itertools

# Azure ML libraries
from azureml.core import Experiment, Workspace, Dataset

# Data preprocessing libraries
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff
import plotly.io as pio
from PIL import Image

%matplotlib inline
py.init_notebook_mode(connected=True)

In [230]:
# Obtain the Azure ML workspace handle via Workspace.from_config()
ws = Workspace.from_config()

# Echo the workspace coordinates, one per line
print(
    f"Workspace name: {ws.name}",
    f"Azure region: {ws.location}",
    f"Subscription id: {ws.subscription_id}",
    f"Resource group: {ws.resource_group}",
    sep="\n",
)

Workspace name: customer-churn-prediction
Azure region: polandcentral
Subscription id: b4245f1f-9c10-4efb-98d5-791703458cf9
Resource group: rg-ml-customerchurn-polandcentral


In [231]:
# Experiment that groups everything logged by this notebook
experiment = Experiment(name="churn-EDA", workspace=ws)
# Run handle; the plotting cells below use it to upload the exported images
run = experiment.start_logging()
print("Starting experiment:", experiment.name)

Starting experiment: churn-EDA


In [232]:
df = pd.read_csv('CustomerChurn.csv')

In [233]:
df.head()

Unnamed: 0,LoyaltyID,Customer ID,Senior Citizen,Partner,Dependents,Tenure,Phone Service,Multiple Lines,Internet Service,Online Security,...,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn
0,318537,7590-VHVEG,No,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,152148,5575-GNVDE,No,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,326527,3668-QPYBK,No,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,845894,7795-CFOCW,No,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,503388,9237-HQITU,No,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [234]:
df.describe()

Unnamed: 0,LoyaltyID,Tenure,Monthly Charges
count,7043.0,7043.0,7043.0
mean,550382.651001,32.371149,64.761692
std,260776.11869,24.559481,30.090047
min,100346.0,0.0,18.25
25%,323604.5,9.0,35.5
50%,548704.0,29.0,70.35
75%,776869.0,55.0,89.85
max,999912.0,72.0,118.75


In [235]:
df.shape

(7043, 21)

In [236]:
df.isnull().sum()

LoyaltyID            0
Customer ID          0
Senior Citizen       0
Partner              0
Dependents           0
Tenure               0
Phone Service        0
Multiple Lines       0
Internet Service     0
Online Security      0
Online Backup        0
Device Protection    0
Tech Support         0
Streaming TV         0
Streaming Movies     0
Contract             0
Paperless Billing    0
Payment Method       0
Monthly Charges      0
Total Charges        0
Churn                0
dtype: int64

In [237]:
# Number of distinct values held by each column
unique_values = df.nunique()

# Report the cardinality column by column
for column_name, distinct_count in unique_values.items():
    print("\nUnique Values For Column", column_name, ":", distinct_count)


Unique Values For Column LoyaltyID : 7021

Unique Values For Column Customer ID : 7043

Unique Values For Column Senior Citizen : 2

Unique Values For Column Partner : 2

Unique Values For Column Dependents : 2

Unique Values For Column Tenure : 73

Unique Values For Column Phone Service : 2

Unique Values For Column Multiple Lines : 3

Unique Values For Column Internet Service : 3

Unique Values For Column Online Security : 3

Unique Values For Column Online Backup : 3

Unique Values For Column Device Protection : 3

Unique Values For Column Tech Support : 3

Unique Values For Column Streaming TV : 3

Unique Values For Column Streaming Movies : 3

Unique Values For Column Contract : 3

Unique Values For Column Paperless Billing : 2

Unique Values For Column Payment Method : 4

Unique Values For Column Monthly Charges : 1585

Unique Values For Column Total Charges : 6531

Unique Values For Column Churn : 2


In [238]:
df['Paperless Billing'].unique()

array(['Yes', 'No'], dtype=object)

In [239]:
# Blank 'Total Charges' entries are missing values in disguise. Match any empty
# or whitespace-only string (not just a single space) and turn it into NaN so
# the rows can be dropped and the column cast to float afterwards.
df['Total Charges'] = df['Total Charges'].replace(r"^\s*$", np.nan, regex=True)

In [240]:
df.isnull().sum()

LoyaltyID             0
Customer ID           0
Senior Citizen        0
Partner               0
Dependents            0
Tenure                0
Phone Service         0
Multiple Lines        0
Internet Service      0
Online Security       0
Online Backup         0
Device Protection     0
Tech Support          0
Streaming TV          0
Streaming Movies      0
Contract              0
Paperless Billing     0
Payment Method        0
Monthly Charges       0
Total Charges        11
Churn                 0
dtype: int64

In [241]:
# Discard the rows whose 'Total Charges' is missing (11 rows per the null count above).
# The surviving rows keep their original index labels, so the index now has gaps.
df = df.dropna(subset=['Total Charges'])

In [242]:
# 'Total Charges' was read as text because of the blank entries; make it numeric
df = df.astype({"Total Charges": float})

In [243]:
# Add-on service columns in which "No internet service" is folded into "No"
replace_cols = [
    "Online Security", "Online Backup", "Device Protection",
    "Tech Support", "Streaming TV", "Streaming Movies",
]
# Apply the substitution to all of those columns in one assignment
df[replace_cols] = df[replace_cols].replace("No internet service", "No")

In [244]:
df['Senior Citizen'].unique()

array(['No', 'Yes'], dtype=object)

In [245]:
df.head()

Unnamed: 0,LoyaltyID,Customer ID,Senior Citizen,Partner,Dependents,Tenure,Phone Service,Multiple Lines,Internet Service,Online Security,...,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn
0,318537,7590-VHVEG,No,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,152148,5575-GNVDE,No,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,326527,3668-QPYBK,No,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,845894,7795-CFOCW,No,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,503388,9237-HQITU,No,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [246]:
# Creating categorical column based on 'Tenure' - grouping values into buckets.

def tenure_lab(df) :
    """Return the tenure bucket label for a single customer record.

    Parameters
    ----------
    df : mapping-like
        One row (e.g. the Series handed over by ``DataFrame.apply(axis=1)``)
        exposing a numeric ``'Tenure'`` entry.

    Returns
    -------
    str or None
        ``'Tenure_0-12'`` for values <= 12, ``'Tenure_12-24'`` for (12, 24],
        ``'Tenure_24-48'`` for (24, 48], ``'Tenure_48-60'`` for (48, 60] and
        ``'Tenure_gt_60'`` above 60. ``None`` when no comparison holds
        (e.g. a NaN tenure).
    """
    tenure = df['Tenure']  # read once instead of on every branch

    # Chained comparisons replace the bitwise '&' the original applied to booleans.
    if tenure <= 12:
        return 'Tenure_0-12'
    elif 12 < tenure <= 24:
        return 'Tenure_12-24'
    elif 24 < tenure <= 48:
        return 'Tenure_24-48'
    elif 48 < tenure <= 60:
        return 'Tenure_48-60'
    elif tenure > 60:
        return 'Tenure_gt_60'
    # No branch matched (NaN compares False everywhere): fall through to None.
    
# Label every row with its tenure bucket (tenure_lab receives one row at a time)
df["tenure_group"] = df.apply(tenure_lab, axis = 1)

In [247]:
df.head()

Unnamed: 0,LoyaltyID,Customer ID,Senior Citizen,Partner,Dependents,Tenure,Phone Service,Multiple Lines,Internet Service,Online Security,...,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn,tenure_group
0,318537,7590-VHVEG,No,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,Tenure_0-12
1,152148,5575-GNVDE,No,No,No,34,Yes,No,DSL,Yes,...,No,No,No,One year,No,Mailed check,56.95,1889.5,No,Tenure_24-48
2,326527,3668-QPYBK,No,No,No,2,Yes,No,DSL,Yes,...,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,Tenure_0-12
3,845894,7795-CFOCW,No,No,No,45,No,No phone service,DSL,Yes,...,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No,Tenure_24-48
4,503388,9237-HQITU,No,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,Tenure_0-12


In [248]:
# Split the customers by target value; the comparison plots read these two frames
churn = df.loc[df["Churn"] == "Yes"]
not_churn = df.loc[df["Churn"] == "No"]

In [249]:
# Identifier and target columns are kept out of both feature lists
Id_col = ["Customer ID"]
loyalty_col = ["LoyaltyID"]
target_col = ["Churn"]

# Columns with fewer than 6 distinct values are treated as categorical
distinct_counts = df.nunique()
cat_cols = [col for col in distinct_counts[distinct_counts < 6].keys().tolist()
            if col not in target_col]

# Whatever is neither categorical, target nor identifier is numerical
non_numeric_cols = cat_cols + target_col + Id_col + loyalty_col
num_cols = [col for col in df.columns if col not in non_numeric_cols]

In [250]:
num_cols

['Tenure', 'Monthly Charges', 'Total Charges']

In [251]:
cat_cols

['Senior Citizen',
 'Partner',
 'Dependents',
 'Phone Service',
 'Multiple Lines',
 'Internet Service',
 'Online Security',
 'Online Backup',
 'Device Protection',
 'Tech Support',
 'Streaming TV',
 'Streaming Movies',
 'Contract',
 'Paperless Billing',
 'Payment Method',
 'tenure_group']

In [252]:
# Class counts of the target, computed once so labels and values stay paired
churn_counts = df["Churn"].value_counts()
lab = churn_counts.keys().tolist()
val = churn_counts.values.tolist()

# Donut chart of the churn / non-churn split
trace = go.Pie(
    labels=lab,
    values=val,
    marker=dict(
        colors=['#636EFA', '#EF553B'],
        line=dict(color="white", width=1.3)
    ),
    rotation=90,
    hoverinfo="label+value+text",
    hole=0.5
)

layout = go.Layout(
    dict(
        title=dict(
            text="Customer Churn Proportion",
            x=0.5,
            xanchor='center'
        ),
        plot_bgcolor="rgb(243,243,243)",
        paper_bgcolor="rgb(243,243,243)",
    )
)

data = [trace]
fig = go.Figure(data=data, layout=layout)

image_path = "outputs/churn_proportion.png"
# Make sure the export directory exists before the first image is written;
# every later plotting cell writes into the same folder.
os.makedirs(os.path.dirname(image_path), exist_ok=True)
fig.write_image(image_path)

# Upload the file explicitly into the run's artifacts
run.upload_file(name=image_path, path_or_stream=image_path)

py.iplot(fig)

In [253]:
# Function for pie plot for customer churn types
def plot_pie(column) :
    """Draw side-by-side donut charts of `column` for churned and retained customers.

    Reads the module-level `churn` and `not_churn` frames, writes the figure to
    ``outputs/pie_plot_<column>.png``, uploads that file to `run` and displays
    the figure inline.
    """
    def donut(frame, label, x_range) :
        # One donut: value counts of `column` within `frame`
        counts = frame[column].value_counts()
        return go.Pie(labels    = counts.keys().tolist(),
                      values    = counts.values.tolist(),
                      name      = label,
                      domain    = dict(x = x_range),
                      hole      = .6,
                      hoverinfo = "label+percent+name",
                      marker    = dict(line = dict(width = 2,
                                                   color = "rgb(243,243,243)")))

    def caption(label, x_pos) :
        # Arrow-less annotation placed at (x_pos, 0.5)
        return dict(text = label, font = dict(size = 13),
                    showarrow = False, x = x_pos, y = .5)

    churn_label, stay_label = "Churn Customers", "Non Churn Customers"

    layout = go.Layout(title = column + " Distribution in Customer Churn",
                       plot_bgcolor  = "rgb(243,243,243)",
                       paper_bgcolor = "rgb(243,243,243)",
                       annotations   = [caption(churn_label, .15),
                                        caption(stay_label, .88)])
    fig = go.Figure(data = [donut(churn, churn_label, [0,.48]),
                            donut(not_churn, stay_label, [.52,1])],
                    layout = layout)

    # Export, upload the file explicitly into artifacts, then show inline
    image_path = "outputs/pie_plot_" + column + ".png"
    fig.write_image(image_path)
    run.upload_file(name = image_path, path_or_stream = image_path)
    py.iplot(fig)


# Function for histogram for customer churn types
def histogram(column) :
    """Overlay percent-normalised histograms of `column` for churned and retained customers.

    Reads the module-level `churn` and `not_churn` frames, writes the figure to
    ``outputs/histogram_<column>.png``, uploads that file to `run` and displays
    the figure inline.
    """
    def bars(frame, label) :
        # Histogram trace for one customer group
        return go.Histogram(x = frame[column],
                            name = label,
                            histnorm = "percent",
                            opacity = .9,
                            marker = dict(line = dict(width = .5,
                                                      color = "black")))

    def axis_style(axis_title) :
        # White grid lines; identical styling for both axes
        return dict(title = axis_title,
                    gridcolor = 'rgb(255, 255, 255)',
                    gridwidth = 2,
                    zerolinewidth = 1,
                    ticklen = 5)

    layout = go.Layout(title = column + " Distribution in Customer Churn ",
                       plot_bgcolor  = "rgb(243,243,243)",
                       paper_bgcolor = "rgb(243,243,243)",
                       xaxis = axis_style(column),
                       yaxis = axis_style("percent"))
    fig = go.Figure(data = [bars(churn, "Churn Customers"),
                            bars(not_churn, "Non Churn Customers")],
                    layout = layout)

    # Export, upload the file explicitly into artifacts, then show inline
    image_path = "outputs/histogram_" + column + ".png"
    fig.write_image(image_path)
    run.upload_file(name = image_path, path_or_stream = image_path)
    py.iplot(fig)
    
    
# Function for scatter plot matrix for numerical columns in data
def scatter_matrix(df)  :
    """Plot a scatter-plot matrix of Tenure, Monthly Charges and Total Charges.

    Points are coloured by the value of the ``Churn`` column. The figure is
    written to ``outputs/scatter_matrix.png``, uploaded to `run` and displayed
    inline.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Churn``, ``Tenure``, ``Monthly Charges``
        and ``Total Charges``. The caller's frame is not modified; a sorted
        copy is used.
    """
    df = df.sort_values(by = "Churn", ascending = True)

    # One integer colour code per distinct Churn value. Built with enumerate so
    # it works for any number of classes (the original assumed exactly two).
    classes    = df["Churn"].unique().tolist()
    class_code = {label : code for code, label in enumerate(classes)}
    color_vals = [class_code[cl] for cl in df["Churn"]]

    pl_colorscale = "Portland"

    # Hover text is the Churn value of each point; taken straight from the
    # column instead of one row-wise iloc lookup per point.
    text = df["Churn"].tolist()

    plotted_cols = ["Tenure", 'Monthly Charges', 'Total Charges']
    trace = go.Splom(dimensions = [dict(label = col, values = df[col])
                                   for col in plotted_cols],
                     text = text,
                     marker = dict(color = color_vals,
                                   colorscale = pl_colorscale,
                                   size = 3,
                                   showscale = False,
                                   line = dict(width = .1,
                                               color='rgb(230,230,230)'
                                              )
                                  )
                    )

    # Shared styling for all six axes of the 3x3 grid
    axis = dict(showline  = True,
                zeroline  = False,
                gridcolor = "#fff",
                ticklen   = 4
               )
    axes = {f"{direction}axis{position}" : dict(axis)
            for position in (1, 2, 3) for direction in ("x", "y")}

    layout = go.Layout(dict(title  =
                            "Scatter Plot Matrix for Numerical Columns for Customer Churn",
                            autosize = False,
                            height = 800,
                            width  = 800,
                            dragmode = "select",
                            hovermode = "closest",
                            plot_bgcolor  = 'rgba(240,240,240, 0.95)',
                            **axes
                           )
                      )
    fig = go.Figure(data = [trace], layout = layout)

    image_path = "outputs/scatter_matrix" + ".png"
    fig.write_image(image_path)

    # Upload the file explicitly into artifacts
    run.upload_file(name = image_path, path_or_stream = image_path)

    py.iplot(fig)

# One pair of donut charts per categorical column
for cat_column in cat_cols :
    plot_pie(cat_column)

# One overlaid histogram per numerical column
for num_column in num_cols :
    histogram(num_column)

# Scatter plot matrix of the numerical columns
scatter_matrix(df)

In [254]:
def _tenure_group_counts(frame) :
    """Count the customers of `frame` per tenure group as a two-column frame."""
    counts = frame["tenure_group"].value_counts().reset_index()
    counts.columns = ["tenure_group","count"]
    return counts

def _tenure_bar(counts, label) :
    """Bar trace of the per-group customer counts."""
    return go.Bar(x = counts["tenure_group"], y = counts["count"],
                  name = label,
                  opacity = .9,
                  marker = dict(line = dict(width = .5,color = "black")))

def _grid_axis(axis_title) :
    """Axis settings with white grid lines."""
    return dict(title = axis_title,
                gridcolor = 'rgb(255, 255, 255)',
                gridwidth = 2, zerolinewidth = 1, ticklen = 5)

# Customer counts per tenure group, churned vs. retained
tg_ch  = _tenure_group_counts(churn)
tg_nch = _tenure_group_counts(not_churn)

# Bar plot - Churn
trace1 = _tenure_bar(tg_ch, "Churn Customers")

# Bar plot - Non Churn
trace2 = _tenure_bar(tg_nch, "Non Churn Customers")

layout = go.Layout(title = "Customer attrition in tenure groups",
                   plot_bgcolor  = "rgb(243,243,243)",
                   paper_bgcolor = "rgb(243,243,243)",
                   xaxis = _grid_axis("tenure group"),
                   yaxis = _grid_axis("count"))
data = [trace1,trace2]
fig  = go.Figure(data = data, layout = layout)

# Export, upload the file explicitly into artifacts, then show inline
image_path = "outputs/bar_churn_tenure_groups.png"
fig.write_image(image_path)
run.upload_file(name = image_path, path_or_stream = image_path)
py.iplot(fig)

In [255]:
def plot_tenure_scatter(tenure_group,color) :
    """Scatter trace of Monthly vs. Total Charges for one tenure group.

    Reads the module-level `df`; `color` is the marker colour and the trace is
    named after `tenure_group`.
    """
    group_rows = df[df["tenure_group"] == tenure_group]
    marker_style = dict(symbol = "diamond-dot",
                        size = 4,
                        color = color,
                        line = dict(color = "black", width = .2))
    return go.Scatter(x = group_rows["Monthly Charges"],
                      y = group_rows["Total Charges"],
                      mode = "markers",
                      marker = marker_style,
                      name = tenure_group,
                      opacity = .9)

# Scatter plot monthly charges & total charges by churn group

def plot_churncharges_scatter(churn,color) :
    """Scatter trace of Monthly vs. Total Charges for one Churn value.

    Reads the module-level `df`. `churn` is the Churn value to select (it is
    concatenated into the trace name, so it must be a string); `color` is the
    marker colour.
    """
    selected_rows = df[df["Churn"] == churn]
    marker_style = dict(symbol = "diamond-dot",
                        size = 4,
                        color = color,
                        line = dict(color = "black", width = .2))
    return go.Scatter(x = selected_rows["Monthly Charges"],
                      y = selected_rows["Total Charges"],
                      mode = "markers",
                      marker = marker_style,
                      name = "Churn - " + churn,
                      opacity = .9)

# Marker colour of each tenure group, in plotting order
tenure_colors = [("Tenure_0-12","#FF3300"),
                 ("Tenure_12-24","#6666FF"),
                 ("Tenure_24-48","#99FF00"),
                 ("Tenure_48-60","#996600"),
                 ("Tenure_gt_60","grey")]
trace1, trace2, trace3, trace4, trace5 = [plot_tenure_scatter(group, color)
                                          for group, color in tenure_colors]

# Churn groups
trace6 = plot_churncharges_scatter("Yes","red")
trace7 = plot_churncharges_scatter("No","blue")

data1   = [trace1,trace2,trace3,trace4,trace5]
# The "No" trace is listed first, ahead of the "Yes" trace
data2   = [trace7,trace6]

# Layout
def layout_title(title) :
    """Build the shared 600px-high layout of the charges scatter plots, titled `title`."""
    def charges_axis(axis_title) :
        # White grid lines; identical styling for both axes
        return dict(title = axis_title,
                    gridcolor = 'rgb(255, 255, 255)',
                    gridwidth = 2, zerolinewidth = 1, ticklen = 5)

    return go.Layout(title = title,
                     height = 600,
                     plot_bgcolor  = "rgb(243,243,243)",
                     paper_bgcolor = "rgb(243,243,243)",
                     xaxis = charges_axis("Monthly charges"),
                     yaxis = charges_axis("Total Charges"))

layout1  = layout_title("Monthly Charges & Total Charges by Tenure Group")
layout2  = layout_title("Monthly Charges & Total Charges by Churn Group")
fig1 = go.Figure(data = data1,layout = layout1)
fig2 = go.Figure(data = data2,layout = layout2)

# Export each figure, upload the file explicitly into artifacts, then show it inline
for scatter_fig, image_path in ((fig1, "outputs/scatter_plot_tenure_group.png"),
                                (fig2, "outputs/scatter_plot_churn_group.png")) :
    scatter_fig.write_image(image_path)
    run.upload_file(name = image_path, path_or_stream = image_path)
    py.iplot(scatter_fig)

In [256]:
# Mean monthly and total charges for every (tenure group, churn) combination
charge_cols = ["Monthly Charges", "Total Charges"]
avg_tgc = df.groupby(["tenure_group","Churn"])[charge_cols].mean()
avg_tgc = avg_tgc.reset_index()

# Function for tracing
def mean_charges(column,aggregate) :
    """Bar trace of `column` per tenure group for the rows of `avg_tgc` whose Churn equals `aggregate`."""
    selected = avg_tgc[avg_tgc["Churn"] == aggregate]
    return go.Bar(x = selected["tenure_group"],
                  y = selected[column],
                  name = aggregate,
                  text = "Churn",
                  marker = dict(line = dict(width = 1)))

# Function for layout
def layout_plot(title,xaxis_lab,yaxis_lab) :
    """Build a grey-background layout with the given figure title and axis titles."""
    def labelled_axis(axis_title) :
        # White grid lines; identical styling for both axes
        return dict(title = axis_title,
                    gridcolor = 'rgb(255, 255, 255)',
                    gridwidth = 2, zerolinewidth = 1, ticklen = 5)

    return go.Layout(title = title,
                     plot_bgcolor  = "rgb(243,243,243)",
                     paper_bgcolor = "rgb(243,243,243)",
                     xaxis = labelled_axis(xaxis_lab),
                     yaxis = labelled_axis(yaxis_lab))
    

# Plot 1 - mean monthly charges by tenure groups
trace1, trace2 = [mean_charges("Monthly Charges", flag) for flag in ("Yes", "No")]
data1   = [trace1,trace2]
layout1 = layout_plot("Average Monthly Charges by Tenure Groups",
                      "Tenure group","Monthly Charges")
fig1    = go.Figure(data = data1, layout = layout1)

# Plot 2 - mean total charges by tenure groups
trace3, trace4 = [mean_charges("Total Charges", flag) for flag in ("Yes", "No")]
data2   = [trace3,trace4]
layout2 = layout_plot("Average Total Charges by Tenure Groups",
                      "Tenure group","Total Charges")
fig2    = go.Figure(data = data2, layout = layout2)

# Export each figure, upload the file explicitly into artifacts, then show it inline
for bar_fig, image_path in ((fig1, "outputs/bar_monthy_charges_tenure_group.png"),
                            (fig2, "outputs/bar_total_charges_tenure_group.png")) :
    bar_fig.write_image(image_path)
    run.upload_file(name = image_path, path_or_stream = image_path)
    py.iplot(bar_fig)

In [257]:
df_copy = df.copy()

# Drop tenure column (columns= already names the axis, so no axis argument)
df_copy = df_copy.drop(columns = "tenure_group")

def _customer_cloud(frame, label, color) :
    """3D marker trace of monthly charges, total charges and tenure for `frame`."""
    return go.Scatter3d(x = frame["Monthly Charges"],
                        y = frame["Total Charges"],
                        z = frame["Tenure"],
                        mode = "markers",
                        name = label,
                        text = "Id : " + frame["Customer ID"],
                        marker = dict(size = 1,color = color)
                       )

def _scene_axis(axis_title) :
    """Axis settings for the 3D scene: light grey wall with white grid lines."""
    return dict(title = axis_title,
                gridcolor='rgb(255, 255, 255)',
                zerolinecolor='rgb(255, 255, 255)',
                showbackground=True,
                backgroundcolor='rgb(230, 230,230)')

trace1 = _customer_cloud(churn, "Churn customers", "red")
trace2 = _customer_cloud(not_churn, "Non churn customers", "green")

layout = go.Layout(dict(title = "Monthly charges,total charges & tenure in customer attrition",
                        # 'up' must be a non-zero vector to define the camera
                        # orientation; z = 1 is plotly's documented default
                        # (the original passed an all-zero vector).
                        scene = dict(camera = dict(up=dict(x=0, y=0, z=1),
                                                   center=dict(x=0, y=0, z=0),
                                                   eye=dict(x=1.25, y=1.25, z=1.25)),
                                     xaxis  = _scene_axis("monthly charges"),
                                     yaxis  = _scene_axis("total charges"),
                                     zaxis  = _scene_axis("tenure")
                                    ),
                        height = 700,
                       )
                  )

data = [trace1,trace2]
fig  = go.Figure(data = data,layout = layout)

image_path = "outputs/3D_monthly_charges_total_charges_tenure.png"
fig.write_image(image_path)

# Upload the file explicitly into artifacts
run.upload_file(name = image_path, path_or_stream = image_path)

py.iplot(fig)

In [258]:
# NOTE: this cell rewrites `df` in place (encoding + scaling), so it must run
# exactly once per kernel session, after all the plotting cells above.

# Identifier columns - never used as features. LoyaltyID is an identifier too
# (it was already kept out of num_cols when the column lists were first built),
# so it must not be scaled or fed to the models as a numeric measurement.
loyalty_col = ["LoyaltyID"]
Id_col     = ['Customer ID'] + loyalty_col

# Target column
target_col = ["Churn"]

# Distinct values per column, computed once and reused below
n_unique   = df.nunique()

# Categorical feature columns: fewer than 6 distinct values, target excluded
cat_cols   = [x for x in n_unique[n_unique < 6].keys().tolist() if x not in target_col]

# Numerical feature columns
num_cols   = [x for x in df.columns if x not in cat_cols + target_col + Id_col]

# Binary columns with two values (this includes the target)
bin_cols   = n_unique[n_unique == 2].keys().tolist()

# Columns with more than two values
multi_cols = [i for i in cat_cols if i not in bin_cols]

# Label encoding of binary columns
le = LabelEncoder()
for i in bin_cols :
    df[i] = le.fit_transform(df[i])

# One indicator column per category for the multi value columns
df = pd.get_dummies(data = df,columns = multi_cols)

# Scaling of numerical columns; keep df's index so the merge below lines up
# row for row (the index has gaps where rows were dropped earlier)
std = StandardScaler()
scaled = pd.DataFrame(std.fit_transform(df[num_cols]),
                      columns = num_cols, index = df.index)

# Keep an unscaled snapshot, then swap the numerical columns for their scaled versions
df_og = df.copy()
df = df.drop(columns = num_cols)
df = df.merge(scaled, left_index=True, right_index=True, how="left")

In [260]:
# Summarise df_og, the snapshot taken in the preprocessing cell BEFORE the
# numerical columns were standardised. Re-copying df here (as the original did)
# replaced that snapshot with the scaled data, so the table reported values
# with mean ~0 and std ~1 instead of the real variable ranges.
summary = (df_og[[i for i in df_og.columns if i not in Id_col]].
           describe().transpose().reset_index())

summary = summary.rename(columns = {"index" : "feature"})
summary = summary.round(3)

# One list of cell values per table column, in header order
val_lst = [summary['feature'], summary['count'],
           summary['mean'],summary['std'],
           summary['min'], summary['25%'],
           summary['50%'], summary['75%'], summary['max']]

trace  = go.Table(header = dict(values = summary.columns.tolist(),
                                line = dict(color = ['#506784']),
                                fill = dict(color = ['#119DFF']),
                               ),
                  cells  = dict(values = val_lst,
                                line = dict(color = ['#506784']),
                                fill = dict(color = ["lightgrey",'#F5F8FF'])
                               ),
                  columnwidth = [200,60,100,100,60,60,80,80,80])
layout = go.Layout(dict(title = "Variable Summary"))
figure = go.Figure(data=[trace],layout=layout)

# This table is only displayed inline; it is not exported or uploaded.
py.iplot(figure)

In [261]:
# Pearson correlation between all encoded / scaled columns. The identifier
# columns are left out first: 'Customer ID' is a string, and DataFrame.corr()
# raises on non-numeric columns from pandas 2.0 onwards.
correlation = df[[i for i in df.columns if i not in Id_col]].corr()
# Determine tick labels
matrix_cols = correlation.columns.tolist()
# Convert to array
corr_array  = np.array(correlation)

# Plot
trace = go.Heatmap(z = corr_array,
                   x = matrix_cols,
                   y = matrix_cols,
                   colorscale = "Viridis",
                   colorbar   = dict(title = "Pearson Correlation coefficient",
                                    ) ,
                  )

# Wide left/bottom margins leave room for the long column names
layout = go.Layout(dict(title = "Correlation Matrix for variables",
                        autosize = False,
                        height  = 720,
                        width   = 800,
                        margin  = dict(r = 0 ,l = 210,
                                       t = 25,b = 210,
                                      ),
                        yaxis   = dict(tickfont = dict(size = 9)),
                        xaxis   = dict(tickfont = dict(size = 9))
                       )
                  )

data = [trace]
fig = go.Figure(data=data,layout=layout)

image_path = "outputs/correlation_matrix_variables.png"
fig.write_image(image_path)
# Upload the file explicitly into artifacts
run.upload_file(name = image_path, path_or_stream = image_path)
py.iplot(fig)





In [263]:
pca = PCA(n_components = 2)

# Features: everything except the identifier and target columns
X = df[[i for i in df.columns if i not in Id_col + target_col]]
# Target and identifiers, re-attached to the projected points below
Y = df[target_col + Id_col]

principal_components = pca.fit_transform(X)
# Give the projected points the same index as X / Y. df's index has gaps (rows
# were dropped during cleaning), so a fresh 0..n-1 index would make the
# index-based merge attach the wrong customer's Churn / ID to most points and
# leave NaN for the others.
pca_data = pd.DataFrame(principal_components,columns = ["PC1","PC2"],
                        index = X.index)
pca_data = pca_data.merge(Y,left_index=True,right_index=True,how="left")
pca_data["Churn"] = pca_data["Churn"].replace({1:"Churn",0:"Not Churn"})

def pca_scatter(target,color) :
    """Scatter trace of PC1 vs. PC2 for the rows of `pca_data` whose Churn label equals `target`.

    The hover text of each point carries its Customer ID; `color` is the marker colour.
    """
    members = pca_data[pca_data["Churn"] == target]
    hover_text = "Customer ID : " + members['Customer ID']
    return go.Scatter(x = members["PC1"],
                      y = members["PC2"],
                      mode = "markers",
                      name = target,
                      text = hover_text,
                      marker = dict(symbol = "diamond-open",
                                    color = color,
                                    line = dict(width = .5)))

def _component_axis(axis_title) :
    """Axis settings with white grid lines for the PCA scatter."""
    return dict(title = axis_title,
                gridcolor = 'rgb(255, 255, 255)',
                gridwidth = 2, zerolinewidth = 1, ticklen = 5)

layout = go.Layout(title = "Visualising data with principal components",
                   height = 600,
                   plot_bgcolor  = "rgb(243,243,243)",
                   paper_bgcolor = "rgb(243,243,243)",
                   xaxis = _component_axis("principal component 1"),
                   yaxis = _component_axis("principal component 2"))

trace1 = pca_scatter("Churn",'red')
trace2 = pca_scatter("Not Churn",'royalblue')
# The "Not Churn" trace is listed first, ahead of the "Churn" trace
data = [trace2,trace1]
fig = go.Figure(data = data, layout = layout)

# Export, upload the file explicitly into artifacts, then show inline
image_path = "outputs/pca.png"
fig.write_image(image_path)
run.upload_file(name = image_path, path_or_stream = image_path)
py.iplot(fig)

In [264]:
# Names of the columns that hold exactly two distinct values
distinct_per_column = df.nunique()
bi_cs = distinct_per_column[distinct_per_column == 2].keys()
# Sub-frame restricted to those binary columns
dat_rad = df.loc[:, bi_cs]

# Plot radar chart for churn and non churn customers (binary variables)
def plot_radar(df,aggregate,title) :
    """Radar chart of the 1-counts and 0-counts of every binary column for one Churn class.

    Parameters
    ----------
    df : frame holding the binary columns listed in the module-level `bi_cs`,
        including "Churn".
    aggregate : Churn value selecting the rows to summarise.
    title : figure title; also used in the exported file name
        ``outputs/radar_<title>.png``.
    """
    group = df[df["Churn"] == aggregate]

    # Per feature: number of rows equal to 1 ("yes") and the remainder ("no")
    tally = group[bi_cs].sum().reset_index()
    tally.columns = ["feature","yes"]
    tally["no"] = group.shape[0] - tally["yes"]
    # The target itself is not plotted
    tally = tally[tally["feature"] != "Churn"]
    features = tally["feature"].tolist()

    def polygon(count_column, label) :
        # Filled radar polygon for one of the two count columns
        return go.Scatterpolar(r = tally[count_column].values.tolist(),
                               theta = features,
                               name = label,
                               fill = "toself",
                               mode = "markers+lines",
                               marker = dict(size = 5))

    radial_axis = dict(visible = True,
                       side = "counterclockwise",
                       showline = True,
                       linewidth = 2,
                       tickwidth = 2,
                       gridcolor = "white",
                       gridwidth = 2)
    angular_axis = dict(tickfont = dict(size = 10),
                        layer = "below traces")
    layout = go.Layout(title = title,
                       height = 700,
                       paper_bgcolor = "rgb(243,243,243)",
                       polar = dict(radialaxis = radial_axis,
                                    angularaxis = angular_axis,
                                    bgcolor = "rgb(243,243,243)"))

    # The 0-count polygon is listed first, ahead of the 1-count polygon
    fig = go.Figure(data = [polygon("no", "count of 0's"),
                            polygon("yes", "count of 1's")],
                    layout = layout)

    # Export, upload the file explicitly into artifacts, then show inline
    image_path = "outputs/radar_" + title + ".png"
    fig.write_image(image_path)
    run.upload_file(name = image_path, path_or_stream = image_path)
    py.iplot(fig)

# One radar chart per encoded Churn value: 1 first, then 0
for churn_flag, radar_title in ((1, "Churn -  Customers"),
                                (0, "Non Churn - Customers")) :
    plot_radar(dat_rad, churn_flag, radar_title)