In [1]:
!pip install scikit-learn==1.5.1



In [15]:
import numpy as np
import pandas as pd
import os
import io
import itertools

# Azure ML libraries
from azureml.core import Experiment, Workspace, Dataset

# Data preprocessing libraries
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff
import plotly.io as pio
from PIL import Image

# Select the inline matplotlib backend so figures are rendered in the cell output.
%matplotlib inline
# Initialise plotly's offline notebook mode (`py` is plotly.offline).
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead of
# embedding it in the notebook — confirm against the plotly documentation.
py.init_notebook_mode(connected=True)

In [16]:
# Obtain the Azure ML workspace handle via Workspace.from_config().
ws = Workspace.from_config()

# One-line-per-field summary of the workspace that was loaded.
# NOTE(review): the printed output includes the subscription id — review or
# clear the cell output before sharing the notebook.
workspace_summary = (
    f"Workspace name: {ws.name}",
    f"Azure region: {ws.location}",
    f"Subscription id: {ws.subscription_id}",
    f"Resource group: {ws.resource_group}",
)
print(*workspace_summary, sep="\n")

Workspace name: customer-churn-prediction
Azure region: polandcentral
Subscription id: b4245f1f-9c10-4efb-98d5-791703458cf9
Resource group: rg-ml-customerchurn-polandcentral


In [19]:
# Experiment handle named "churn-EDA", attached to the workspace `ws`.
experiment = Experiment(workspace=ws, name="churn-EDA")

# start_logging() returns the run object; it is closed by run.complete()
# in the last cell of the notebook.
run = experiment.start_logging()

print("Starting experiment:", experiment.name)

Starting experiment: churn-EDA


In [11]:
# Read the customer churn table; the path is relative, so it is resolved
# against the kernel's current working directory.
churn_csv_path = 'CustomerChurn.csv'
df = pd.read_csv(churn_csv_path)

In [21]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,LoyaltyID,Customer ID,Senior Citizen,Partner,Dependents,Tenure,Phone Service,Multiple Lines,Internet Service,Online Security,...,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn
0,318537,7590-VHVEG,No,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,152148,5575-GNVDE,No,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,326527,3668-QPYBK,No,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,845894,7795-CFOCW,No,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,503388,9237-HQITU,No,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [23]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,LoyaltyID,Tenure,Monthly Charges
count,7043.0,7043.0,7043.0
mean,550382.651001,32.371149,64.761692
std,260776.11869,24.559481,30.090047
min,100346.0,0.0,18.25
25%,323604.5,9.0,35.5
50%,548704.0,29.0,70.35
75%,776869.0,55.0,89.85
max,999912.0,72.0,118.75


In [24]:
# (rows, columns) of the loaded frame.
df.shape

(7043, 21)

In [25]:
# Count missing values per column right after loading
# (isna is the same check as isnull).
df.isna().sum()

LoyaltyID            0
Customer ID          0
Senior Citizen       0
Partner              0
Dependents           0
Tenure               0
Phone Service        0
Multiple Lines       0
Internet Service     0
Online Security      0
Online Backup        0
Device Protection    0
Tech Support         0
Streaming TV         0
Streaming Movies     0
Contract             0
Paperless Billing    0
Payment Method       0
Monthly Charges      0
Total Charges        0
Churn                0
dtype: int64

In [27]:
# Number of distinct values in every column of the frame.
unique_values = df.nunique()

# Walk the (column, count) pairs directly instead of indexing keys()/values
# by position; the printed text is unchanged.
for column_name, distinct_count in unique_values.items():
    print("\nUnique Values For Column", column_name, ":", distinct_count)


Unique Values For Column LoyaltyID : 7021

Unique Values For Column Customer ID : 7043

Unique Values For Column Senior Citizen : 2

Unique Values For Column Partner : 2

Unique Values For Column Dependents : 2

Unique Values For Column Tenure : 73

Unique Values For Column Phone Service : 2

Unique Values For Column Multiple Lines : 3

Unique Values For Column Internet Service : 3

Unique Values For Column Online Security : 3

Unique Values For Column Online Backup : 3

Unique Values For Column Device Protection : 3

Unique Values For Column Tech Support : 3

Unique Values For Column Streaming TV : 3

Unique Values For Column Streaming Movies : 3

Unique Values For Column Contract : 3

Unique Values For Column Paperless Billing : 2

Unique Values For Column Payment Method : 4

Unique Values For Column Monthly Charges : 1585

Unique Values For Column Total Charges : 6531

Unique Values For Column Churn : 2


In [45]:
# Inspect the distinct labels present in the "Paperless Billing" column.
df['Paperless Billing'].unique()

array(['Yes', 'No'], dtype=object)

In [33]:
# "Total Charges" contains blank placeholders instead of numbers in some rows.
# Mark every empty or whitespace-only entry as missing so those rows can be
# dropped before the numeric cast; an exact " " match would miss "" or "  ".
df['Total Charges'] = df['Total Charges'].replace(r"^\s*$", np.nan, regex=True)

In [56]:
# Re-count missing values per column now that blank "Total Charges" entries
# have been turned into NaN.
df.isna().sum()

LoyaltyID             0
Customer ID           0
Senior Citizen        0
Partner               0
Dependents            0
Tenure                0
Phone Service         0
Multiple Lines        0
Internet Service      0
Online Security       0
Online Backup         0
Device Protection     0
Tech Support          0
Streaming TV          0
Streaming Movies      0
Contract              0
Paperless Billing     0
Payment Method        0
Monthly Charges       0
Total Charges        11
Churn                 0
dtype: int64

In [58]:
# Keep only the rows that have a "Total Charges" value; the original index
# labels of the surviving rows are retained.
has_total_charges = df['Total Charges'].notna()
df = df[has_total_charges]

In [64]:
# Convert "Total Charges" to a floating-point column (float -> float64).
total_charges_numeric = df["Total Charges"].astype("float64")
df["Total Charges"] = total_charges_numeric

In [62]:
# Add-on service columns in which "No internet service" is folded into "No",
# leaving each of them with plain Yes/No labels.
replace_cols = [
    "Online Security",
    "Online Backup",
    "Device Protection",
    "Tech Support",
    "Streaming TV",
    "Streaming Movies",
]
for service_column in replace_cols:
    df[service_column] = df[service_column].replace({"No internet service" : "No"})

In [65]:
# Translate a 1/0 "Senior Citizen" flag into "Yes"/"No" labels. replace() leaves
# any value that is not 1 or 0 untouched.
# NOTE(review): the head() preview earlier in this notebook already shows
# "No" strings in this column, so this may be a no-op for this file — confirm.
senior_citizen_labels = {1: "Yes", 0: "No"}
df["Senior Citizen"] = df["Senior Citizen"].replace(senior_citizen_labels)

In [66]:
# Check which labels remain in "Senior Citizen" after the replacement above.
df['Senior Citizen'].unique()

array(['No', 'Yes'], dtype=object)

In [67]:
# Close the run that was opened with experiment.start_logging() earlier in the notebook.
run.complete()