In [112]:
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

In [113]:
# Grabbing data from the csv file and telling Pandas that the data already contains an index column.
cd = pd.read_csv("C:/Users/cklni/Desktop/WGU/D209/churn_clean.csv", index_col= [0])

In [114]:
# checking for columns with null values, as I know that each column should have 10,000 non null
cd.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 1 to 10000
Data columns (total 49 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Customer_id           10000 non-null  object 
 1   Interaction           10000 non-null  object 
 2   UID                   10000 non-null  object 
 3   City                  10000 non-null  object 
 4   State                 10000 non-null  object 
 5   County                10000 non-null  object 
 6   Zip                   10000 non-null  int64  
 7   Lat                   10000 non-null  float64
 8   Lng                   10000 non-null  float64
 9   Population            10000 non-null  int64  
 10  Area                  10000 non-null  object 
 11  TimeZone              10000 non-null  object 
 12  Job                   10000 non-null  object 
 13  Children              10000 non-null  int64  
 14  Age                   10000 non-null  int64  
 15  Income             

In [115]:
#Checking churn for value counts.
cd.Churn.value_counts()

No     7350
Yes    2650
Name: Churn, dtype: int64

In [116]:
#Creating mapping to turn Yes to 1 and No to 0 for the variables that have yes or no responses
bool_map = {"Yes" : 1, "No" : 0}

In [117]:
# Converting columns with the mapping I created
cd["Techie"] = cd["Techie"].map(bool_map)
cd["Multiple"] = cd["Multiple"].map(bool_map)
cd["OnlineBackup"] = cd["OnlineBackup"].map(bool_map)
cd["DeviceProtection"] = cd["DeviceProtection"].map(bool_map)
cd["TechSupport"] = cd["TechSupport"].map(bool_map)
cd["StreamingTV"] = cd["StreamingTV"].map(bool_map)
cd["StreamingMovies"] = cd["StreamingMovies"].map(bool_map)
cd["Churn"] = cd["Churn"].map(bool_map)

In [118]:
#Putting the columns I will be using together
columns_to_keep = ['Children','InternetService','Contract','Gender','Techie','Multiple','OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies','Bandwidth_GB_Year','Churn']

In [119]:
#Create new dataset with only variables I will be using
cdk = cd[columns_to_keep]

In [120]:
cdk

Unnamed: 0_level_0,Children,InternetService,Contract,Gender,Techie,Multiple,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Bandwidth_GB_Year,Churn
CaseOrder,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0,Fiber Optic,One year,Male,0,0,1,0,0,0,1,904.536110,0
2,1,Fiber Optic,Month-to-month,Female,1,1,0,0,0,1,1,800.982766,1
3,4,DSL,Two Year,Female,1,1,0,0,0,0,1,2054.706961,0
4,1,DSL,Two Year,Male,1,0,0,0,0,1,0,2164.579412,0
5,0,Fiber Optic,Month-to-month,Male,0,0,0,0,1,1,0,271.493436,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,3,DSL,Month-to-month,Male,0,1,1,1,0,0,0,6511.252601,0
9997,4,Fiber Optic,Two Year,Male,0,1,1,1,0,1,0,5695.951810,0
9998,1,Fiber Optic,Month-to-month,Female,0,1,1,0,0,0,0,4159.305799,0
9999,1,Fiber Optic,Two Year,Male,0,1,0,0,1,1,1,6468.456752,0


In [121]:
#Separate the explanatory variables (x) from the response variable (Y)
cd_x = cdk.drop(["Churn"],axis=1).copy()
cd_Y = cdk["Churn"].copy()

In [122]:
#Making dummy values columns for Marital, Gender, Contract, and InternetService
Gender_dum = pd.get_dummies(data=cd["Gender"])
Contract_dum = pd.get_dummies(data=cd["Contract"])
InternetService_dum = pd.get_dummies(data=cd["InternetService"])

In [123]:
Contract_dum

Unnamed: 0_level_0,Month-to-month,One year,Two Year
CaseOrder,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,1,0
2,1,0,0
3,0,0,1
4,0,0,1
5,1,0,0
...,...,...,...
9996,1,0,0
9997,0,0,1
9998,1,0,0
9999,0,0,1


In [124]:
InternetService_dum

Unnamed: 0_level_0,DSL,Fiber Optic,None
CaseOrder,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,1,0
2,0,1,0
3,1,0,0
4,1,0,0
5,0,1,0
...,...,...,...
9996,1,0,0
9997,0,1,0
9998,0,1,0
9999,0,1,0


In [125]:
#Inserting the dummy variables in cd_x
cd_x.insert(1, "Gender_Male", Gender_dum.Male)
cd_x.insert(1, "Gender_Female", Gender_dum.Female)
cd_x.insert(1, "Gender_Nonbinary", Gender_dum.Nonbinary)
cd_x.insert(1, "Contract_One_year", Contract_dum['One year'])
cd_x.insert(1, "Contract_Two_year", Contract_dum['Two Year'])
cd_x.insert(1, "Contract_Month-to-month", Contract_dum['Month-to-month'])
cd_x.insert(1, "InternetService_None", InternetService_dum['None'])
cd_x.insert(1, "InternetService_Fiber_Optic", InternetService_dum['Fiber Optic'])
cd_x.insert(1, "InternetService_DSL", InternetService_dum.DSL)

In [126]:
cd_x

Unnamed: 0_level_0,Children,InternetService_DSL,InternetService_Fiber_Optic,InternetService_None,Contract_Month-to-month,Contract_Two_year,Contract_One_year,Gender_Nonbinary,Gender_Female,Gender_Male,...,Contract,Gender,Techie,Multiple,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Bandwidth_GB_Year
CaseOrder,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,1,0,0,0,1,0,0,1,...,One year,Male,0,0,1,0,0,0,1,904.536110
2,1,0,1,0,1,0,0,0,1,0,...,Month-to-month,Female,1,1,0,0,0,1,1,800.982766
3,4,1,0,0,0,1,0,0,1,0,...,Two Year,Female,1,1,0,0,0,0,1,2054.706961
4,1,1,0,0,0,1,0,0,0,1,...,Two Year,Male,1,0,0,0,0,1,0,2164.579412
5,0,0,1,0,1,0,0,0,0,1,...,Month-to-month,Male,0,0,0,0,1,1,0,271.493436
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,3,1,0,0,1,0,0,0,0,1,...,Month-to-month,Male,0,1,1,1,0,0,0,6511.252601
9997,4,0,1,0,0,1,0,0,0,1,...,Two Year,Male,0,1,1,1,0,1,0,5695.951810
9998,1,0,1,0,1,0,0,0,1,0,...,Month-to-month,Female,0,1,1,0,0,0,0,4159.305799
9999,1,0,1,0,0,1,0,0,0,1,...,Two Year,Male,0,1,0,0,1,1,1,6468.456752


In [127]:
cd_x = cd_x.drop(["Contract"], axis=1).copy()

In [128]:
cd_x = cd_x.drop(["Gender"], axis=1).copy()

In [129]:
cd_x = cd_x.drop(["InternetService"], axis=1).copy()

In [130]:
cd_x

Unnamed: 0_level_0,Children,InternetService_DSL,InternetService_Fiber_Optic,InternetService_None,Contract_Month-to-month,Contract_Two_year,Contract_One_year,Gender_Nonbinary,Gender_Female,Gender_Male,Techie,Multiple,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Bandwidth_GB_Year
CaseOrder,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,904.536110
2,1,0,1,0,1,0,0,0,1,0,1,1,0,0,0,1,1,800.982766
3,4,1,0,0,0,1,0,0,1,0,1,1,0,0,0,0,1,2054.706961
4,1,1,0,0,0,1,0,0,0,1,1,0,0,0,0,1,0,2164.579412
5,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,1,0,271.493436
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,3,1,0,0,1,0,0,0,0,1,0,1,1,1,0,0,0,6511.252601
9997,4,0,1,0,0,1,0,0,0,1,0,1,1,1,0,1,0,5695.951810
9998,1,0,1,0,1,0,0,0,1,0,0,1,1,0,0,0,0,4159.305799
9999,1,0,1,0,0,1,0,0,0,1,0,1,0,0,1,1,1,6468.456752


In [131]:
cd_Y

CaseOrder
1        0
2        1
3        0
4        0
5        1
        ..
9996     0
9997     0
9998     0
9999     0
10000    0
Name: Churn, Length: 10000, dtype: int64

In [132]:
clean_cd = cd_x.copy()
clean_cd.insert(1, column="Churn", value= cd_Y.reset_index().drop('CaseOrder', axis=1))

In [133]:
clean_cd

Unnamed: 0_level_0,Children,Churn,InternetService_DSL,InternetService_Fiber_Optic,InternetService_None,Contract_Month-to-month,Contract_Two_year,Contract_One_year,Gender_Nonbinary,Gender_Female,Gender_Male,Techie,Multiple,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Bandwidth_GB_Year
CaseOrder,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,0,1.0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,904.536110
2,1,0.0,0,1,0,1,0,0,0,1,0,1,1,0,0,0,1,1,800.982766
3,4,0.0,1,0,0,0,1,0,0,1,0,1,1,0,0,0,0,1,2054.706961
4,1,1.0,1,0,0,0,1,0,0,0,1,1,0,0,0,0,1,0,2164.579412
5,0,0.0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,1,0,271.493436
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,3,0.0,1,0,0,1,0,0,0,0,1,0,1,1,1,0,0,0,6511.252601
9997,4,0.0,0,1,0,0,1,0,0,0,1,0,1,1,1,0,1,0,5695.951810
9998,1,0.0,0,1,0,1,0,0,0,1,0,0,1,1,0,0,0,0,4159.305799
9999,1,0.0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,1,1,6468.456752


In [134]:
clean_cd.to_csv("C:/Users/cklni/Desktop/WGU/D209/Task 2/d209clean2.csv", index=False)

In [135]:
#Assigning the features to X and the response to y
X= cd_x
y= cd_Y

#Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, test_size=0.2, random_state=15, stratify=y)

In [136]:
#Saving the training and testing data sets
X_train.to_csv("C:/Users/cklni/Desktop/WGU/D209/Task 2/Xtrain.csv", index=False)
X_test.to_csv("C:/Users/cklni/Desktop/WGU/D209/Task 2/Xtest.csv", index=False)
y_train.to_csv("C:/Users/cklni/Desktop/WGU/D209/Task 2/ytrain.csv", index=False)
y_test.to_csv("C:/Users/cklni/Desktop/WGU/D209/Task 2/ytest.csv", index=False)

In [137]:
#Create a DecisionTreeClassifier
clf = DecisionTreeClassifier()

#Train the model
clf.fit(X_train, y_train)

#Make predictions on the testing data
y_pred = clf.predict(X_test)

#Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:\n", report)

Accuracy: 0.853
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.90      0.90      1470
           1       0.73      0.71      0.72       530

    accuracy                           0.85      2000
   macro avg       0.81      0.81      0.81      2000
weighted avg       0.85      0.85      0.85      2000



In [138]:
#Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 0.147


In [139]:
#Generate a confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Confusion Matrix:
 [[1329  141]
 [ 153  377]]
