In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import silhouette_score, silhouette_samples
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import calinski_harabasz_score
from sklearn_extra.cluster import KMedoids  # PAM


import warnings

warnings.filterwarnings("ignore")

encoder = OrdinalEncoder()
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering


sns.set(font_scale=1.5)
sns.set_style("darkgrid")
sns.set_palette("pastel")
plt.rcParams["figure.figsize"] = [15, 7]

In [8]:
data = pd.read_csv("data/WA_Fn-UseC_-Telco-Customer-Churn.csv")
data = data.drop(columns="customerID")
data.drop_duplicates(inplace=True)
data.drop("TotalCharges", axis=1, inplace=True)
nominal_features = ("PaperlessBilling", "Dependents", "Partner", "Churn")
for feature in nominal_features:
    data[feature] = data[feature].map({"Yes": 1, "No": 0})

data = pd.get_dummies(
    data,
    columns=[
        "InternetService",
        "OnlineSecurity",
        "OnlineBackup",
        "DeviceProtection",
        "TechSupport",
        "StreamingTV",
        "StreamingMovies",
        "Contract",
        "PaymentMethod",
    ],
)

# No internet service column has the same values (we checked it before) for OnlineSecurity, OnlineBackup, DeviceProtection,
# TechSupport, StreamingTV and StreamingMovies

data.drop(
    columns=[
        "OnlineSecurity_No internet service",
        "OnlineBackup_No internet service",
        "DeviceProtection_No internet service",
        "TechSupport_No internet service",
        "StreamingTV_No internet service",
        "StreamingMovies_No internet service",
    ],
    inplace=True,
)
    
df_cluster = data.copy()
scaler = StandardScaler()
df_cluster[["tenure", "MonthlyCharges"]] = scaler.fit_transform(
    df_cluster[["tenure", "MonthlyCharges"]]
)

df_cluster[
    df_cluster.columns[~df_cluster.columns.isin(["tenure", "MonthlyCharges"])]
] = encoder.fit_transform(
    df_cluster[
        df_cluster.columns[~df_cluster.columns.isin(["tenure", "MonthlyCharges"])]
    ]
)
# take data without churn columns:
df_cluster = df_cluster[df_cluster.columns[:-1]]



In [9]:
data

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,PaperlessBilling,MonthlyCharges,Churn,...,StreamingTV_Yes,StreamingMovies_No,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,Female,0,1,0,1,No,No phone service,1,29.85,0,...,0,1,0,1,0,0,0,0,1,0
1,Male,0,0,0,34,Yes,No,0,56.95,0,...,0,1,0,0,1,0,0,0,0,1
2,Male,0,0,0,2,Yes,No,1,53.85,1,...,0,1,0,1,0,0,0,0,0,1
3,Male,0,0,0,45,No,No phone service,0,42.30,0,...,0,1,0,0,1,0,1,0,0,0
4,Female,0,0,0,2,Yes,No,1,70.70,1,...,0,1,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,1,1,24,Yes,Yes,1,84.80,0,...,1,0,1,0,1,0,0,0,0,1
7039,Female,0,1,1,72,Yes,Yes,1,103.20,0,...,1,0,1,0,1,0,0,1,0,0
7040,Female,0,1,1,11,No,No phone service,1,29.60,0,...,0,1,0,1,0,0,0,0,1,0
7041,Male,1,1,0,4,Yes,Yes,1,74.40,1,...,0,1,0,1,0,0,0,0,0,1


In [3]:
df_cluster

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges
0,0.0,0.0,1.0,0.0,-1.282728,0.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,-1.164135
1,1.0,0.0,0.0,0.0,0.062387,1.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,3.0,-0.262811
2,1.0,0.0,0.0,0.0,-1.241967,1.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,-0.365914
3,1.0,0.0,0.0,0.0,0.510759,0.0,1.0,0.0,2.0,0.0,2.0,2.0,0.0,0.0,1.0,0.0,0.0,-0.750058
4,0.0,0.0,0.0,0.0,-1.241967,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.194503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1.0,0.0,1.0,1.0,-0.345224,1.0,2.0,0.0,2.0,0.0,2.0,2.0,2.0,2.0,1.0,1.0,3.0,0.663458
7039,0.0,0.0,1.0,1.0,1.611307,1.0,2.0,1.0,0.0,2.0,2.0,0.0,2.0,2.0,1.0,1.0,1.0,1.275428
7040,0.0,0.0,1.0,1.0,-0.875118,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,-1.172450
7041,1.0,1.0,1.0,0.0,-1.160445,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,0.317562


## MDS

In [None]:
from sklearn.manifold import MDS

In [5]:
mds_transform = MDS(n_components=10)
mds = mds_transform.fit_transform(df_cluster)

NameError: name 'MDS' is not defined