# Unsupervised model
By: Preston Cusick, Daniel Wendland, Ethan Eckmann

The goal of this model is to provide additional assistance for the analysis of our project.

# Imports

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import pandas as pd

In [None]:
# Load the churn predictions for both datasets (A and B)
df1 = pd.read_csv('churn_predict_A.csv')
df2 = pd.read_csv('churn_predict_B.csv')

# Drop rows where 'Probability' is NaN and create a proper copy
df_clean1 = df1.dropna(subset=['Probability']).copy()
df_clean2 = df2.dropna(subset=['Probability']).copy()

# Cluster on the 'Probability' column only; the model is fitted on dataset A
X = df_clean1[['Probability']]

# Standardize the feature before clustering
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# KMeans clustering (fixed random_state so cluster ids are reproducible
# for a given input file)
kmeans_model = KMeans(n_clusters=4, random_state=42, n_init=10)
cluster_assignments = kmeans_model.fit_predict(X_scaled)

# Cluster ids are stored as floats (0.0, 1.0, ...) because the rest of the
# notebook compares against and maps from float keys.
df_clean1['Cluster'] = cluster_assignments.astype(float)

# Dataset B gets its own cluster per row, predicted from B's probabilities
# with the scaler and model fitted on A. Copying A's assignments across by
# row index would attach clusters that do not reflect B's values and would
# break whenever the two files do not contain exactly the same rows.
X_b = df_clean2[['Probability']]
df_clean2['Cluster'] = kmeans_model.predict(scaler.transform(X_b)).astype(float)


# Print the mean 'Probability' of every cluster in dataset A, in cluster-id
# order. These averages are what the risk-group names are assigned from.
for cluster_id in (0.0, 1.0, 2.0, 3.0):
    in_cluster = df_clean1['Cluster'] == cluster_id
    average_a = df_clean1.loc[in_cluster, 'Probability'].mean()
    print(average_a)


# Map cluster ids to risk labels by ranking the clusters on their mean
# 'Probability' in dataset A (lowest mean -> 'Very Low Risk', highest ->
# 'High Risk'). KMeans numbers its clusters arbitrarily, so a hard-coded
# id -> label table would silently mislabel groups as soon as the input
# data changes.
risk_labels = ['Very Low Risk', 'Low Risk', 'Moderate Risk', 'High Risk']
cluster_means = df_clean1.groupby('Cluster')['Probability'].mean().sort_values()
if len(cluster_means) != len(risk_labels):
    raise ValueError(
        f"Expected {len(risk_labels)} clusters but found {len(cluster_means)}; "
        "cannot assign risk labels."
    )
label_map = dict(zip(cluster_means.index, risk_labels))

# Rows whose 'Cluster' is missing or not in label_map get NaN as their label
df_clean1['Cluster_Label'] = df_clean1['Cluster'].map(label_map)
df_clean2['Cluster_Label'] = df_clean2['Cluster'].map(label_map)

# # Drop 'Cluster' column (the 0.0, 1.0, etc.) if needed
# df_clean1 = df_clean1.drop(columns=['Cluster'])
# df_clean2 = df_clean2.drop(columns=['Cluster'])

# Write each clustered dataset to its own CSV, without the index column
for frame, out_path in (
    (df_clean1, 'churn_clusters_A.csv'),
    (df_clean2, 'churn_clusters_B.csv'),
):
    frame.to_csv(out_path, index=False)



# For each risk group in dataset A, print the number of rows, then that
# number as a percentage of all rows, followed by a blank line.
for risk_label in ('High Risk', 'Moderate Risk', 'Low Risk', 'Very Low Risk'):
    count = int((df_clean1['Cluster_Label'] == risk_label).sum())
    print(count)
    print(count/(len(df_clean1))*100)
    print()

# Overall mean churn probability across dataset A
average_prob = df_clean1['Probability'].mean()
print(f"Average probability: {average_prob}")

0.45204904614246055
0.8317507018562125
0.21438711944322747
0.6331785787931747
595
31.835205992509362

576
30.818619582664525

438
23.434991974317818

260
13.911182450508294

Average probability: 0.5956878877768603
