# In [None]:

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import files

# Upload the file (if you are using this in Google Colab)
uploaded = files.upload()

# Load the dataset.
# Prefer the expected file name; if the upload was stored under another name,
# fall back to the first name reported by files.upload() so that read_excel
# opens the file that was actually uploaded.
default_file_name = "Telco_customer_churn.xlsx"
if not uploaded or default_file_name in uploaded:
    file_name = default_file_name
else:
    file_name = next(iter(uploaded))
df = pd.read_excel(file_name)

# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(df.head())

# Handling missing values (if any)
# NOTE(review): dropna() removes every row that has a missing value in ANY
# column; if some column is only filled in for part of the customers, those
# customers are discarded entirely -- confirm against the data.
df = df.dropna()
if df.empty:
    raise ValueError("No rows left after dropping rows with missing values.")

# Keep the original churn values before encoding/scaling overwrites them, so
# the churn distribution at the end is reported with the original labels.
churn_labels = df['Churn'].copy()

# Encode categorical variables
label_encoders = {}
categorical_cols = df.select_dtypes(include=['object']).columns

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # kept so encoded values can be mapped back later

# Scale numerical variables
# This runs after encoding, so label-encoded int64 columns are standardized too.
scaler = StandardScaler()
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Build the clustering input once, BEFORE the 'Cluster' column is added to df,
# so the cluster labels never become part of the data being clustered or scored.
# 'Churn' is left out because it is the outcome inspected per cluster below.
features = df.drop(columns=['Churn'])

# Perform KMeans clustering
num_clusters = 5  # You can adjust the number of clusters based on your analysis
# n_init is set explicitly so the result does not depend on the default of the
# installed scikit-learn version.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
df['Cluster'] = kmeans.fit_predict(features)

# Evaluate clustering using silhouette score (on the same matrix that was clustered)
sil_score = silhouette_score(features, df['Cluster'])
print(f"Silhouette Score for {num_clusters} clusters: {sil_score}")

# Visualize clustering results
sns.pairplot(df, hue='Cluster', palette='Set2')
plt.show()

# Analyze clusters
# Values are per-cluster means of the columns as encoded/scaled above.
print("Cluster analysis:")
cluster_summary = df.groupby('Cluster').mean()
print(cluster_summary)

# Check how churn is distributed across clusters
# Uses the original churn values (aligned with df by index) rather than the
# encoded/scaled column; a churn value absent from a cluster has proportion 0.
churn_distribution = (
    churn_labels.groupby(df['Cluster'])
    .value_counts(normalize=True)
    .unstack(fill_value=0)
)
print("Churn distribution across clusters:")
print(churn_distribution)