In [3]:
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import pandas as pd

# Load the dataset
file_path = r'downsampled_upsampled.csv'
df = pd.read_csv(file_path)

# Drop the 'Churn' column for clustering
X = df.drop(columns=['Churn'])

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Optional: Apply PCA to reduce dimensionality
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X_scaled)

# Gaussian Mixture Model Clustering with 25 components
gmm = GaussianMixture(n_components=25, random_state=42)
clusters = gmm.fit_predict(X_pca)

# Add the cluster labels to the dataset
df['Cluster'] = clusters

# Display the number of clusters found
n_clusters = len(set(clusters))
n_clusters, df.head()

(25,
    VMail Message  Day Mins  Day Calls  Day Charge  Eve Mins  Eve Calls  \
 0              0    469.80         90      48.480     486.0        488   
 1             29    282.20        328      26.320     296.8        286   
 2              0    202.00        224      22.420     224.4        242   
 3             28    282.20         92      28.820     262.2         98   
 4              0    222.11         26      22.211     222.2        222   
 
    Eve Charge  Night Mins  Night Calls  Night Charge  Intl Mins  Intl Calls  \
 0     48.4600       440.8          406         9.490      44.60           6   
 1     26.6900       326.2          209        20.200       8.20           8   
 2      9.3400       244.3          204         4.290      22.20           4   
 3     28.2800       286.9           86        22.920      22.20           2   
 4    211.1111      2110.2         2211        22.211     211.11          11   
 
    Intl Charge  Churn  Cluster  
 0         8.48  False     

In [4]:
df

Unnamed: 0,VMail Message,Day Mins,Day Calls,Day Charge,Eve Mins,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,Churn,Cluster
0,0,469.80,90,48.480,486.0,488,48.4600,440.80,406,9.490,44.60,6,8.480,False,13
1,29,282.20,328,26.320,296.8,286,26.6900,326.20,209,20.200,8.20,8,2.320,False,21
2,0,202.00,224,22.420,224.4,242,9.3400,244.30,204,4.290,22.20,4,4.040,False,1
3,28,282.20,92,28.820,262.2,98,28.2800,286.90,86,22.920,22.20,2,8.000,False,0
4,0,222.11,26,22.211,222.2,222,211.1111,2110.20,2211,22.211,211.11,11,2.290,False,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21173,0,487.60,85,54.890,404.4,84,47.4400,464.40,79,44.890,8.80,4,4.580,False,0
21174,26,222.80,228,86.980,288.2,82,29.8600,268.90,99,2.880,9.00,8,2.280,False,22
21175,112,262.20,209,22.200,202.2,29,22.1160,2112.11,112,20.220,11.90,2,2.200,False,17
21176,0,222.80,200,20.080,299.2,208,26.9800,222.20,229,20.980,22.80,2,8.290,False,3


In [5]:
df = df[df['Churn'] != True]

In [6]:
df=df.drop(columns=['Churn'])  # Drop non-numerical or irrelevant columns

In [7]:
df = df.iloc[:, 1:]

In [8]:
df

Unnamed: 0,Day Mins,Day Calls,Day Charge,Eve Mins,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,Cluster
0,469.80,90,48.480,486.0,488,48.4600,440.80,406,9.490,44.60,6,8.480,13
1,282.20,328,26.320,296.8,286,26.6900,326.20,209,20.200,8.20,8,2.320,21
2,202.00,224,22.420,224.4,242,9.3400,244.30,204,4.290,22.20,4,4.040,1
3,282.20,92,28.820,262.2,98,28.2800,286.90,86,22.920,22.20,2,8.000,0
4,222.11,26,22.211,222.2,222,211.1111,2110.20,2211,22.211,211.11,11,2.290,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...
21173,487.60,85,54.890,404.4,84,47.4400,464.40,79,44.890,8.80,4,4.580,0
21174,222.80,228,86.980,288.2,82,29.8600,268.90,99,2.880,9.00,8,2.280,22
21175,262.20,209,22.200,202.2,29,22.1160,2112.11,112,20.220,11.90,2,2.200,17
21176,222.80,200,20.080,299.2,208,26.9800,222.20,229,20.980,22.80,2,8.290,3


In [9]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from scipy.stats import mode

In [10]:
# Step 2: Extract the true labels and clustering results
true_labels = df['Cluster'].to_numpy()  # This is your true label column
cluster_labels = df['Cluster'].to_numpy()

In [18]:
# Step 3: Set the number of clusters
n_clusters = 25  # Since you have 25 clusters

# Step 4: Match the clusters to the true labels
def match_labels(true_labels, cluster_labels, n_clusters):
    labels = np.zeros_like(cluster_labels)
    
    for i in range(n_clusters):
        mask = (cluster_labels == i)
        
        if np.sum(mask) > 0:  # Ensure there's data in this cluster
            cluster_true_labels = true_labels[mask]
            
            # Check if cluster_true_labels is empty
            if len(cluster_true_labels) == 0:
                print(f"Cluster {i} has no corresponding true labels.")
                continue
            
            # Calculate mode safely
            mode_result = mode(cluster_true_labels)
            
            # Check if mode_result has valid values
            if mode_result.count.size > 0 and mode_result.count[0] > 0:  # Check if count is an array and has values
                new_label = mode_result.mode[0]  # Safely access the first mode
                labels[mask] = new_label
            else:
                print(f"Cluster {i} returned an empty mode.")
                
    return labels

# Match the labels
matched_labels = match_labels(true_labels, cluster_labels, n_clusters)

# Step 5: Calculate the accuracy
accuracy = accuracy_score(true_labels, matched_labels)

print(f"Accuracy: {accuracy:.2f}")

IndexError: invalid index to scalar variable.