In [1]:
import pandas as pd
import numpy as np

# Load data
df = pd.read_csv('CDR-Call-Details.csv')

# Drop columns
df = df.drop(columns=['Phone Number', 'CustServ Calls', 'Account Length'])

# Drop duplicates
df = df.drop_duplicates()

# Handle missing values
df = df.dropna()

# Remove outliers
df_numeric = df.select_dtypes(include=[np.number])
Q1 = df_numeric.quantile(0.25)
Q3 = df_numeric.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
df_no_outliers = df_numeric[~((df_numeric < lower_bound) | (df_numeric > upper_bound)).any(axis=1)]
df_cleaned = df_no_outliers.join(df[df.columns.difference(df_numeric.columns)])

# Remove rows where 'Churn' is True
df_cleaned = df_cleaned[df_cleaned['Churn'] != True]

# Drop 'Churn' column
df_cleaned = df_cleaned.drop(columns='Churn')

# Reset column names
df_cleaned.columns = df_cleaned.columns.str.replace(' ', '_')

print(df_cleaned.head())
print(df_cleaned.shape)


   VMail_Message  Day_Mins  Day_Calls  Day_Charge  Eve_Mins  Eve_Calls  \
0             25     265.1        110       45.07     197.4         99   
1             26     161.6        123       27.47     195.5        103   
2              0     243.4        114       41.38     121.2        110   
3              0     299.4         71       50.90      61.9         88   
4              0     166.7        113       28.34     148.3        122   

   Eve_Charge  Night_Mins  Night_Calls  Night_Charge  Intl_Mins  Intl_Calls  \
0       16.78       244.7           91         11.01       10.0           3   
1       16.62       254.4          103         11.45       13.7           3   
2       10.30       162.6          104          7.32       12.2           5   
3        5.26       196.9           89          8.86        6.6           7   
4       12.61       186.9          121          8.41       10.1           3   

   Intl_Charge  
0         2.70  
1         3.70  
2         3.29  
3         1.

In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering

# Extract features for clustering
X = df_cleaned.values

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply AgglomerativeClustering with specific parameters
agglo = AgglomerativeClustering(n_clusters=25, linkage='single')

# Fit the model and predict the cluster labels
df_cleaned['result'] = agglo.fit_predict(X_scaled)


In [4]:
df_cleaned['result'].value_counts()

result
0     14066
4        10
3         6
18        6
1         5
8         4
2         4
11        4
5         4
6         3
9         2
14        2
20        2
24        2
7         2
12        2
21        2
22        2
10        2
13        2
23        1
16        1
15        1
19        1
17        1
Name: count, dtype: int64

In [5]:
import pandas as pd
import numpy as np
from sklearn.utils import resample

# Assuming df_cleaned is your DataFrame with features and the 'result' column
X = df_cleaned.drop(columns=['result'])  # Features
y = df_cleaned['result']  # Cluster labels

# Combine X and y into a single DataFrame for easier manipulation
df_combined = pd.concat([X, y], axis=1)

# Identify minority clusters
cluster_counts = y.value_counts()
minority_clusters = cluster_counts[cluster_counts < 10].index  # Adjust the threshold as needed

# Initialize a list to hold the balanced data
df_balanced = df_combined[df_combined['result'] == cluster_counts.idxmax()]  # Start with the majority class

# Perform random oversampling on each minority cluster
for cluster in minority_clusters:
    # Select data points belonging to the current minority cluster
    df_cluster = df_combined[df_combined['result'] == cluster]
    
    # Perform oversampling (duplicate the minority class samples)
    df_oversampled = resample(df_cluster, 
                              replace=True,  # Sample with replacement
                              n_samples=10,  # Increase to 10 samples (or adjust as needed)
                              random_state=42)
    
    # Append the oversampled data to the balanced dataset
    df_balanced = pd.concat([df_balanced, df_oversampled])

# Add the other minority clusters without resampling to maintain the original data
for cluster in cluster_counts.index:
    if cluster not in minority_clusters:
        df_balanced = pd.concat([df_balanced, df_combined[df_combined['result'] == cluster]])

# The resulting balanced DataFrame
df_balanced = df_balanced.reset_index(drop=True)


In [6]:
df_balanced['result'].value_counts()

result
0     28132
3        10
18       10
1        10
8        10
2        10
11       10
5        10
6        10
9        10
14       10
20       10
24       10
7        10
12       10
21       10
22       10
10       10
13       10
23       10
16       10
15       10
19       10
17       10
4        10
Name: count, dtype: int64

In [7]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
import numpy as np



# Preprocessing
# Assuming the data needs to be scaled; adjust as necessary
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_balanced)

# Define the range for n_clusters
n_clusters_list = [25]  # You can expand this list if you want to try other values

best_score = -np.inf
best_n_clusters = None

# Manual grid search
for n_clusters in n_clusters_list:
    # Create the model with fixed linkage
    model = AgglomerativeClustering(linkage='single', n_clusters=n_clusters)
    
    # Fit the model
    labels = model.fit_predict(X_scaled)
    
    # Calculate silhouette score
    score = silhouette_score(X_scaled, labels)
    
    print(f'n_clusters={n_clusters}, Silhouette Score={score}')
    
    # Check if this is the best score
    if score > best_score:
        best_score = score
        best_n_clusters = n_clusters

# Best parameters and best score
print("Best parameters:", {'n_clusters': best_n_clusters})
print("Best silhouette score:", best_score)


n_clusters=25, Silhouette Score=0.40306375102833303
Best parameters: {'n_clusters': 25}
Best silhouette score: 0.40306375102833303


In [8]:
df_balanced

Unnamed: 0,VMail_Message,Day_Mins,Day_Calls,Day_Charge,Eve_Mins,Eve_Calls,Eve_Charge,Night_Mins,Night_Calls,Night_Charge,Intl_Mins,Intl_Calls,Intl_Charge,result
0,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.70,0
1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.70,0
2,0,243.4,114,41.38,121.2,110,10.30,162.6,104,7.32,12.2,5,3.29,0
3,0,299.4,71,50.90,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,0
4,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28367,0,448.4,94,49.48,444.4,409,40.44,48.8,98,8.44,4.8,4,4.08,4
28368,48,484.4,88,44.49,498.0,408,46.88,94.8,46,4.44,4.8,8,4.08,4
28369,0,484.0,66,44.44,460.6,408,48.68,44.9,44,8.84,0.0,0,0.00,4
28370,44,499.4,400,88.88,444.8,408,48.88,68.4,94,4.96,4.4,4,4.48,4


In [9]:
df.to_csv('upsampled.csv', index = False)