In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans


In [2]:
df = pd.read_csv('../src/data/aggregated_user_experience.csv')

In [3]:
df.head()

Unnamed: 0,MSISDN/Number,Avg_TCP_Retransmission,Avg_RTT,Avg_Throughput,Most_Common_Handset
0,33601000000.0,20809910.0,46.0,37.0,Huawei P20 Lite Huawei Nova 3E
1,33601000000.0,20809910.0,30.0,48.0,Apple iPhone 7 (A1778)
2,33601000000.0,20809910.0,109.795706,48.0,undefined
3,33601010000.0,1066.0,69.0,204.0,Apple iPhone 5S (A1457)
4,33601010000.0,15079770.0,57.0,20197.5,Apple iPhone Se (A1723)


Step 1: Euclidean Distance Calculation for Engagement and Experience Scores


In [4]:
from sklearn.metrics.pairwise import euclidean_distances

# Perform KMeans clustering for engagement data
kmeans_engagement = KMeans(n_clusters=3, random_state=0).fit(df[['Avg_TCP_Retransmission', 'Avg_RTT', 'Avg_Throughput']])
engagement_cluster_centers = kmeans_engagement.cluster_centers_

# Perform KMeans clustering for experience data
kmeans_experience = KMeans(n_clusters=3, random_state=0).fit(df[['Avg_TCP_Retransmission', 'Avg_RTT', 'Avg_Throughput']])
experience_cluster_centers = kmeans_experience.cluster_centers_

# Compute engagement score: distance to the least engaged cluster (typically the cluster with the highest TCP retransmission and RTT)
engagement_score = []
for i, row in df.iterrows():
    user_data = row[['Avg_TCP_Retransmission', 'Avg_RTT', 'Avg_Throughput']].values.reshape(1, -1)
    distances = euclidean_distances(user_data, engagement_cluster_centers)
    engagement_score.append(np.min(distances))  # Taking the minimum distance to the least engaged cluster

# Compute experience score: distance to the worst experience cluster (typically the cluster with the highest TCP retransmission and RTT)
experience_score = []
for i, row in df.iterrows():
    user_data = row[['Avg_TCP_Retransmission', 'Avg_RTT', 'Avg_Throughput']].values.reshape(1, -1)
    distances = euclidean_distances(user_data, experience_cluster_centers)
    experience_score.append(np.min(distances))  # Taking the minimum distance to the worst experience cluster

# Add the scores to the DataFrame
df['Engagement Score'] = engagement_score
df['Experience Score'] = experience_score


  super()._check_params_vs_input(X, default_n_init=10)
found 0 physical cores < 1
  File "c:\Users\HP EliteBook\Desktop\KAIM_WEEK_2\TellCo-Analysis-and-Growth-Opportunity-Dashboard\.venv\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")
  super()._check_params_vs_input(X, default_n_init=10)


Calculate Satisfaction Score

In [5]:
# Calculate satisfaction score as the average of engagement and experience scores
df['Satisfaction Score'] = (df['Engagement Score'] + df['Experience Score']) / 2

# Report the top 10 satisfied customers based on the satisfaction score
top_satisfied_customers = df[['MSISDN/Number', 'Satisfaction Score']].sort_values(by='Satisfaction Score', ascending=False).head(10)
print(top_satisfied_customers)


       MSISDN/Number  Satisfaction Score
72271   3.366951e+10        2.900230e+07
56192   3.366441e+10        2.898587e+07
32676   3.365878e+10        2.879383e+07
64581   3.366699e+10        2.863577e+07
69194   3.366849e+10        2.861580e+07
26714   3.365088e+10        2.860822e+07
43785   3.366135e+10        2.842433e+07
27234   3.365099e+10        2.841504e+07
94060   3.376106e+10        2.840052e+07
54920   3.366403e+10        2.829569e+07


Build a Regression Model to Predict Satisfaction Score

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Define features and target
X = df[['Engagement Score', 'Experience Score']]
y = df['Satisfaction Score']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict the satisfaction score
y_pred = model.predict(X_test)

# Evaluate the model performance
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")


Mean Squared Error: 3.063569327110254e-17


K-means Clustering on Engagement & Experience Scores

In [7]:
# Perform K-means clustering with k=2 on the engagement and experience scores
X_clustering = df[['Engagement Score', 'Experience Score']]
kmeans_clustering = KMeans(n_clusters=2, random_state=42)
df['Satisfaction Cluster'] = kmeans_clustering.fit_predict(X_clustering)

# Display the cluster centers
print("\nSatisfaction Clustering Centers:")
print(kmeans_clustering.cluster_centers_)


  super()._check_params_vs_input(X, default_n_init=10)



Satisfaction Clustering Centers:
[[  695693.60444382   695693.60444381]
 [16217592.42979299 16217592.42979299]]


Aggregate the Average Satisfaction & Experience Score per Cluster

In [10]:
# Aggregate the average satisfaction and experience score per satisfaction cluster
cluster_aggregation = df.groupby('Satisfaction Cluster')[['Satisfaction Score', 'Experience Score']].mean().reset_index()
print(cluster_aggregation)


   Satisfaction Cluster  Satisfaction Score  Experience Score
0                     0        6.956936e+05      6.956936e+05
1                     1        1.621759e+07      1.621759e+07


Connect to MySQL and Export the Data

In [12]:

import mysql.connector

# Connect to MySQL
conn = mysql.connector.connect(
    host='localhost',
    user='root',
    password='mypassword',
    database='user_scores'
)

# Create a cursor object
cursor = conn.cursor()

# Create table if not exists
cursor.execute("""
CREATE TABLE IF NOT EXISTS user_scores (
    UserID INT PRIMARY KEY,
    Engagement_Score FLOAT,
    Experience_Score FLOAT,
    Satisfaction_Score FLOAT
)
""")

# Insert data into the table
for i, row in df.iterrows():
    cursor.execute("""
    INSERT INTO user_scores (UserID, Engagement_Score, Experience_Score, Satisfaction_Score)
    VALUES (%s, %s, %s, %s)
    """, (row['UserID'], row['Engagement Score'], row['Experience Score'], row['Satisfaction Score']))

# Commit the changes and close the connection
conn.commit()
cursor.close()
conn.close()


DatabaseError: 2003 (HY000): Can't connect to MySQL server on 'localhost:3306' (10061)