Importing Libraries

In [40]:
import os
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import psycopg2
from dotenv import load_dotenv
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine

In [3]:
# Read the engagement metrics from disk and preview the first rows.
engagement_csv = 'engagement_data.csv'
engagement_df = pd.read_csv(engagement_csv)
engagement_df.head(n=5)

Unnamed: 0,Customer ID,Total Duration (ms),Total Download (Bytes),Total Upload (Bytes),Session Frequency,Total Traffic (Bytes),Engagement Cluster
0,33601000000.0,116720.0,455.940801,36.053108,1,491.993908,0
1,33601000000.0,181230.0,120.755184,36.104459,1,156.859643,0
2,33601000000.0,134969.0,556.659663,39.30682,1,595.966483,0
3,33601010000.0,49878.0,401.993172,20.327526,1,422.320698,0
4,33601010000.0,37104.0,1003.036989,79.623348,2,1082.660337,1


In [4]:
# Read the experience metrics from disk and preview the first rows.
experience_csv = 'experience_data.csv'
experience_df = pd.read_csv(experience_csv)
experience_df.head(n=5)

Unnamed: 0,MSISDN/Number,Handset Type,Avg TCP Retransmission (Bytes),Avg RTT (ms),Avg Throughput (kbps),Experience Cluster
0,33601000000.0,Huawei P20 Lite Huawei Nova 3E,21777860.0,46.0,76.0,1
1,33601000000.0,Apple iPhone 7 (A1778),21777860.0,31.0,99.0,1
2,33601000000.0,undefined,21777860.0,123.828174,97.0,1
3,33601010000.0,Apple iPhone 5S (A1457),772117.8,84.0,248.0,0
4,33601010000.0,Apple iPhone Se (A1723),15574350.0,59.5,94.0,1


Task 4.1a - Calculate Engagement Score

In [6]:
# Approximate each engagement cluster's centre as the per-cluster mean of the
# three engagement features.
engagement_feature_columns = ['Session Frequency', 'Total Duration (ms)', 'Total Traffic (Bytes)']
engagement_clusters_df = (
    engagement_df
    .groupby('Engagement Cluster')[engagement_feature_columns]
    .mean()
)

# Plain ndarray of the centres (one row per cluster, one column per feature),
# which is the shape the distance computation expects.
engagement_clusters = engagement_clusters_df.to_numpy()


In [7]:
from sklearn.metrics.pairwise import euclidean_distances

def calculate_engagement_score(row, reference_cluster=0):
    """Euclidean distance from customer feature vector(s) to an engagement cluster centre.

    Parameters
    ----------
    row : array-like of shape (n_features,) or (n_samples, n_features)
        Feature values in the same column order as ``engagement_clusters``
        (Session Frequency, Total Duration (ms), Total Traffic (Bytes)).
    reference_cluster : int, default 0
        Row position in ``engagement_clusters`` of the centre to measure from.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        A scalar when a single feature vector is passed, otherwise one
        distance per input row.
    """
    points = np.atleast_2d(np.asarray(row, dtype=float))
    centre = [engagement_clusters[reference_cluster]]
    distances = euclidean_distances(points, centre)[:, 0]
    # Keep the scalar return for single-row callers.
    return distances[0] if np.ndim(row) == 1 else distances

# Score every customer in one vectorised call rather than one sklearn call per
# row through DataFrame.apply(axis=1).
# NOTE(review): the features are used unscaled; in the recorded output the score
# tracks Total Duration (ms) almost exactly. Consider standardising the features
# before measuring distances.
engagement_df['engagement_score'] = calculate_engagement_score(
    engagement_df[['Session Frequency', 'Total Duration (ms)', 'Total Traffic (Bytes)']]
)


In [20]:
# Give the customer key the same column name experience_df uses, so the two
# frames can later be joined on it, then preview the scored rows.
engagement_df.rename({'Customer ID': 'MSISDN/Number'}, axis='columns', inplace=True)
engagement_df.head(n=10)

Unnamed: 0,MSISDN/Number,Total Duration (ms),Total Download (Bytes),Total Upload (Bytes),Session Frequency,Total Traffic (Bytes),Engagement Cluster,engagement_score
0,33601000000.0,116720.0,455.940801,36.053108,1,491.993908,0,10582.618634
1,33601000000.0,181230.0,120.755184,36.104459,1,156.859643,0,75093.021371
2,33601000000.0,134969.0,556.659663,39.30682,1,595.966483,0,28831.912946
3,33601010000.0,49878.0,401.993172,20.327526,1,422.320698,0,56259.513641
4,33601010000.0,37104.0,1003.036989,79.623348,2,1082.660337,1,69036.50588
5,33601010000.0,253983.0,555.207972,60.009249,2,615.217221,1,147845.593304
6,33601010000.0,64180.0,381.330543,41.141959,1,422.472503,0,41957.514485
7,33601010000.0,86399.0,294.085078,38.575279,1,332.660357,0,19738.800762
8,33601010000.0,495702.0,896.560173,76.884397,2,973.44457,1,389564.854776
9,33601020000.0,124854.0,685.372753,47.091008,1,732.463761,0,18718.779884


Task 4.1b - Calculate Experience Score


In [10]:
# Approximate each experience cluster's centre as the per-cluster mean of the
# three experience features.
experience_feature_columns = ['Avg TCP Retransmission (Bytes)', 'Avg RTT (ms)', 'Avg Throughput (kbps)']
experience_clusters_df = (
    experience_df
    .groupby('Experience Cluster')[experience_feature_columns]
    .mean()
)

# Plain ndarray of the centres (one row per cluster, one column per feature),
# which is the shape the distance computation expects.
experience_clusters = experience_clusters_df.to_numpy()

In [15]:

def calculate_experience_score(row, reference_cluster=0):
    """Euclidean distance from customer feature vector(s) to an experience cluster centre.

    Parameters
    ----------
    row : array-like of shape (n_features,) or (n_samples, n_features)
        Feature values in the same column order as ``experience_clusters``
        (Avg TCP Retransmission (Bytes), Avg RTT (ms), Avg Throughput (kbps)).
    reference_cluster : int, default 0
        Row position in ``experience_clusters`` of the centre to measure from.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        A scalar when a single feature vector is passed, otherwise one
        distance per input row.
    """
    points = np.atleast_2d(np.asarray(row, dtype=float))
    centre = [experience_clusters[reference_cluster]]
    distances = euclidean_distances(points, centre)[:, 0]
    # Keep the scalar return for single-row callers.
    return distances[0] if np.ndim(row) == 1 else distances

# Calculate the experience score for every customer in one vectorised call
# rather than one sklearn call per row through DataFrame.apply(axis=1).
# NOTE(review): in the displayed rows, cluster 0 has lower TCP retransmission and
# higher throughput than cluster 1. If the score is meant to be the distance from
# the worst-experience cluster, pass reference_cluster=1 here - confirm.
experience_df['experience_score'] = calculate_experience_score(
    experience_df[['Avg TCP Retransmission (Bytes)', 'Avg RTT (ms)', 'Avg Throughput (kbps)']]
)


In [16]:
# Preview the scored experience rows.
# NOTE(review): the recorded output also shows an `engagement_score` column on
# experience_df that no visible cell creates - possibly leftover kernel state;
# confirm with a Restart & Run All.
experience_df.head(n=10)

Unnamed: 0,MSISDN/Number,Handset Type,Avg TCP Retransmission (Bytes),Avg RTT (ms),Avg Throughput (kbps),Experience Cluster,engagement_score,experience_score
0,33601000000.0,Huawei P20 Lite Huawei Nova 3E,21777860.0,46.0,76.0,1,17309190.0,17309190.0
1,33601000000.0,Apple iPhone 7 (A1778),21777860.0,31.0,99.0,1,17309190.0,17309190.0
2,33601000000.0,undefined,21777860.0,123.828174,97.0,1,17309190.0,17309190.0
3,33601010000.0,Apple iPhone 5S (A1457),772117.8,84.0,248.0,0,3696553.0,3696553.0
4,33601010000.0,Apple iPhone Se (A1723),15574350.0,59.5,94.0,1,11105680.0,11105680.0
5,33601010000.0,Samsung Galaxy A8 (2018),11275840.0,73.914087,3954.0,0,6807167.0,6807167.0
6,33601010000.0,Huawei Mate 10 Pro Porsche Design Huawei Mate 10,21777860.0,14.0,97.0,1,17309190.0,17309190.0
7,33601010000.0,Samsung Galaxy S8 Plus (Sm-G955F),771329.8,52.0,1247.0,0,3697340.0,3697340.0
8,33601010000.0,undefined,21777860.0,123.828174,94.5,1,17309190.0,17309190.0
9,33601020000.0,Apple iPhone X (A1865),21008100.0,62.0,146.0,1,16539430.0,16539430.0


Task 4.2 - Calculate Satisfaction Score and Report Top 10 Satisfied Customers

Merge the DataFrames

In [21]:
# Keep only the customer key and the score from each frame, then inner-join
# them so that only customers present in both frames remain.
engagement_scores = engagement_df[['MSISDN/Number', 'engagement_score']]
experience_scores = experience_df[['MSISDN/Number', 'experience_score']]
df_merged = engagement_scores.merge(experience_scores, how='inner', on='MSISDN/Number')


Calculate the Satisfaction Score

In [22]:
# Satisfaction score = row-wise mean of the engagement and experience scores.
score_columns = ['engagement_score', 'experience_score']
df_merged['satisfaction_score'] = df_merged[score_columns].mean(axis='columns')


Identify the Top 10 Satisfied Customers

In [25]:
# The ten rows with the highest satisfaction score, largest first.
top_10_satisfied = df_merged.nlargest(n=10, columns='satisfaction_score')

top_10_satisfied


Unnamed: 0,MSISDN/Number,engagement_score,experience_score,satisfaction_score
12352,33625780000.0,18447620.0,11103560.0,14775590.0
85320,33760540000.0,8279244.0,15335800.0,11807520.0
52521,33664690000.0,6182594.0,17309190.0,11745890.0
60526,33667460000.0,5543745.0,17309190.0,11426470.0
50740,33664120000.0,4581304.0,17309190.0,10945250.0
92254,33763880000.0,4281935.0,17309190.0,10795560.0
32168,33659220000.0,3557456.0,17161910.0,10359680.0
13594,33627880000.0,3303033.0,17309190.0,10306110.0
17202,33636540000.0,3272135.0,17309190.0,10290660.0
22186,33650190000.0,3082669.0,17309190.0,10195930.0


Task 4.3 - Build a Regression Model to Predict Satisfaction Score

In [30]:
# Predictors are the two component scores; the target is the satisfaction score.
# NOTE(review): satisfaction_score is computed as the mean of these two
# features, so a linear model recovers it exactly and an R^2 of 1.0 is expected
# by construction rather than evidence of predictive power.
features = df_merged.loc[:, ['engagement_score', 'experience_score']]
target = df_merged.loc[:, 'satisfaction_score']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Fit ordinary least squares on the training split.
model = LinearRegression().fit(X_train, y_train)

# Report the coefficient of determination on the held-out split.
print(f"Model R^2: {model.score(X_test, y_test)}")


Model R^2: 1.0


Task 4.4 - K-Means Clustering on Engagement & Experience Score (k=2)

In [33]:
# Split customers into two groups based on their engagement and experience scores.
satisfaction_inputs = df_merged[['engagement_score', 'experience_score']]
kmeans_satisfaction = KMeans(n_clusters=2, random_state=42)
df_merged['satisfaction_cluster'] = kmeans_satisfaction.fit_predict(satisfaction_inputs)

# Preview the cluster assigned to the first ten rows.
df_merged[['MSISDN/Number', 'satisfaction_cluster']].head(n=10)


Unnamed: 0,MSISDN/Number,satisfaction_cluster
0,33601000000.0,1
1,33601000000.0,1
2,33601000000.0,1
3,33601010000.0,0
4,33601010000.0,1
5,33601010000.0,0
6,33601010000.0,1
7,33601010000.0,0
8,33601010000.0,1
9,33601020000.0,1


Task 4.5 - Aggregate Average Satisfaction & Experience Score per Cluster

In [34]:
# Mean satisfaction and experience score within each satisfaction cluster,
# with the cluster label returned as an ordinary column.
cluster_aggregation = (
    df_merged
    .groupby('satisfaction_cluster')[['satisfaction_score', 'experience_score']]
    .mean()
    .reset_index()
)

print(cluster_aggregation)


   satisfaction_cluster  satisfaction_score  experience_score
0                     0        2.364155e+06      4.644981e+06
1                     1        8.625313e+06      1.717127e+07


In [41]:
# Load variables from a local .env file (if one exists) into the process
# environment before reading them; by default load_dotenv() leaves variables
# that are already set untouched.
load_dotenv()

# PostgreSQL connection settings, read from the environment so that no
# credentials are stored in the notebook.
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")
DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")

# Fail fast with a clear message instead of building a connection URL that
# contains the literal text "None".
_missing_db_settings = [
    name
    for name, value in {
        "DB_HOST": DB_HOST,
        "DB_PORT": DB_PORT,
        "DB_NAME": DB_NAME,
        "DB_USER": DB_USER,
        "DB_PASSWORD": DB_PASSWORD,
    }.items()
    if not value
]
if _missing_db_settings:
    raise RuntimeError(
        "Missing database settings (set them in the environment or a .env file): "
        + ", ".join(_missing_db_settings)
    )

In [45]:

# Build the PostgreSQL connection URL. The user name and password are
# percent-encoded so that characters such as '@', ':' or '/' inside them are
# not mistaken for URL delimiters.
connection_string = (
    f"postgresql+psycopg2://{quote_plus(DB_USER)}:{quote_plus(DB_PASSWORD)}"
    f"@{DB_HOST}:{DB_PORT}/{DB_NAME}"
)

# Create an SQLAlchemy engine for that URL.
engine = create_engine(connection_string)

# Export the score columns to a PostgreSQL table. if_exists='replace' drops and
# recreates the table, so re-running the cell does not append duplicate rows.
df_merged[['MSISDN/Number', 'engagement_score', 'experience_score', 'satisfaction_score']].to_sql(
    'customer_satisfaction',
    con=engine,
    if_exists='replace',
    index=False
)

# Run a SELECT query to verify that the rows were written.
result = pd.read_sql("SELECT * FROM customer_satisfaction LIMIT 10", con=engine)
result.head()


Unnamed: 0,MSISDN/Number,engagement_score,experience_score,satisfaction_score
0,33601000000.0,10582.618634,17309190.0,8659887.0
1,33601000000.0,75093.021371,17309190.0,8692142.0
2,33601000000.0,28831.912946,17309190.0,8669012.0
3,33601010000.0,56259.513641,3696553.0,1876406.0
4,33601010000.0,69036.50588,11105680.0,5587357.0
