# Import Libraries

In [32]:
import pandas as pd 
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Read CSV files

In [33]:
# Load the experience analytics table from the CSV in the working directory
df_experience = pd.read_csv(filepath_or_buffer='Experiance_Analytics_data.csv')

In [34]:
# Load the user engagement table from the CSV in the working directory
df_engagement = pd.read_csv(filepath_or_buffer='user_engagement.csv')

In [35]:
# Build the engagement feature matrix: every column except the subscriber identifier.
# The clustering cell below writes 'cluster' and 'engagement_score' back onto
# df_engagement, so those are dropped here as well (only when present). Without this,
# re-running the cell on an already-scored frame would feed the previous results
# back in as clustering features.
features = (
    df_engagement
    .drop(columns=['MSISDN/Number'])
    .drop(columns=['cluster', 'engagement_score'], errors='ignore')
)

In [36]:
# Standardize the features: fit the scaler, then apply it to the same data
scaler = StandardScaler()
scaler.fit(features)
df_scaled = scaler.transform(features)

# Perform k-means clustering (k=2); the fitted labels are the per-row cluster ids
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(df_scaled)
df_engagement['cluster'] = kmeans.labels_

# Pick the less engaged cluster: the one whose centroid has the smallest
# sum of (standardized) feature values
centroids = kmeans.cluster_centers_
less_engaged_cluster = centroids.sum(axis=1).argmin()

# Engagement score = Euclidean distance from each user to that centroid
less_engaged_centroid = centroids[less_engaged_cluster]
df_engagement['engagement_score'] = np.linalg.norm(
    df_scaled - less_engaged_centroid,
    axis=1,
)
print(df_engagement.loc[:, ['MSISDN/Number', 'cluster', 'engagement_score']])

       MSISDN/Number  cluster  engagement_score
0       0.000000e+00        0        791.019742
1       3.360100e+10        1          0.879020
2       3.360100e+10        1          1.099152
3       3.360100e+10        1          0.744604
4       3.360101e+10        1          1.205819
...              ...      ...               ...
95250   3.378996e+10        1          0.903529
95251   3.378997e+10        1          0.861366
95252   3.378998e+10        1          0.811474
95253   3.379000e+10        1          1.099789
95254   3.379000e+10        1          0.832592

[95255 rows x 3 columns]


In [37]:
# Build the experience feature matrix by removing the identifier and the
# non-numeric handset columns.
# 'cluster' and 'experience_score' are only written to df_experience by the
# clustering cell that follows, so they may or may not exist at this point
# (fresh kernel vs. re-run). They are dropped only when present, so this cell
# neither raises KeyError on a fresh run nor reuses old results as features.
df_features = (
    df_experience
    .drop(columns=['MSISDN/Number', 'Handset Manufacturer', 'Handset Type'])
    .drop(columns=['cluster', 'experience_score'], errors='ignore')
)

In [38]:
# Standardize the features: fit the scaler, then apply it to the same data
scaler = StandardScaler()
scaler.fit(df_features)
df_scaled = scaler.transform(df_features)

# Perform k-means clustering (k=2); the fitted labels are the per-row cluster ids
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(df_scaled)
df_experience['cluster'] = kmeans.labels_

# Pick the worst experience cluster: the one whose centroid has the largest
# sum of (standardized) feature values
centroids = kmeans.cluster_centers_
worst_experience_cluster = centroids.sum(axis=1).argmax()

# Experience score = Euclidean distance from each user to that centroid
worst_experience_centroid = centroids[worst_experience_cluster]
df_experience['experience_score'] = np.linalg.norm(
    df_scaled - worst_experience_centroid,
    axis=1,
)

print(df_experience.loc[:, ['MSISDN/Number', 'cluster', 'experience_score']])

       MSISDN/Number  cluster  experience_score
0       0.000000e+00        1          3.737168
1       3.360100e+10        1          5.703688
2       3.360100e+10        1          5.725966
3       3.360100e+10        1          5.943282
4       3.360101e+10        1          5.257259
...              ...      ...               ...
95250   3.378996e+10        1          5.929549
95251   3.378997e+10        1          5.599080
95252   3.378998e+10        1          5.926338
95253   3.379000e+10        1          4.826170
95254   3.379000e+10        1          5.649338

[95255 rows x 3 columns]


In [39]:
# Join the per-user engagement and experience scores on the subscriber identifier
df_merged = df_engagement[['MSISDN/Number', 'engagement_score']].merge(
    df_experience[['MSISDN/Number', 'experience_score']],
    how='inner',
    on='MSISDN/Number',
)

In [40]:
# Calculate the satisfaction score as the average of the two scores
df_merged['satisfaction_score'] = (
    df_merged['engagement_score']
    .add(df_merged['experience_score'])
    .div(2)
)

In [41]:
# Combine the raw engagement and experience features (identifiers, handset
# columns, cluster labels and scores removed) into one predictor table.
# The two frames are joined row-by-row on their index.
df_new_features = df_engagement.drop(
    columns=['MSISDN/Number', 'cluster', 'engagement_score']
).merge(
    df_experience.drop(
        columns=['MSISDN/Number', 'Handset Manufacturer', 'Handset Type', 'cluster', 'experience_score']
    ),
    left_index=True,
    right_index=True,
)

In [42]:
#Import some more important libraries 
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score

In [43]:
# Perform PCA: project the combined feature table onto its first two components.
# fit() alone returns the fitted estimator, not data, so fit_transform() is used:
# `pca` keeps the fitted model and `principal_data` holds the projected coordinates
# (one row per input row, one column per component).
# NOTE(review): the features are passed in unstandardized, so large-scale columns
# will dominate the components; confirm this is intended.
pca = PCA(n_components=2)
principal_data = pca.fit_transform(df_new_features)

In [44]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
x, y = df_new_features, df_merged['satisfaction_score']
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.2,
)

In [45]:
# Train a LinearRegression model on the training split.
# fit() returns the estimator itself, so model_regressor and regressor
# refer to the same fitted object.
regressor = LinearRegression()
regressor.fit(x_train, y_train)
model_regressor = regressor

# Predict on the test set
y_pred = model_regressor.predict(x_test)

In [46]:
# Evaluate the model on the held-out test set
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"Mean squared Error:{mse}", f"R2 score:{r2}", sep="\n")

Mean squared Error:0.4507727201173588
R2 score:0.5828576098306918


In [47]:
# Display the fitted coefficient for each feature, labelled by column name
coefficients = pd.DataFrame(
    {'Coefficient': regressor.coef_},
    index=x.columns,
)
print(coefficients)

                              Coefficient
Number_of_xDR_Sessions      -4.221040e-21
Session_Duration             1.156400e-06
Social_Media_Total           3.890991e-08
Google_Total                 8.662677e-09
Email_Total                  4.001614e-08
Youtube_Total               -1.387061e-09
Netflix_Total               -1.771486e-09
Gaming_Total                -7.454197e-09
Other_Total                  2.020304e-10
total_traffic                7.733704e-09
Avg RTT DL (ms)              1.657518e-09
Avg RTT UL (ms)             -2.207566e-09
Avg Bearer TP DL (kbps)     -1.068248e-05
Avg Bearer TP UL (kbps)     -6.816850e-07
TCP DL Retrans. Vol (Bytes)  4.964973e-08
TCP UL Retrans. Vol (Bytes)  1.012316e-06
Activity Duration UL (ms)    3.777019e-08
Activity Duration DL (ms)    5.792618e-08
Avg RTT DL                  -5.500481e-10
Avg Bearer TP               -1.136416e-05
Avg TCP Retrans              5.262064e-08
Avg Activity Duration       -2.531423e-08
Avg RTT                     -5.500

In [48]:
# Combine engagement and experience scores into the clustering input
df_scores = df_merged.loc[:, ['engagement_score', 'experience_score']]

# Perform k-means clustering (k=2) on the two scores; the fitted labels
# are stored as each row's score cluster
kmeans_score = KMeans(n_clusters=2, random_state=42)
kmeans_score.fit(df_scores)
df_merged['score_cluster'] = kmeans_score.labels_

print(df_merged)

       MSISDN/Number  engagement_score  experience_score  satisfaction_score  \
0       0.000000e+00        791.019742          3.737168          397.378455   
1       3.360100e+10          0.879020          5.703688            3.291354   
2       3.360100e+10          1.099152          5.725966            3.412559   
3       3.360100e+10          0.744604          5.943282            3.343943   
4       3.360101e+10          1.205819          5.257259            3.231539   
...              ...               ...               ...                 ...   
95250   3.378996e+10          0.903529          5.929549            3.416539   
95251   3.378997e+10          0.861366          5.599080            3.230223   
95252   3.378998e+10          0.811474          5.926338            3.368906   
95253   3.379000e+10          1.099789          4.826170            2.962979   
95254   3.379000e+10          0.832592          5.649338            3.240965   

       score_cluster  
0               

In [49]:
# Average satisfaction and experience score within each score cluster,
# with the cluster id returned as a regular column
cluster_aggregation = (
    df_merged
    .groupby('score_cluster')[['satisfaction_score', 'experience_score']]
    .agg('mean')
    .reset_index()
)
print(cluster_aggregation)

   score_cluster  satisfaction_score  experience_score
0              0            4.370431          4.771973
1              1            3.368777          5.612087


In [50]:
# Attach the target to the feature table for export.
# x was bound to df_new_features itself (not a copy), so assigning the column in
# place would also add the target to df_new_features and leak it into any later
# PCA / split / fit that is re-run. assign() returns a new frame instead, so only
# the name x is rebound and df_new_features stays a pure feature table.
x = x.assign(satisfaction_score=y)

In [53]:
# Write the features plus target to disk, without the row index
x.to_csv(path_or_buf='make_regressor.csv', index=False)