# Task-4

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sqlalchemy import create_engine
from dotenv import load_dotenv
import os
import sys
sys.path.append(os.path.abspath(os.path.join('..')))
from scripts.user_satisfaction_analysis import *
from scripts.user_experience_analysis import aggregate_customer_data, perform_kmeans_clustering as experience_cluster
from scripts.user_engagement_analysis import aggregate_metrics_per_customer, normalize_and_cluster as engagement_cluster

# Load environment variables from .env file

In [4]:

# Load the variables defined in the dotenv file at this relative path.
load_dotenv(dotenv_path='../venv/venv')

True

# Retrieve database connection details from environment variables

In [5]:
# Look up all five database connection settings in one pass.
# Each name is None when its environment variable is not set.
db_user, db_password, db_host, db_port, db_name = map(
    os.getenv,
    ('DB_USER', 'DB_PASSWORD', 'DB_HOST', 'DB_PORT', 'DB_NAME'),
)

# Create the connection string and the database engine

In [None]:
from urllib.parse import quote

# Fail fast with a clear message when a setting is missing; otherwise the
# literal text "None" would be interpolated into the connection URL.
_connection_settings = {
    'DB_USER': db_user,
    'DB_PASSWORD': db_password,
    'DB_HOST': db_host,
    'DB_PORT': db_port,
    'DB_NAME': db_name,
}
_missing_settings = [
    name for name, value in _connection_settings.items() if value is None
]
if _missing_settings:
    raise RuntimeError(
        f"Missing database environment variable(s): {', '.join(_missing_settings)}"
    )

# Percent-encode the credentials so that characters such as '@', ':' or '/'
# in the user name or password cannot be mistaken for URL delimiters.
connection_string = (
    f'postgresql+psycopg2://{quote(db_user, safe="")}:{quote(db_password, safe="")}'
    f'@{db_host}:{db_port}/{db_name}'
)
engine = create_engine(connection_string)

# Query the data

In [None]:
# Read every row and column of the xdr_data_cleaned table into a DataFrame.
query = 'SELECT * FROM xdr_data_cleaned'
data_cleaned = pd.read_sql(sql=query, con=engine)

# Display the data

In [None]:
# Print the first five rows as a quick sanity check of the loaded table.
print(data_cleaned.head(n=5))

# Assign an Engagement Score and an Experience Score to each user

In [None]:
def aggregate_metrics_per_customer(
    data_cleaned: pd.DataFrame,
) -> "tuple[pd.DataFrame, dict[str, pd.DataFrame]]":
    """Aggregate session-level records into per-customer engagement metrics.

    NOTE: this definition shadows the function of the same name imported
    from ``scripts.user_engagement_analysis`` at the top of the file.

    Args:
        data_cleaned: Session-level records. Must contain the columns
            'MSISDN/Number', 'Bearer Id', 'Dur. (ms)', 'Total DL (Bytes)'
            and 'Total UL (Bytes)'.

    Returns:
        A ``(aggregated, top_10)`` tuple.

        ``aggregated`` has one row per distinct 'MSISDN/Number' and the
        columns 'MSISDN', 'Sessions' (count of non-null 'Bearer Id'),
        'Duration', 'DL Traffic', 'UL Traffic' (sums) and 'Total Traffic'
        (DL + UL).

        ``top_10`` maps each of 'Sessions', 'Duration' and 'Total Traffic'
        to the (up to) ten rows of ``aggregated`` with the largest values
        of that metric.

    Raises:
        KeyError: If any required column is absent from ``data_cleaned``.
    """
    required_columns = [
        'MSISDN/Number',
        'Bearer Id',
        'Dur. (ms)',
        'Total DL (Bytes)',
        'Total UL (Bytes)',
    ]
    missing = [col for col in required_columns if col not in data_cleaned.columns]
    if missing:
        raise KeyError(f"data_cleaned is missing required column(s): {missing}")

    # Named aggregation ties every output column to its source column, so the
    # result cannot be mislabelled the way a positional rename could.
    aggregated = (
        data_cleaned.groupby('MSISDN/Number')
        .agg(**{
            'Sessions': ('Bearer Id', 'count'),
            'Duration': ('Dur. (ms)', 'sum'),
            'DL Traffic': ('Total DL (Bytes)', 'sum'),
            'UL Traffic': ('Total UL (Bytes)', 'sum'),
        })
        .reset_index()
        .rename(columns={'MSISDN/Number': 'MSISDN'})
    )
    aggregated['Total Traffic'] = aggregated['DL Traffic'] + aggregated['UL Traffic']

    top_10 = {
        metric: aggregated.nlargest(10, metric)
        for metric in ('Sessions', 'Duration', 'Total Traffic')
    }

    return aggregated, top_10

In [None]:
# Aggregate metrics per customer and keep the top-10 tables per metric.
aggregation_result = aggregate_metrics_per_customer(data_cleaned)
aggregated_data, top_10_customers = aggregation_result

In [None]:
# Engagement analysis
# Re-runs the per-customer aggregation on the same input (the top-10 tables
# are discarded into `_`), then passes the aggregated frame to the imported
# normalize_and_cluster helper (aliased as engagement_cluster).
engagement_data, _ = aggregate_metrics_per_customer(data_cleaned)
engagement_normalized, engagement_kmeans = engagement_cluster(engagement_data)

In [None]:
# Experience analysis: aggregate per customer, then cluster the aggregate.
# The clustering helper's first return value replaces the aggregated frame.
experience_data, experience_kmeans = experience_cluster(
    aggregate_customer_data(data_cleaned)
)

In [None]:
# Check the shapes of engagement_data and experience_data
print(
    f"Engagement data shape: {engagement_data.shape}",
    f"Experience data shape: {experience_data.shape}",
    sep="\n",
)

In [None]:
# Inspect the first rows of the engagement data to check MSISDN alignment
engagement_data.head(n=5)

In [None]:
# Inspect the first rows of the experience data to check MSISDN alignment
experience_data.head(n=5)

In [None]:
# Column names used as clustering inputs for each analysis.
engagement_features = [
    'Sessions',
    'Duration',
    'Total Traffic',
]
experience_features = [
    'Avg TCP Retrans',
    'Avg RTT',
    'Avg Throughput',
]

In [None]:
# Normalize the data for clustering
# Import StandardScaler explicitly instead of relying on it leaking through
# the star import from scripts.user_satisfaction_analysis.
from sklearn.preprocessing import StandardScaler

# One scaler per feature set, so each keeps the mean/variance it was fitted
# on and can later transform or inverse-transform its own features.
engagement_scaler = StandardScaler()
experience_scaler = StandardScaler()
engagement_normalized = engagement_scaler.fit_transform(engagement_data[engagement_features])
experience_normalized = experience_scaler.fit_transform(experience_data[experience_features])

# Backward compatibility: `scaler` previously ended up fitted on the
# experience features (its last fit), so keep that name pointing there.
scaler = experience_scaler

In [None]:
# Create and fit one 3-cluster KMeans model per normalized feature set.
# fit() returns the fitted estimator, so construction and fitting are chained.
engagement_kmeans = KMeans(n_clusters=3, random_state=42).fit(engagement_normalized)
experience_kmeans = KMeans(n_clusters=3, random_state=42).fit(experience_normalized)

In [None]:
# Give both per-customer frames a clean 0..n-1 index.
engagement_data = engagement_data.reset_index(drop=True)
experience_data = experience_data.reset_index(drop=True)

# Take the customer identifiers from the aggregated frame (one row per
# customer) rather than from data_cleaned, which has one row per session and
# therefore neither the same length nor the same order as the scores.
# NOTE(review): assumes experience_data rows are in the same customer order
# as engagement_data -- confirm against aggregate_customer_data.
msisdn_column = engagement_data['MSISDN']

In [None]:
# Select the feature columns for each analysis, then score every customer.
# NOTE(review): the KMeans models were fitted on standardized features while
# the raw feature columns are passed here -- confirm that assign_scores
# standardizes its inputs before measuring distances to the centroids.
engagement_matrix = engagement_data[engagement_features]
experience_matrix = experience_data[experience_features]
scores_data = assign_scores(
    engagement_matrix,
    experience_matrix,
    engagement_kmeans,
    experience_kmeans,
    msisdn_column,
)

scores_data

# Calculate the Satisfaction Score

In [None]:
# Compute satisfaction scores; the helper returns the updated scores table
# together with the top-10 table, which is displayed below.
satisfaction_result = calculate_satisfaction_scores(scores_data)
scores_data, top_10_satisfied = satisfaction_result
print("Top 10 Satisfied Customers:")
top_10_satisfied

# Build a Regression Model

In [None]:
# Fit the regression model on the scores table and report its two metrics.
model, mse, r2 = build_regression_model(scores_data)
print(
    f"Model MSE: {mse}",
    f"Model R2 Score: {r2}",
    sep="\n",
)

In [None]:
# Cluster customers on their scores and report the per-cluster averages.
clustered_data, avg_satisfaction, avg_experience = cluster_satisfaction(scores_data)
print(
    f"Average Satisfaction Scores per Cluster: {avg_satisfaction}",
    f"Average Experience Scores per Cluster: {avg_experience}",
    sep="\n",
)

# Draw the cluster plot and render it.
fig = plot_clusters(clustered_data)
plt.show()