## User Experience Analysis

#### Importing libraries

In [2]:
import logging
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns

#### Read the cleaned CSV data

In [4]:
# Load the cleaned dataset. Only I/O and parse failures are caught; they are
# logged and re-raised so the cell stops with the real cause instead of
# continuing into a NameError on the undefined `df` below.
try:
    df = pd.read_csv('../data/clean_data.csv')
except (OSError, ValueError):
    # OSError covers a missing or unreadable file; pandas' parser and
    # empty-data errors derive from ValueError.
    logging.error('either file not found or wrong format')
    raise

print(df.shape)
# Keep only fully populated rows for the analysis that follows.
df = df.dropna()
df.info()

(150001, 45)
<class 'pandas.core.frame.DataFrame'>
Index: 120739 entries, 0 to 149999
Data columns (total 45 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   Bearer Id                       120739 non-null  float64
 1   Start                           120739 non-null  object 
 2   Start ms                        120739 non-null  float64
 3   End                             120739 non-null  object 
 4   End ms                          120739 non-null  float64
 5   Dur. (ms)                       120739 non-null  float64
 6   IMSI                            120739 non-null  float64
 7   MSISDN/Number                   120739 non-null  float64
 8   IMEI                            120739 non-null  float64
 9   Last Location Name              120739 non-null  object 
 10  Avg RTT DL (ms)                 120739 non-null  float64
 11  Avg RTT UL (ms)                 120739 non-null  float64
 12  Avg Bear

#### Aggregate the following information per customer (treat missing values and outliers by replacing them with the mean or the mode of the corresponding variable):
- Average TCP retransmission
- Average RTT
- Handset type
- Average throughput


In [None]:
# Replace missing values in each column with the mean (numeric columns) or
# the mode (categorical column). The filled column is assigned back to df
# instead of calling fillna(..., inplace=True) on df[col]: that chained form
# operates on an intermediate object, triggers a warning in recent pandas
# releases and leaves df unchanged under Copy-on-Write.
# NOTE(review): the df.info() output earlier in this notebook lists names
# such as 'Avg RTT DL (ms)'; confirm these four columns actually exist in df.
for col in ('TCP_retransmission', 'RTT', 'throughput'):
    df[col] = df[col].fillna(df[col].mean())

# mode() returns an empty Series when the column holds no values at all;
# skip the fill in that case rather than failing on the missing first entry.
handset_modes = df['Handset_type'].mode()
if not handset_modes.empty:
    df['Handset_type'] = df['Handset_type'].fillna(handset_modes.iloc[0])

In [None]:
from scipy.stats import zscore

# Columns that are screened for outliers.
numeric_cols = ['TCP_retransmission', 'RTT', 'throughput']

# Calculate z-scores for each numeric column
z_scores = zscore(df[numeric_cols])

# Set a threshold for identifying outliers (e.g., 3 standard deviations)
threshold = 3

# Cell-wise outlier flags, one column per entry of numeric_cols. np.asarray
# makes positional column slicing below work whether zscore returned an
# ndarray or a DataFrame.
is_outlier = np.abs(np.asarray(z_scores)) > threshold

# Row-level flag: True when any of the three values in the row is an outlier.
outliers = is_outlier.any(axis=1)

# Replace each outlying value with the mean of its own column. The means are
# taken over the three numeric columns only, because df.mean() on the whole
# frame also visits the object-typed columns and fails on pandas >= 2.0.
# Only the offending cell is overwritten, so valid measurements that share a
# row with an outlier are preserved.
col_means = df[numeric_cols].mean()
for pos, col in enumerate(numeric_cols):
    df.loc[is_outlier[:, pos], col] = col_means[col]

In [None]:
# Build one row per customer: the mean of each numeric metric and the most
# frequent handset type.
# NOTE(review): the visible part of the df.info() output in this notebook
# shows no 'Customer_ID' column; confirm the grouping key exists in df.
def _most_frequent(values):
    """Return the first modal value of a Series (mode for a categorical)."""
    return values.mode().iloc[0]


per_customer = df.groupby('Customer_ID')
customer_aggregated = per_customer.agg(
    TCP_retransmission=('TCP_retransmission', 'mean'),
    RTT=('RTT', 'mean'),
    Handset_type=('Handset_type', _most_frequent),
    throughput=('throughput', 'mean'),
)
# Turn the Customer_ID index back into an ordinary column.
customer_aggregated = customer_aggregated.reset_index()


print(customer_aggregated)
