In [1]:
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../scripts')))
from utils import missing_values_table, fix_outlier, convert_bytes_to_megabytes, convert_ms_to_seconds


### calling load_data_from_postgres function 

In [4]:
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../scripts')))

from load_data import load_data_from_postgres

In [5]:
query = "SELECT * FROM xdr_data"

In [6]:
df_postgres = load_data_from_postgres(query)
print("Data loaded using psycopg2:")

Data loaded using psycopg2:


### filtering columns that are needed for user experience analysis

In [7]:
user_experience_columns = [
    'IMSI',
    'Handset Type',
    'Avg RTT DL (ms)',
    'Avg RTT UL (ms)',
    'Avg Bearer TP DL (kbps)',
    'Avg Bearer TP UL (kbps)',
    'TCP DL Retrans. Vol (Bytes)',
    'TCP UL Retrans. Vol (Bytes)'
]


In [8]:
# Create the df_user_experience DataFrame with the selected columns
df_user_experience = df_postgres[user_experience_columns].copy()

### EDA on the extracted data

In [10]:
df_user_experience.head()

Unnamed: 0,IMSI,Handset Type,Avg RTT DL (ms),Avg RTT UL (ms),Avg Bearer TP DL (kbps),Avg Bearer TP UL (kbps),TCP DL Retrans. Vol (Bytes),TCP UL Retrans. Vol (Bytes)
0,208201400000000.0,Samsung Galaxy A5 Sm-A520F,42.0,5.0,23.0,44.0,,
1,208201900000000.0,Samsung Galaxy J5 (Sm-J530),65.0,5.0,16.0,26.0,,
2,208200300000000.0,Samsung Galaxy A8 (2018),,,6.0,9.0,,
3,208201400000000.0,undefined,,,44.0,44.0,,
4,208201400000000.0,Samsung Sm-G390F,,,6.0,9.0,,


In [11]:
df_user_experience.shape

(150001, 8)

In [12]:
df_user_experience.info

<bound method DataFrame.info of                 IMSI                 Handset Type  Avg RTT DL (ms)  \
0       2.082014e+14   Samsung Galaxy A5 Sm-A520F             42.0   
1       2.082019e+14  Samsung Galaxy J5 (Sm-J530)             65.0   
2       2.082003e+14     Samsung Galaxy A8 (2018)              NaN   
3       2.082014e+14                    undefined              NaN   
4       2.082014e+14             Samsung Sm-G390F              NaN   
...              ...                          ...              ...   
149996  2.082022e+14  Apple iPhone 8 Plus (A1897)             32.0   
149997  2.082019e+14      Apple iPhone Se (A1723)             27.0   
149998  2.082017e+14      Apple iPhone Xs (A2097)             43.0   
149999  2.082021e+14               Huawei Fig-Lx1             37.0   
150000           NaN                         None              NaN   

        Avg RTT UL (ms)  Avg Bearer TP DL (kbps)  Avg Bearer TP UL (kbps)  \
0                   5.0                     23.0  

### checking for missing values using imported function missing_values_table

In [13]:
missing_values_table(df_user_experience)


Your selected dataframe has 8 columns.
There are 8 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
TCP UL Retrans. Vol (Bytes),96649,64.4
TCP DL Retrans. Vol (Bytes),88146,58.8
Avg RTT DL (ms),27829,18.6
Avg RTT UL (ms),27812,18.5
Handset Type,572,0.4
IMSI,570,0.4
Avg Bearer TP DL (kbps),1,0.0
Avg Bearer TP UL (kbps),1,0.0


## cleaning the data by using different techniques 

In [15]:
df_user_experience.dropna(subset=['IMSI'], inplace=True)
df_user_experience.dropna(subset=['Handset Type'], inplace=True)
missing_values_table(df_user_experience)


Your selected dataframe has 8 columns.
There are 4 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
TCP UL Retrans. Vol (Bytes),96432,64.5
TCP DL Retrans. Vol (Bytes),87937,58.8
Avg RTT DL (ms),27693,18.5
Avg RTT UL (ms),27675,18.5


### Filling the missing Values of the above fields with the mean of the respective columns

In [18]:
# Calculate mean values
mean_rtt_dl = df_user_experience['TCP UL Retrans. Vol (Bytes)'].mean()
mean_rtt_ul = df_user_experience['TCP DL Retrans. Vol (Bytes)'].mean()
mean_rtt_dl = df_user_experience['Avg RTT UL (ms)'].mean()
mean_rtt_ul = df_user_experience['Avg RTT DL (ms)'].mean()

# Fill missing values with mean
df_user_experience['Avg RTT DL (ms)'].fillna(mean_rtt_dl, inplace=True)
df_user_experience['Avg RTT UL (ms)'].fillna(mean_rtt_ul, inplace=True)
df_user_experience['TCP UL Retrans. Vol (Bytes)'].fillna(mean_rtt_dl, inplace=True)
df_user_experience['TCP DL Retrans. Vol (Bytes)'].fillna(mean_rtt_ul, inplace=True)

In [19]:
missing_values_table(df_user_experience)

Your selected dataframe has 8 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


## fixing outliers 

In [20]:
user_experience_columns_to_apply_quartiles = ['Avg RTT DL (ms)',
    'Avg RTT UL (ms)',
    'Avg Bearer TP DL (kbps)',
    'Avg Bearer TP UL (kbps)',
    'TCP DL Retrans. Vol (Bytes)',
    'TCP UL Retrans. Vol (Bytes)']

In [22]:
# Apply the fix_outlier function to each specified column
for column in user_experience_columns_to_apply_quartiles:
    if column in df_user_experience.columns:
        df_user_experience = fix_outlier(df_user_experience, column)

## Formatting the data

### Byte to Megabyte conversion

In [23]:
byte_columns = [
    'TCP DL Retrans. Vol (Bytes)',
    'TCP UL Retrans. Vol (Bytes)']


In [24]:
for column in byte_columns:
    if column in df_user_experience.columns:
        df_user_experience[column] = df_user_experience[column].apply(convert_bytes_to_megabytes)

### converting milliseconds to seconds

In [25]:
millisecond_columns = [
    'Avg RTT DL (ms)',
    'Avg RTT UL (ms)'
]

In [27]:
for column in millisecond_columns:
    if column in df_user_experience.columns:
        df_user_experience[column] = df_user_experience[column].apply(convert_ms_to_seconds)

### renaming the columns

In [28]:
df_user_experience.rename(columns=lambda x: x.replace('Bytes', 'Megabytes') if 'Bytes' in x else x, inplace=True)
df_user_experience.rename(columns=lambda x: x.replace('(ms)', '(s)') if '(ms)' in x else x, inplace=True)

In [29]:
df_user_experience.head()

Unnamed: 0,IMSI,Handset Type,Avg RTT DL (s),Avg RTT UL (s),Avg Bearer TP DL (kbps),Avg Bearer TP UL (kbps),TCP DL Retrans. Vol (Megabytes),TCP UL Retrans. Vol (Megabytes)
0,208201400000000.0,Samsung Galaxy A5 Sm-A520F,0.042,0.005,23.0,44.0,0.135123,3.678427
1,208201900000000.0,Samsung Galaxy J5 (Sm-J530),0.065,0.005,16.0,26.0,0.135123,3.678427
2,208200300000000.0,Samsung Galaxy A8 (2018),764.055136,20826.089683,11.0,18.0,0.135123,3.678427
3,208201400000000.0,undefined,764.055136,20826.089683,44.0,44.0,0.135123,3.678427
4,208201400000000.0,Samsung Sm-G390F,764.055136,20826.089683,11.0,18.0,0.135123,3.678427
