In [14]:
import os

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

# Connection settings for the PostgreSQL instance that holds the xDR data.
# Every value can be overridden through a TELECOM_DB_* environment variable; the
# fallbacks are the original local-development values. Set TELECOM_DB_PASSWORD in
# the environment instead of relying on the fallback, so that real credentials
# never have to be written into the notebook.
database_name = os.getenv("TELECOM_DB_NAME", "telecom")
table_name = 'xdr_data'

connection_params = {
    "host": os.getenv("TELECOM_DB_HOST", "localhost"),
    "user": os.getenv("TELECOM_DB_USER", "postgres"),
    "password": os.getenv("TELECOM_DB_PASSWORD", "admin"),
    "port": os.getenv("TELECOM_DB_PORT", "5432"),
    "database": database_name,
}

# Build the URL with URL.create rather than string formatting: it escapes special
# characters (such as '@', ':' or '/') in the user name and password, which would
# otherwise produce a malformed connection URL.
engine = create_engine(
    URL.create(
        drivername="postgresql+psycopg2",
        username=connection_params["user"],
        password=connection_params["password"],
        host=connection_params["host"],
        port=int(connection_params["port"]),  # kept as a string in the dict, URL wants an int
        database=connection_params["database"],
    )
)


## Loading the data and screening it

In [28]:
# Load the whole xDR table into a DataFrame.
# pd.read_sql accepts either a plain SQL string or a SQLAlchemy selectable
# (select or text object); a string is used here.
# The table name comes from the `table_name` constant defined with the connection
# settings, so it is declared in one place only. It is a fixed notebook constant,
# not user input, so formatting it into the statement is safe.
sql_query = f'SELECT * FROM {table_name}'

df = pd.read_sql(sql_query, con=engine)

In [None]:
# Print a compact overview of the frame: column dtypes, non-null counts, memory usage.
df.info()
# Remove pandas' cap on the number of rows displayed, for the outputs that follow.
pd.options.display.max_rows = None

In [None]:
# Preview the data: show the ten leading rows of the frame.
df.head(n=10)

In [46]:
# Print the column labels of the loaded frame (shown as the pandas Index repr).
print(df.columns)

Index(['Bearer Id', 'Start', 'Start ms', 'End', 'End ms', 'Dur. (ms)', 'IMSI',
       'MSISDN/Number', 'IMEI', 'Last Location Name', 'Avg RTT DL (ms)',
       'Avg RTT UL (ms)', 'Avg Bearer TP DL (kbps)', 'Avg Bearer TP UL (kbps)',
       'TCP DL Retrans. Vol (Bytes)', 'TCP UL Retrans. Vol (Bytes)',
       'DL TP < 50 Kbps (%)', '50 Kbps < DL TP < 250 Kbps (%)',
       '250 Kbps < DL TP < 1 Mbps (%)', 'DL TP > 1 Mbps (%)',
       'UL TP < 10 Kbps (%)', '10 Kbps < UL TP < 50 Kbps (%)',
       '50 Kbps < UL TP < 300 Kbps (%)', 'UL TP > 300 Kbps (%)',
       'HTTP DL (Bytes)', 'HTTP UL (Bytes)', 'Activity Duration DL (ms)',
       'Activity Duration UL (ms)', 'Dur. (ms).1', 'Handset Manufacturer',
       'Handset Type', 'Nb of sec with 125000B < Vol DL',
       'Nb of sec with 1250B < Vol UL < 6250B',
       'Nb of sec with 31250B < Vol DL < 125000B',
       'Nb of sec with 37500B < Vol UL',
       'Nb of sec with 6250B < Vol DL < 31250B',
       'Nb of sec with 6250B < Vol UL < 37500B',


In [40]:
# Count the missing (null) entries in every column.
df.isnull().sum()

In [None]:
# Generate descriptive statistics for the frame, using pandas' default column selection.
df.describe()

In [None]:
# Print the dimensions of the dataframe as a (rows, columns) tuple.
print(df.shape)

## Task 2 sub tasks.

In [50]:
# Rank handset models by how many records mention them and keep the ten most frequent.
# NOTE(review): the saved output of this cell lists 'undefined' among the ten —
# presumably a placeholder for unidentified devices rather than a real handset;
# confirm whether it should be excluded before drawing conclusions.
top_10_handsets = df['Handset Type'].value_counts().iloc[:10]
print("Top 10 Handsets:", top_10_handsets, sep="\n")

Top 10 Handsets:
Handset Type
Huawei B528S-23A                19752
Apple iPhone 6S (A1688)          9419
Apple iPhone 6 (A1586)           9023
undefined                        8987
Apple iPhone 7 (A1778)           6326
Apple iPhone Se (A1723)          5187
Apple iPhone 8 (A1905)           4993
Apple iPhone Xr (A2105)          4568
Samsung Galaxy S8 (Sm-G950F)     4520
Apple iPhone X (A1901)           3813
Name: count, dtype: int64


In [52]:
# Rank manufacturers by record count and keep the three most frequent.
top_3_manufacturers = df['Handset Manufacturer'].value_counts().iloc[:3]

print("Top 3 Handset Manufacturers:", top_3_manufacturers, sep="\n")

Top 3 Handset Manufacturers:
Handset Manufacturer
Apple      59565
Samsung    40839
Huawei     34423
Name: count, dtype: int64


In [57]:
# For each of the three most frequent manufacturers, list its five most frequent handsets.
# Note: this rebinds top_3_manufacturers from the counts Series built in the previous
# cell to a plain list of manufacturer names.
top_3_manufacturers = df['Handset Manufacturer'].value_counts().index[:3].tolist()
filtered_df = df.loc[df['Handset Manufacturer'].isin(top_3_manufacturers)]
top_5_handsets_per_manufacturer = (
    filtered_df
    .groupby('Handset Manufacturer')['Handset Type']
    .value_counts()  # per-manufacturer handset counts, most frequent first
    .groupby(level='Handset Manufacturer')
    .head(5)         # keep the first five rows of every manufacturer
)

print("Top 5 Handsets per Top 3 of the Handset Manufacturers:", top_5_handsets_per_manufacturer, sep="\n")

Top 5 Handsets per Top 3 of the Handset Manufacturers:
Handset Manufacturer  Handset Type                  
Apple                 Apple iPhone 6S (A1688)            9419
                      Apple iPhone 6 (A1586)             9023
                      Apple iPhone 7 (A1778)             6326
                      Apple iPhone Se (A1723)            5187
                      Apple iPhone 8 (A1905)             4993
Huawei                Huawei B528S-23A                  19752
                      Huawei E5180                       2079
                      Huawei P20 Lite Huawei Nova 3E     2021
                      Huawei P20                         1480
                      Huawei Y6 2018                      997
Samsung               Samsung Galaxy S8 (Sm-G950F)       4520
                      Samsung Galaxy A5 Sm-A520F         3724
                      Samsung Galaxy J5 (Sm-J530)        3696
                      Samsung Galaxy J3 (Sm-J330)        3484
                      Sa

In [58]:
# For the three least frequent manufacturers, list up to five of their least
# frequent handsets.
# NOTE(review): in the saved output all three manufacturers have a single record.
# If more manufacturers share that count, which three land in the tail depends on
# how value_counts orders ties — confirm this is the intended "bottom 3".
bottom_3_manufacturers = df['Handset Manufacturer'].value_counts().tail(3).index.tolist()
filtered_df = df[df['Handset Manufacturer'].isin(bottom_3_manufacturers)]
bottom_5_handsets_per_manufacturer = (
    filtered_df
    .groupby('Handset Manufacturer')['Handset Type']
    .value_counts()  # per-manufacturer handset counts, most frequent first
    .groupby('Handset Manufacturer')
    .tail(5)         # keep the last (least frequent) five rows of every manufacturer
)

# Heading spelling corrected ("Botttom" -> "Bottom").
print("Bottom 5 Handsets per bottom 3 of the Handset Manufacturers:")
print(bottom_5_handsets_per_manufacturer)

Botttom 5 Handsets per bottom 3 of the Handset Manufacturers:
Handset Manufacturer              Handset Type                    
Mobiwire Sas                      Mobiwire Sas Mobiwire Auriga        1
Shenzhen Gaoxinqi                 Shenzhen Gaoxinq. Gaoxinqi Et110    1
Vastking Technology (Hk) Limited  Vastking Technol. Vastking M750     1
Name: count, dtype: int64


In [None]:
# Interpretation:
# The most used handset is the Huawei B528S-23A. It is also the only Huawei handset that appears in the top 10 most used handsets.
# iPhone handsets seem to have the highest number of users, as they take up 7 positions in the top 10 most used handsets.
# The top three handset manufacturers are Apple, Samsung and Huawei, in that order, each accounting for over 34,400 records in the dataset.
# For each of the top 3 manufacturers, the leading handset also appears in the top 10 most used handsets. In addition, all of Apple's top 5 handsets
# appear in the top 10 most used handsets overall.

# Recommendations:
# The marketing team should consider partnering with the handset manufacturers, in a mutually beneficial way, to offer promotions to customers using the top 10 handsets and so improve customer retention.
# They could also do the same for the least used handsets in order to broaden the customer base.


# Task 2.1
## Overview of the users' behaviour.

In [60]:
# Columns kept for the application-usage analysis: the 'Bearer Id' key, the device
# descriptors, and the download/upload byte counters of each application.
user_app_columns = ['Bearer Id', 'Handset Manufacturer', 'Handset Type', 'Social Media DL (Bytes)',
                    'Social Media UL (Bytes)', 'Google DL (Bytes)', 'Google UL (Bytes)',
                    'Email DL (Bytes)', 'Email UL (Bytes)', 'Youtube DL (Bytes)', 'Youtube UL (Bytes)',
                    'Netflix DL (Bytes)', 'Netflix UL (Bytes)', 'Gaming DL (Bytes)', 'Gaming UL (Bytes)',
                    'Other DL (Bytes)', 'Other UL (Bytes)']

# Build the working frame from that subset of 'df'. An explicit copy is taken because
# a later cell adds a column to df_user_apps; assigning to a bare column selection
# raises pandas' SettingWithCopyWarning, and the copy also guarantees that 'df'
# itself is never modified.
df_user_apps = df[user_app_columns].copy()

In [None]:
# Attach to every row the number of records that share its 'Bearer Id'.
# NOTE(review): the count is keyed on 'Bearer Id'. If that column identifies a
# session rather than a subscriber, a per-user session count needs a user key
# instead ('df' also has 'IMSI' and 'MSISDN/Number') — confirm.
records_per_bearer = df_user_apps.groupby('Bearer Id')['Bearer Id'].transform('count')
df_user_apps['Number of xDR Sessions'] = records_per_bearer

In [None]:
# Reduce the working frame to the key, the device descriptors and the session count,
# collapse repeated rows, and renumber the index from zero.
result_columns = ['Bearer Id', 'Handset Manufacturer', 'Handset Type', 'Number of xDR Sessions']

result = (
    df_user_apps.loc[:, result_columns]
    .drop_duplicates()
    .reset_index(drop=True)
)
print(result)