# Task 1

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sqlalchemy import create_engine
from dotenv import load_dotenv
import os
import sys

# Load environment variables from .env file

In [5]:
# Load the settings file into the process environment so the os.getenv
# lookups in the next cell can see them.
# NOTE(review): '../venv/venv' is an unusual location for a .env-style file -
# confirm the path is intended.
load_dotenv(dotenv_path='../venv/venv')

True

# Retrieve database connection settings

In [6]:
# Read the five connection settings from the environment in one pass.
# A variable that is not set comes back as None.
db_user, db_password, db_host, db_port, db_name = (
    os.environ.get(setting)
    for setting in ('DB_USER', 'DB_PASSWORD', 'DB_HOST', 'DB_PORT', 'DB_NAME')
)


# Create the connection string and the SQLAlchemy engine

In [None]:
from urllib.parse import quote

from sqlalchemy import create_engine


def import_dbapi(cls):
    """Import and return the psycopg2 driver module.

    ``cls`` is accepted but not used. Nothing in the visible part of the
    notebook calls this helper; it is kept so existing references to the
    name keep working.
    """
    import psycopg2
    return psycopg2


# Percent-encode the credentials before placing them in the URL: characters
# such as '@', ':', '/' or '%' in a user name or password would otherwise be
# read as URL delimiters and the URL would be parsed incorrectly.
# safe='' makes quote() escape '/' as well. An unset credential is rendered
# as an empty component.
encoded_user = quote(db_user or '', safe='')
encoded_password = quote(db_password or '', safe='')

connection_string = (
    f'postgresql+psycopg2://{encoded_user}:{encoded_password}'
    f'@{db_host}:{db_port}/{db_name}'
)
engine = create_engine(connection_string)


# Query the data

In [None]:
# Load every row and column of the xdr_data table into a DataFrame.
query = 'SELECT * FROM xdr_data'
data = pd.read_sql(sql=query, con=engine)

# Display the data

In [None]:
# Show the first rows of the loaded table (head() defaults to five rows).
first_rows = data.head()
print(first_rows)

In [None]:
# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() only added a stray "None" line after the summary.
data.info()

# Missing Values

In [None]:
# Number of missing entries in each column (isna is the same check as isnull).
missing_values = data.isna().sum()
print(missing_values)

# Handling Missing Values

In [None]:
# Columns to impute: float64 columns get their mean, object (text) columns
# get the placeholder 'N/A'.
numeric_columns = data.select_dtypes(include=['float64']).columns
text_columns = data.select_dtypes(include=['object']).columns

# Remove rows with empty 'MSISDN/Number' BEFORE imputing. Doing it afterwards
# can never drop anything: a float or text identifier column has already had
# its gaps filled by then, so rows without a user would be kept under a
# made-up identifier. .copy() keeps `data` untouched by the assignments below.
data_cleaned = data.dropna(subset=['MSISDN/Number']).copy()

# Means are taken over the rows that are kept.
data_cleaned[numeric_columns] = data_cleaned[numeric_columns].fillna(
    data_cleaned[numeric_columns].mean()
)
data_cleaned[text_columns] = data_cleaned[text_columns].fillna('N/A')

display(data_cleaned)

In [None]:
# Print the column/dtype/non-null summary of the cleaned frame.
data_cleaned.info()

# Write the cleaned data back to the xdr_data table

In [None]:
# `table_name` was never defined, so this cell raised NameError. The target
# is the xdr_data table, which the next cell reads back.
table_name = 'xdr_data'

# if_exists='replace' drops the existing table and recreates it from the
# cleaned frame; index=False keeps the DataFrame index out of the table.
data_cleaned.to_sql(table_name, engine, if_exists='replace', index=False)

In [None]:
# Read the xdr_data table back into `data` for the analysis cells below.
query = 'SELECT * FROM xdr_data'
data = pd.read_sql(query, engine)

# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() only added a stray "None" line after the summary.
data.info()

# Top 10 Handsets Used by Customers

In [None]:
# Count rows per handset type (most frequent first) and keep the first ten.
handset_type_counts = data['Handset Type'].value_counts()
top_10_handsets = handset_type_counts.iloc[:10]
print("Top 10 Handsets:", top_10_handsets, sep="\n")

# Top 3 Handset Manufacturers

In [None]:
# Count rows per manufacturer (most frequent first) and keep the first three.
manufacturer_counts = data['Handset Manufacturer'].value_counts()
top_3_manufacturers = manufacturer_counts.iloc[:3]
print("Top 3 Handset Manufacturers:", top_3_manufacturers, sep="\n")

# Top 5 Handsets Per Top 3 Manufacturers

In [None]:
# For each of the three leading manufacturers, list its five most frequent
# handset types.
for manufacturer in top_3_manufacturers.index:
    made_by_manufacturer = data['Handset Manufacturer'].eq(manufacturer)
    handset_counts = data.loc[made_by_manufacturer, 'Handset Type'].value_counts()
    top_5_handsets = handset_counts.iloc[:5]
    print(f"\nTop 5 Handsets for Manufacturer {manufacturer}:")
    print(top_5_handsets)

# Group by each user (assuming 'MSISDN/Number' is the identifier for users)

In [None]:
# Per-application byte counters, split into download (DL) and upload (UL).
dl_columns = [
    'Social Media DL (Bytes)', 'Google DL (Bytes)', 'Email DL (Bytes)',
    'Youtube DL (Bytes)', 'Netflix DL (Bytes)', 'Gaming DL (Bytes)',
    'Other DL (Bytes)',
]
ul_columns = [
    'Social Media UL (Bytes)', 'Google UL (Bytes)', 'Email UL (Bytes)',
    'Youtube UL (Bytes)', 'Netflix UL (Bytes)', 'Gaming UL (Bytes)',
    'Other UL (Bytes)',
]

# Named-aggregation spec: output column -> (source column, function).
# 'count' on 'Dur. (ms)' gives the number of rows per user with a duration;
# each traffic counter is summed under its own name.
aggregations = {
    'xdr_sessions': ('Dur. (ms)', 'count'),
    'total_duration': ('Dur. (ms)', 'sum'),
}
aggregations.update({col: (col, 'sum') for col in dl_columns + ul_columns})

# One row per 'MSISDN/Number', which is used as the user identifier.
sessions_by_user = data.groupby('MSISDN/Number')
user_overview = sessions_by_user.agg(**aggregations).reset_index()

# Collapse the per-application sums into DL, UL and combined totals.
user_overview['total_dl_data'] = user_overview[dl_columns].sum(axis=1)
user_overview['total_ul_data'] = user_overview[ul_columns].sum(axis=1)
user_overview['total_data_volume'] = (
    user_overview['total_dl_data'] + user_overview['total_ul_data']
)

# The per-application columns were only needed to build the totals.
user_overview = user_overview.drop(columns=[*dl_columns, *ul_columns])

print(user_overview.head())

# Exploratory Data Analysis (EDA)

# Describe Variables

In [None]:
# Summary statistics for the columns describe() covers by default.
summary_statistics = data.describe()
print(summary_statistics)

In [None]:
# Check data types.
# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() only added a stray "None" line after the summary.
data.info()

# Variable Transformations

In [None]:
import pandas as pd

# Every per-application byte counter, download and upload.
app_byte_columns = [
    'Social Media DL (Bytes)', 'Google DL (Bytes)', 'Email DL (Bytes)',
    'Youtube DL (Bytes)', 'Netflix DL (Bytes)', 'Gaming DL (Bytes)', 'Other DL (Bytes)',
    'Social Media UL (Bytes)', 'Google UL (Bytes)', 'Email UL (Bytes)',
    'Youtube UL (Bytes)', 'Netflix UL (Bytes)', 'Gaming UL (Bytes)', 'Other UL (Bytes)',
]

# Total traffic (DL + UL) for each row of `data`.
data['total_data'] = data[app_byte_columns].sum(axis=1)

# Bin rows into duration quantiles. labels=False yields integer bin numbers,
# and duplicates='drop' merges bins that share an edge, so fewer than ten
# classes can result.
# NOTE(review): the binning is applied to each row of `data`, not to a
# per-user total duration - confirm this is the intended granularity.
data['duration_decile'] = pd.qcut(
    data['Dur. (ms)'], q=10, labels=False, duplicates='drop'
)

# Total traffic within each duration class.
rows_by_decile = data.groupby('duration_decile')
total_data_per_decile = rows_by_decile['total_data'].sum()

print(total_data_per_decile)

# Basic Metrics

In [None]:
# Central tendency and spread of duration and total traffic.
duration_and_volume = data.loc[:, ['Dur. (ms)', 'total_data']]
metrics = duration_and_volume.agg(['mean', 'median', 'std'])
print(metrics)

# Non-Graphical Univariate Analysis

In [None]:
# Variance and standard deviation of duration and total traffic.
duration_and_volume = data.loc[:, ['Dur. (ms)', 'total_data']]
dispersion = duration_and_volume.agg(['var', 'std'])
print(dispersion)

# Graphical Univariate Analysis

In [None]:
# One 30-bin histogram per variable, drawn in a 10x5 inch figure.
data.hist(column=['Dur. (ms)', 'total_data'], bins=30, figsize=(10, 5))
plt.show()

# Bivariate Analysis

In [None]:
# Duration on the x-axis against total traffic on the y-axis, one point per row.
sns.scatterplot(data=data, x='Dur. (ms)', y='total_data')
plt.show()

# Dimensionality Reduction

In [None]:
# Project total traffic and duration onto two principal components and plot
# the transformed points.
# NOTE(review): the two inputs are passed to PCA without standardisation, so
# the variable with the larger variance will dominate the first component -
# confirm that is intended.
pca_input = data[['total_data', 'Dur. (ms)']]
pca = PCA(n_components=2)
principal_components = pca.fit_transform(pca_input)

first_component = principal_components[:, 0]
second_component = principal_components[:, 1]

plt.scatter(first_component, second_component)
plt.title('PCA of Total Data and Duration')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()