In [None]:
import sys
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
sys.path.append(os.path.abspath('../src'))

In [None]:
from load_data import load_data_using_sqlalchemy

In [None]:

# SQL query that selects every column of the table analysed in this notebook
query = "SELECT * FROM clenedxdr_data;"

# Run the query through the project's SQLAlchemy-based loader
df = load_data_using_sqlalchemy(query)

# The loader signals failure by returning None. Every later cell needs `df`,
# so stop here with a clear error instead of letting the next cell fail with
# an unrelated AttributeError on None.
if df is None:
    raise RuntimeError("Failed to load data.")
print("Successfully loaded the data")

Grouping the data by MSISDN and aggregating the engagement metrics per customer

In [None]:
# Collapse the per-session records into one row per customer (MSISDN).
sessions_by_user = df.groupby('MSISDN/Number')
grouped_df = sessions_by_user.agg({
    'Dur. (s)': 'sum',
    'Total DL (Megabytes)': 'sum',
    'Total UL (Megabytes)': 'sum',
    'Activity Duration DL (s)': 'sum',
    'Activity Duration UL (s)': 'sum'
})
# Session frequency = number of session records per customer. The
# normalization, clustering and per-cluster statistics cells further down all
# read this column, so it has to be created here (it was missing, which made
# the normalization cell fail with a KeyError).
grouped_df['Session Frequency'] = sessions_by_user.size()
grouped_df = grouped_df.reset_index()

In [None]:
# Per-customer total traffic = download volume + upload volume
grouped_df['Total Traffic (Megabytes)'] = grouped_df['Total DL (Megabytes)'].add(
    grouped_df['Total UL (Megabytes)']
)

In [None]:
# Sorting and reporting the top 10 customers for each engagement metric:
# one descending sort per metric, keeping only the first ten rows.
top_10_duration, top_10_download, top_10_upload = (
    grouped_df.sort_values(by=metric, ascending=False).head(10)
    for metric in ('Dur. (s)', 'Total DL (Megabytes)', 'Total UL (Megabytes)')
)

In [None]:
# Heading and table emitted with a single call; sep="\n" keeps them on separate lines
print("Top 10 customers by session duration:", top_10_duration, sep="\n")

In [None]:
print("Top 10 customers by total download traffic:")
# The frame was already cut to ten rows when it was built, so show it as is
top_10_download

In [None]:
print("Top 10 customers by total upload traffic:")
# The frame was already cut to ten rows when it was built, so show it as is
top_10_upload

In [None]:

# Normalization: rescale the engagement metrics with a min-max scaler.
# The scaled values overwrite the original columns of grouped_df in place.
columns_to_normalize = ['Dur. (s)', 'Total DL (Megabytes)', 'Total UL (Megabytes)', 'Session Frequency']
# Learn the per-column min/max first, then apply the scaling
scaler = MinMaxScaler().fit(grouped_df[columns_to_normalize])
grouped_df[columns_to_normalize] = scaler.transform(grouped_df[columns_to_normalize])

In [None]:
# K-Means clustering with k=3 on the normalized engagement metrics.
# random_state is fixed so the cluster assignment is reproducible.
kmeans = KMeans(n_clusters=3, random_state=42).fit(grouped_df[columns_to_normalize])
# One label per customer row, taken from the fitted model
grouped_df['Cluster'] = kmeans.labels_

In [None]:
# Show the centroid of each cluster: one row per cluster, one column per
# entry of columns_to_normalize (values are on the normalized scale).
print("Cluster Centers (Centroids):", kmeans.cluster_centers_, sep="\n")

Visualization Total Traffic vs Total Duration

In [None]:
# Scatter of total traffic against session duration, coloured by cluster label.
plt.figure(figsize=(10, 6))
plt.scatter(grouped_df['Dur. (s)'], grouped_df['Total Traffic (Megabytes)'], c=grouped_df['Cluster'], cmap='viridis')
plt.title("Customer Engagement Clusters (k=3)")
plt.xlabel("Normalized Session Duration (s)")
# 'Total Traffic (Megabytes)' was computed before scaling and is not one of the
# normalized columns, so this axis is in raw megabytes; the label says so.
plt.ylabel("Total Traffic (Megabytes)")
plt.colorbar(label='Cluster')
plt.show()

Visualization Total Traffic vs Total Session Frequency

In [None]:
# Scatter of traffic against session frequency, coloured by cluster label.
# The y value is the sum of the separately normalized DL and UL columns.
freq_fig, freq_ax = plt.subplots(figsize=(10, 6))
combined_traffic = grouped_df['Total DL (Megabytes)'] + grouped_df['Total UL (Megabytes)']
freq_points = freq_ax.scatter(
    grouped_df['Session Frequency'],
    combined_traffic,
    c=grouped_df['Cluster'],
    cmap='viridis',
)
freq_ax.set_title("Customer Engagement Clusters (k=3)")
freq_ax.set_xlabel("Normalized Session Frequency")
freq_ax.set_ylabel("Normalized Total Traffic (Megabytes)")
freq_fig.colorbar(freq_points, ax=freq_ax, label='Cluster')
plt.show()

Minimum, maximum, average and total of the non-normalized metrics for each cluster

In [None]:
# grouped_df holds min-max scaled values at this point, so map the clustered
# columns back to their original units before summarising; otherwise every
# statistic would be on the 0-1 scale instead of the non-normalized metrics
# this section reports.
# NOTE: relies on `scaler` still being the MinMaxScaler fitted on
# `columns_to_normalize`, which holds when the notebook is run top to bottom.
raw_metrics = pd.DataFrame(
    scaler.inverse_transform(grouped_df[columns_to_normalize]),
    columns=columns_to_normalize,
    index=grouped_df.index,
)
raw_metrics['Cluster'] = grouped_df['Cluster']

# min / max / mean / sum of each metric within every cluster
summary_funcs = ['min', 'max', 'mean', 'sum']
cluster_stats = (
    raw_metrics.groupby('Cluster')
    .agg({column: summary_funcs for column in columns_to_normalize})
    .reset_index()
)
# The plotting cells read `cluster_stats`; the original name `cluster` is kept
# as an alias so nothing that still refers to it breaks.
cluster = cluster_stats

In [None]:
# Bar chart: mean total download traffic of each cluster (one bar per cluster)
dl_fig, dl_ax = plt.subplots(figsize=(10, 6))
mean_download = cluster_stats['Total DL (Megabytes)']['mean']
dl_ax.bar(cluster_stats['Cluster'], mean_download, color=['skyblue', 'orange', 'green'])
dl_ax.set_title("Average Total Download Traffic per Cluster")
dl_ax.set_xlabel("Cluster")
dl_ax.set_ylabel("Average Total Download Traffic (Megabytes)")
plt.show()

In [None]:

# Bar chart: mean session duration of each cluster (one bar per cluster)
dur_fig, dur_ax = plt.subplots(figsize=(10, 6))
mean_duration = cluster_stats['Dur. (s)']['mean']
dur_ax.bar(cluster_stats['Cluster'], mean_duration, color=['skyblue', 'orange', 'green'])
dur_ax.set_title("Average Session Duration per Cluster")
dur_ax.set_xlabel("Cluster")
dur_ax.set_ylabel("Average Session Duration (seconds)")
plt.show()

Aggregation of user total traffic per application

In [None]:
# Map each application name to its (download column, upload column) pair.
# Every column follows the "<App> DL (Megabytes)" / "<App> UL (Megabytes)" pattern.
applications = {
    app_name: (f'{app_name} DL (Megabytes)', f'{app_name} UL (Megabytes)')
    for app_name in ('Social Media', 'Youtube', 'Netflix', 'Google', 'Email', 'Gaming', 'Other')
}

In [None]:
# Filled by the loop below: application name -> frame of its top 10 users
top_users_per_app = dict()

In [None]:
# For every application, rank users by their combined DL + UL traffic and keep
# the ten heaviest users.
for app, (dl_col, ul_col) in applications.items():
    print(f"\nProcessing {app}...")

    # Skip applications whose traffic columns are missing from the data
    if dl_col not in df.columns or ul_col not in df.columns:
        print(f"Columns '{dl_col}' or '{ul_col}' not found in DataFrame.")
        print("\n")
        continue

    # Sum DL and UL per user, then add the two sums into one total per user
    per_user_volume = df.groupby('MSISDN/Number')[[dl_col, ul_col]].sum()
    per_user_total = per_user_volume.sum(axis=1).reset_index(name='Total Traffic')
    ranked_users = per_user_total.sort_values(by='Total Traffic', ascending=False)
    total_traffic_per_user = ranked_users.head(10)

    # Remember the top 10 users of this application
    top_users_per_app[app] = total_traffic_per_user

    # Report them
    print(f"Top 10 Most Engaged Users for {app}:")
    print(total_traffic_per_user)
    print("\n")

In [None]:

# Filled by the next cell: application name -> total traffic over all records
total_traffic_per_app = dict()

In [None]:
# Preparing the data for the "top 3 most used applications" plot:
# compute one overall traffic figure per application.
for app, (dl_col, ul_col) in applications.items():
    # Applications whose columns are not in the data are left out
    if not (dl_col in df.columns and ul_col in df.columns):
        continue

    # Column totals over every record (one value for DL, one for UL) ...
    direction_totals = df[[dl_col, ul_col]].sum()
    # ... added together into the application's total traffic
    total_traffic = direction_totals.sum()

    total_traffic_per_app[app] = total_traffic

In [None]:
# Turn the {application: total traffic} mapping into a two-column frame
app_traffic_rows = list(total_traffic_per_app.items())
traffic_df = pd.DataFrame(app_traffic_rows, columns=['Application', 'Total Traffic'])

# Rank applications by total traffic (largest first) and keep the first three
apps_by_traffic = traffic_df.sort_values(by='Total Traffic', ascending=False)
top_3_apps = apps_by_traffic.head(3)

In [None]:
# Bar chart of the three applications with the most traffic.
# The y-axis is logarithmic.
apps_fig, apps_ax = plt.subplots(figsize=(10, 6))
apps_ax.bar(top_3_apps['Application'], top_3_apps['Total Traffic'], color=['blue', 'green', 'red'])
apps_ax.set_yscale('log')
apps_ax.set_xlabel('Application')
apps_ax.set_ylabel('Total Traffic (Megabytes)')
apps_ax.set_title('Top 3 Most Used Applications (Log Scale)')
# Tilt the application names on the x-axis
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# K-Means clustering and search for the optimal k:
# select the duration, total-volume and per-application volume columns.
session_metric_columns = [
    'Dur. (s)', 'Activity Duration DL (s)', 'Activity Duration UL (s)',
    'Total DL (Megabytes)', 'Total UL (Megabytes)',
]
# One DL and one UL column per application, in the order DL then UL
app_metric_columns = [
    f'{app_name} {direction} (Megabytes)'
    for app_name in ('Social Media', 'Youtube', 'Netflix', 'Google', 'Email', 'Gaming', 'Other')
    for direction in ('DL', 'UL')
]
metrics = df[session_metric_columns + app_metric_columns]

In [None]:
# Standardize the selected metrics (fit the scaler first, then apply it).
# Note that this rebinds `scaler` to a StandardScaler.
scaler = StandardScaler().fit(metrics)
scaled_metrics = scaler.transform(metrics)

# Elbow method setup: candidate cluster counts k = 1..10 and an empty list
# that will collect the within-cluster sum of squares for each k
k_range = range(1, 11)
wcss = list()

In [None]:
# Fit one K-Means model per candidate k and record its inertia (WCSS).
# The list is reset here so that re-running this cell does not append a second
# set of values, which would make len(wcss) differ from len(k_range) and break
# the elbow plot.
wcss = []
for i in k_range:
    kmeans = KMeans(n_clusters=i, random_state=0)
    kmeans.fit(scaled_metrics)
    wcss.append(kmeans.inertia_)

In [None]:
# Elbow plot: WCSS against the number of clusters, one marker per candidate k
elbow_fig, elbow_ax = plt.subplots(figsize=(10, 6))
elbow_ax.plot(k_range, wcss, marker='o')
elbow_ax.set_xlabel('Number of clusters (k)')
elbow_ax.set_ylabel('WCSS')
elbow_ax.set_title('Elbow Method for Optimal k')
# One tick per candidate k
elbow_ax.set_xticks(k_range)
elbow_ax.grid(True)
plt.show()

In [None]:
# Number of clusters used for the final K-Means fit in the next cell.
# NOTE(review): presumably read off the elbow plot above; confirm against the
# rendered curve before relying on it.
optimal_k = 4

In [None]:
# Final model: fit K-Means with the chosen k on the standardized metrics and
# store each record's cluster label in a new 'Cluster' column of df.
kmeans = KMeans(n_clusters=optimal_k, random_state=0).fit(scaled_metrics)
df['Cluster'] = kmeans.labels_

In [None]:
# Analyze and interpret the results: mean of every numeric column per cluster.
# df is loaded with SELECT *, so it may carry non-numeric columns; pandas 2.x
# raises a TypeError when asked to average those, hence numeric_only=True.
cluster_summary = df.groupby('Cluster').mean(numeric_only=True)

In [None]:
# Heading for the table rendered below
print("Cluster Summary:")
# Bare last expression so the notebook renders the summary frame as cell output
cluster_summary