# User Engagement

In [1]:
import pickle
import numpy as np
import pandas as pd
import math
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, normalize
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
# Load the raw session records into `df`.
# NOTE(review): this is an absolute path on one specific Windows machine; the
# notebook will not run elsewhere until the path is adjusted.
df = pd.read_csv(
    filepath_or_buffer="C:\\Users\\dell\\Desktop\\Week1_challenge_data_sourcex.csv"
)

#### Tracking the user-engagement 
    
Customers are grouped by session frequency, session duration, and total traffic in order to track user activity on the network.

In [3]:
# Total data volume per record = upload volume + download volume.
df['total_dv'] = df['totalUL'].add(df['totalDL'])

# One row per customer (msisdn):
#   XDR Sessions - count of non-null 'Bearer Id' values
#   dur          - sum of 'dur' over the customer's records
#   total_dv     - sum of 'total_dv' over the customer's records
user_engagement_df = (
    df[['msisdn', 'Bearer Id', 'dur', 'total_dv']]
    .groupby('msisdn')
    .agg({'Bearer Id': 'count', 'dur': 'sum', 'total_dv': 'sum'})
    .rename(columns={'Bearer Id': 'XDR Sessions'})
)
user_engagement_df.head(n=10)

Unnamed: 0_level_0,XDR Sessions,dur,total_dv
msisdn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
33601000000.0,1,116720.0,878690600.0
33601000000.0,1,181230.0,156859600.0
33601000000.0,1,134969.0,595966500.0
33601010000.0,1,49878.0,422320700.0
33601010000.0,2,37104.0,1457411000.0
33601010000.0,2,253983.0,615217200.0
33601010000.0,2,128360.0,654723100.0
33601010000.0,1,86399.0,332660400.0
33601010000.0,2,495702.0,990132200.0
33601020000.0,1,124854.0,732463800.0


#### The top 10 Customers based on Engagement Metric

The metrics above (session frequency, duration, and total data volume) are aggregated per customer ID (msisdn). Below are the top 10 most engaged customers.

In [4]:
# Ten customers with the highest session count.
user_engagement_df.nlargest(n=10, columns='XDR Sessions')

Unnamed: 0_level_0,XDR Sessions,dur,total_dv
msisdn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
33626320000.0,18,8791927.0,7971167000.0
33614890000.0,17,9966898.0,8846226000.0
33625780000.0,17,18553754.0,8499621000.0
33659730000.0,16,4035428.0,7705863000.0
33675880000.0,15,4865947.0,7891111000.0
33760540000.0,15,9279434.0,8514774000.0
33667160000.0,13,8744914.0,5618394000.0
33603130000.0,12,6287761.0,4976195000.0
33604520000.0,12,5207990.0,5487855000.0
33627080000.0,12,4703516.0,5754731000.0


In [5]:
# Ten customers with the highest total data volume (upload + download).
user_engagement_df.nlargest(n=10, columns='total_dv')

Unnamed: 0_level_0,XDR Sessions,dur,total_dv
msisdn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
33614890000.0,17,9966898.0,8846226000.0
33760540000.0,15,9279434.0,8514774000.0
33625780000.0,17,18553754.0,8499621000.0
33626320000.0,18,8791927.0,7971167000.0
33675880000.0,15,4865947.0,7891111000.0
33659730000.0,16,4035428.0,7705863000.0
33666460000.0,11,4536757.0,7308501000.0
33760410000.0,12,5321667.0,7132371000.0
33664710000.0,11,2927785.0,6872018000.0
33698790000.0,11,5169128.0,6540899000.0


In [6]:
# Standardise the three engagement metrics with StandardScaler so that no
# single metric dominates the clustering because of its magnitude.
#
# The feature columns are selected explicitly: a later cell inserts a
# `cluster` column into `user_engagement_df`, and re-running this cell after
# that point would otherwise feed the cluster labels back in as a feature.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(
    user_engagement_df[['XDR Sessions', 'dur', 'total_dv']]
)
scaled_data

array([[-0.4812899 , -0.15801408,  0.38229732],
       [-0.4812899 ,  0.18814798, -1.08766633],
       [-0.4812899 , -0.06008955, -0.19345265],
       ...,
       [-0.4812899 ,  3.92373087, -0.93439522],
       [-0.4812899 ,  0.57342835, -0.19279826],
       [-0.4812899 ,  3.8832657 , -1.12372349]])

In [7]:
# Rescale every row (customer) of the standardised matrix to unit L2 norm.
# `norm='l2'` and `axis=1` are the library defaults, spelled out here.
normalized_data = normalize(scaled_data, norm='l2', axis=1)
normalized_data

array([[-0.75837458, -0.24898479,  0.60239071],
       [-0.39968156,  0.15624529, -0.90323976],
       [-0.92168892, -0.11507383, -0.37046937],
       ...,
       [-0.11848395,  0.96594411, -0.23002943],
       [-0.62257276,  0.74175848, -0.24939427],
       [-0.11822006,  0.95385314, -0.27602211]])

In [8]:
# Partition customers into three engagement clusters.
# `random_state` is fixed so that the centroid initialisation is repeatable.
kmeans = KMeans(n_clusters=3, random_state=1)
kmeans.fit(normalized_data)
kmeans.labels_

array([2, 1, 1, ..., 0, 1, 0])

In [9]:
# Attach each customer's K-Means label as the first column.
# `DataFrame.insert` raises ValueError when the column already exists, which
# made this cell fail on a second run; overwrite the existing column in that
# case so the cell can be re-executed safely.
if 'cluster' in user_engagement_df.columns:
    user_engagement_df['cluster'] = kmeans.labels_
else:
    user_engagement_df.insert(0, 'cluster', kmeans.labels_)
user_engagement_df

Unnamed: 0_level_0,cluster,XDR Sessions,dur,total_dv
msisdn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3.360100e+10,2,1,116720.0,8.786906e+08
3.360100e+10,1,1,181230.0,1.568596e+08
3.360100e+10,1,1,134969.0,5.959665e+08
3.360101e+10,1,1,49878.0,4.223207e+08
3.360101e+10,0,2,37104.0,1.457411e+09
...,...,...,...,...
3.379000e+10,2,1,8810.0,7.146416e+08
3.379000e+10,1,1,140988.0,4.803073e+08
3.197021e+12,0,1,877385.0,2.321240e+08
3.370000e+14,1,1,253030.0,5.962878e+08


In [10]:
# Number of customers assigned to each cluster, largest cluster first.
user_engagement_df['cluster'].value_counts(sort=True, ascending=False)

1    47883
2    29691
0    29282
Name: cluster, dtype: int64

In [21]:
# Scatter of total data volume against session duration for a random subset
# of customers, coloured by K-Means cluster.
# - `random_state` pins the subset so the figure is identical on every run.
# - The subset size is capped at the number of rows, because `sample` raises
#   when asked for more rows than exist (sampling without replacement).
# NOTE(review): the fixed `marker_size=8` below appears to override the
# per-point sizing requested through `size='XDR Sessions'` - confirm which of
# the two is intended.
fig = px.scatter(
    user_engagement_df.sample(
        n=min(5000, len(user_engagement_df)), random_state=1),
    x='total_dv', y='dur',
    color='cluster', size='XDR Sessions')
fig.update_traces(marker_size=8)
fig.show()
