In [None]:
import sys
# adding to the path variables the one folder higher (locally, not changing system variables)
sys.path.append("..")
import pandas as pd
import numpy as np
import warnings
import mlflow
from modeling.config import TRACKING_URI, EXPERIMENT_NAME
from pandas_profiling import ProfileReport
import pyarrow.feather as feather
import time
from datetime import datetime
from datetime import timedelta
import seaborn as sns
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import matplotlib.patches as mpatches



In [None]:
# Load the cleaned event data from the feather file stored one folder above the notebook.
cleaned_data_path = "../data/cleaned_data.feather"
export_df = feather.read_feather(cleaned_data_path)



In [None]:
# The raw values are scaled by 1000 and then interpreted as nanoseconds
# (presumably they are stored as microseconds since the epoch -- TODO confirm).
# NOTE: this cell overwrites its own input column, so re-running it without reloading
# the data would apply the conversion to already-converted values.
event_timestamp_ns = export_df["event_timestamp"] * 1000
export_df["event_timestamp"] = pd.to_datetime(event_timestamp_ns, unit="ns")



In [None]:
# Flag whether the user's event happened before the incident was closed:
#   NaN -> "notification_received" rows (the user has not interacted with it yet)
#   1   -> event_timestamp is earlier than closed_at
#   0   -> every other case (including a missing closed_at, which compares as False)
# Computed column-wise instead of with a per-row Python lambda; the resulting values are the same.
is_received = export_df["event_name"] == "notification_received"
viewed_before_close = export_df["event_timestamp"] < export_df["closed_at"]
export_df["notif_viewed_ontime"] = np.where(is_received, np.nan, viewed_before_close.astype(float))



In [None]:
# Keep only rows that carry an on-time flag (rows flagged NaN are discarded),
# renumber the index from 0 and store the flag as an integer.
export_df = (
    export_df
    .dropna(subset=["notif_viewed_ontime"])
    .reset_index(drop=True)
    .astype({"notif_viewed_ontime": int})
)



In [None]:
# Reaction time in minutes between the user's event and a reference moment:
#   notif_viewed_ontime == 1 -> event_timestamp - created_at
#   otherwise                -> event_timestamp - closed_at
# total_seconds() is used rather than Timedelta.seconds: .seconds only holds the
# sub-day component (0..86399), so any gap of a day or more (or a negative gap)
# was silently wrapped into a value below 24 hours.
# Assumes the three timestamp columns are datetime64 -- TODO confirm.
viewed_ontime = export_df["notif_viewed_ontime"] == 1
minutes_since_created = (export_df["event_timestamp"] - export_df["created_at"]).dt.total_seconds() / 60
minutes_since_closed = (export_df["event_timestamp"] - export_df["closed_at"]).dt.total_seconds() / 60
export_df["reaction_time"] = np.where(viewed_ontime, minutes_since_created, minutes_since_closed)



In [None]:
# Independent copy of the columns used by the daily time-behaviour analysis.
# It is taken here, i.e. before users with few events are filtered out of export_df below.
day_analysis_columns = [
    "event_timestamp",
    "reaction_time",
    "event_name",
    "created_at",
    "notif_viewed_ontime",
]
selected_columns = export_df[day_analysis_columns]
day_analisis = selected_columns.copy()



In [None]:
# IMPORTANT: reaction_time is measured from a different reference point depending on notif_viewed_ontime:
#   notif_viewed_ontime = 0 -> time between the incident being closed and the user's interaction
#   notif_viewed_ontime = 1 -> time between the incident being created and the user's interaction



In [None]:
# Binary "opened" flag derived from the event type: 1 for events where the user
# opened/used the notification, 0 for events where they did not.
opened_events = ['notification_opened', 'notification_view_alternatives', 'notification_share']
not_opened_events = ['notification_received', 'notification_dismiss']

is_opened_event = export_df['event_name'].isin(opened_events)
is_not_opened_event = export_df['event_name'].isin(not_opened_events)

export_df.loc[is_opened_event, 'opened'] = 1
export_df.loc[is_not_opened_event, 'opened'] = 0
# An event name outside both lists leaves NaN behind, which makes this cast fail.
export_df['opened'] = export_df['opened'].astype(int)



In [None]:
# Per-user aggregation: sums of the numeric columns, number of events and the derived rates.
gpbyuser = export_df.groupby(by=['user_id'])
# numeric_only=True restricts the sum to numeric columns. Without it, current pandas
# raises on the datetime columns and concatenates string columns.
gpbyuser2 = gpbyuser.sum(numeric_only=True)
gpbyuser2['count1'] = gpbyuser.size()  # number of events per user
gpbyuser2['opened_rate'] = gpbyuser2['opened'] / gpbyuser2['count1']
gpbyuser2['ontime_activity_rate'] = gpbyuser2['notif_viewed_ontime'] / gpbyuser2['count1']

# Discard users with fewer than 3 events. The in-place drop keeps gpbyuser2 a regular
# frame (not a filtered view), so later cells can add columns to it without warnings.
gpbyuser2.drop(gpbyuser2[gpbyuser2['count1'] < 3].index, inplace=True)

# Restrict the event table to the users that remain.
real_users = gpbyuser2.index.tolist()
export_df = export_df[export_df['user_id'].isin(real_users)]



In [None]:
# Number of remaining events per event type.
sns.countplot(x="event_name", data=export_df)


 # Analyzing agencies

In [None]:
# Deeper look at the agencies. The agency_* columns are summed because we want to know
# how many users are affected by each agency's notifications. Active and inactive users
# are both included here; that distinction is addressed in another part of the notebook.
agency_columns = [
    'agency_GewRJAw5tUmC4Ku4AX1-SQ',
    'agency_GtvOEQAFZ0GtU6u4AXwvPg',
    'agency_HE59N3RXM0q5vKu4AXlQZg',
    'agency_JUR9bFXmVkWDHqu4AXaY0g',
    'agency_JfA8Bw8Zp024Kqu4AXiSpQ',
    'agency_MgUq5b9mOEunx6u4AXt_BA',
    'agency_NuuRQ2I1Q0a50Kv-AVKlLA',
    'agency_V2AIQQKgmUO3VazvAOA-Cw',
    'agency_jLjibFoim0iwWau4AWoEdQ',
    'agency_pky7jovXYkaw-awAAMrQ3g',
    'agency_zCy9zG00HEqGeKu4AWZYNQ',
]


def _agency_totals_by(index_column):
    """Sum every agency_* column of export_df per value of `index_column`.

    Returns a pivot table indexed by `index_column`, with missing combinations
    filled with 0 and an extra 'Total' margin row.
    """
    return pd.pivot_table(
        export_df,
        values=agency_columns,
        index=[index_column],
        aggfunc="sum",  # string alias; passing np.sum is deprecated in recent pandas
        fill_value=0,
        margins=True,
        margins_name='Total',
    )


agencies_by_cause = _agency_totals_by('cause')
agencies_by_effect = _agency_totals_by('effect')



In [None]:
# Agencies MgUq5b9mOEunx6u4AXt_BA and V2AIQQKgmUO3VazvAOA-Cw created no incidents,
# so their columns are removed from both pivot tables.
no_incident_agencies = ["agency_MgUq5b9mOEunx6u4AXt_BA", "agency_V2AIQQKgmUO3VazvAOA-Cw"]
for agency_table in (agencies_by_effect, agencies_by_cause):
    agency_table.drop(columns=no_incident_agencies, inplace=True)


 # Inactive users

In [None]:
# How many events were viewed on time (1) versus not on time (0).
ontime_counts = export_df["notif_viewed_ontime"].value_counts()
ontime_counts



In [None]:
# Share of notifications viewed on time. notif_viewed_ontime is a 0/1 integer flag,
# so its mean is the fraction of rows equal to 1.
# This replaces counts that were typed in by hand (524520 / (526984 + 380595)): they
# go stale whenever the data or the filtering changes, and the numerator did not match
# either of the two counts in the denominator.
a = export_df["notif_viewed_ontime"].mean()
a


 57.79% of notifications are viewed on time.

In [None]:
# Per user: summed "opened" flag (y) against the on-time activity rate (x).
sns.scatterplot(x="ontime_activity_rate", y="opened", data=gpbyuser2)



In [None]:
# Per user: summed on-time flag (y) against the opened rate (x).
sns.scatterplot(y="notif_viewed_ontime", x="opened_rate", data=gpbyuser2)



In [None]:
# Events NOT viewed on time per user = all of the user's events minus the on-time ones.
late_view_counts = gpbyuser2["count1"].sub(gpbyuser2["notif_viewed_ontime"])
gpbyuser2["notif_viewed_notontime"] = late_view_counts
sns.scatterplot(y="notif_viewed_notontime", x="opened_rate", data=gpbyuser2)



In [None]:
#sns.histplot(data=active_df[["reaction_time","notif_viewed_ontime","event_name"]], x= "reaction_time",y= "notif_viewed_ontime", hue="event_name")



In [None]:
# Per user: opened rate (y) against reaction_time (x).
# reaction_time in gpbyuser2 comes from the groupby sum, i.e. it is the user's
# total reaction time over all their events, not an average.
sns.scatterplot(x="reaction_time", y="opened_rate", data=gpbyuser2)



In [None]:
# Same plot as the previous cell: per-user opened rate against summed reaction_time.
reaction_vs_opened = gpbyuser2.loc[:, ["reaction_time", "opened_rate"]]
sns.scatterplot(x="reaction_time", y="opened_rate", data=reaction_vs_opened)


 Checking the incidents that users reacted to late

In [None]:
# Show every column currently present in export_df.
export_df.columns



In [None]:
# Events that were not viewed on time, counted per event type.
was_late = export_df["notif_viewed_ontime"] == 0
reacted_late_incidents = export_df[was_late]
sns.countplot(x="event_name", data=reacted_late_incidents)


 # Identifying customers

In [None]:
# Group users into 3 clusters (Ward linkage) based on their opened rate and on-time activity rate.
clustering_features = gpbyuser2[["opened_rate", "ontime_activity_rate"]]
agg_clust = AgglomerativeClustering(linkage='ward', n_clusters=3)
agg_clust.fit(clustering_features)



In [None]:
# Scatter of the two clustering features, coloured by cluster label.
sns.set(rc={'figure.figsize': (11.7, 8.27)})

opened_rate = gpbyuser2["opened_rate"]
ontime_activity_rate = gpbyuser2["ontime_activity_rate"]
ax = sns.scatterplot(x=opened_rate, y=ontime_activity_rate, hue=agg_clust.labels_, palette="deep")

# The legend texts are assigned by position to the existing legend entries.
# NOTE(review): which cluster label each name belongs to is hard-coded here --
# verify against the plot whenever the data changes.
cluster_names = ["Scared users", "Dismissers", "Normal users"]
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, cluster_names, title='User classification', loc='upper center')



In [None]:
# Print the column dtypes, non-null counts and memory usage of export_df.
export_df.info()


 # Identifying the time behaviour daily

 Excluding people who check their phone days later

In [None]:
# Preview the first rows of the copy made for the daily analysis.
day_analisis.head()



In [None]:
# Hour of the day (0-23) at which each event happened, taken with the vectorized
# .dt accessor instead of a Python loop over every timestamp.
day_analisis['time_hour'] = day_analisis['event_timestamp'].dt.hour



In [None]:
# Reaction time against the hour of the day of the event, coloured by event type.
sns.scatterplot(x="time_hour", y="reaction_time", hue="event_name", data=day_analisis)


