In [None]:
import sys
# Add the parent folder to the module search path (for this session only; system variables are not changed)
sys.path.append("..")
import pandas as pd
import numpy as np
import warnings
import mlflow
from modeling.config import TRACKING_URI, EXPERIMENT_NAME
from pandas_profiling import ProfileReport
# import torch

# Read JSON files

In [None]:
# Load the alerts file into a DataFrame.
alerts_path = "../data/alerts_cleaned.json"
alert_df = pd.read_json(alerts_path)

In [None]:
# Load the file that maps notification label ids to alert surrogate ids.
corazon_path = "../data/notification-labels-to-alert-surrogate-ids.json"
corazon_df = pd.read_json(corazon_path)

In [None]:
# Load the notifications file into a DataFrame.
notifications_path = "../data/notifications.json"
notification_df = pd.read_json(notifications_path)

# Data Frame Cleaning

## Merging alert df with agency dummy variables

In [None]:
# One-hot encode the agency of every alert row, indexed by surrogate_id.
alert2 = pd.get_dummies(
    alert_df[["surrogate_id", "agency"]].set_index("surrogate_id")
)

# Collapse to one row per surrogate_id by summing the dummy columns.
grouped_alert = alert2.groupby(by="surrogate_id").sum()

# Drop the raw agency column, keep a single row per surrogate_id, then attach
# the per-alert agency dummy columns.
alert_df = (
    alert_df
    .drop(columns=["agency"])
    .drop_duplicates(subset="surrogate_id", ignore_index=True)
    .merge(grouped_alert, on="surrogate_id")
)

## Merging notification df with corazon df

In [None]:
# Remove exact duplicate notification rows.
notification_df = notification_df.drop_duplicates(ignore_index=True)

# The 'day' column is dropped before de-duplicating the label mapping.
corazon_df = corazon_df.drop(columns=["day"]).drop_duplicates(ignore_index=True)

# Left join: every notification is kept; mapping columns are filled only where
# join_key_value equals a notification_label_id.
real_merge = pd.merge(
    notification_df,
    corazon_df,
    how="left",
    left_on="join_key_value",
    right_on="notification_label_id",
)

## Merging all df together

In [None]:
# Display the (rows, columns) shape of the merged notification frame.
real_merge.shape

In [None]:
#Queried df for proper merging
label_merger= real_merge.query("join_field == 'label'")
alert_id_merger= real_merge.query("join_field == 'alertId'")

In [None]:
# Attach alert columns to the label-keyed notifications via the surrogate id.
clean_df = pd.merge(
    label_merger,
    alert_df,
    how="left",
    left_on="corazon_surrogate_id",
    right_on="surrogate_id",
)

In [None]:
# Attach alert columns to the alertId-keyed notifications via the document id.
clean_df_3 = pd.merge(
    alert_id_merger,
    alert_df,
    how="left",
    left_on="join_key_value",
    right_on="document_id",
)

In [None]:
# Label-keyed rows that found no alert through the surrogate id (no description).
unmatched_label_rows = clean_df["description"].isnull()
notification_columns = [
    "event_date",
    "event_timestamp",
    "event_name",
    "user_id",
    "join_field",
    "join_key_value",
]
clean_df_2 = clean_df.loc[unmatched_label_rows, notification_columns]

# Keep only the rows that did match in clean_df.
clean_df = clean_df.dropna(subset=["description"])

# Second attempt for the unmatched rows: match join_key_value against document_id
# and keep only the rows that found an alert this time (created_at present).
clean_df_4 = pd.merge(
    clean_df_2,
    alert_df,
    how="left",
    left_on="join_key_value",
    right_on="document_id",
).dropna(subset=["created_at"])

In [None]:
# Discard alertId-keyed rows that did not match any alert (no description).
clean_df_3 = clean_df_3.dropna(subset=["description"])

In [None]:
# Stack the label-keyed matches (surrogate-id matches plus document-id fallback) ...
label_keyed_frames = [clean_df, clean_df_4]
almost_finished_df = pd.concat(label_keyed_frames, axis=0)

# ... and then append the alertId-keyed matches. Original row indices are kept.
finished_df = pd.concat([almost_finished_df, clean_df_3], axis=0)

## Dropping irrelevant columns

In [None]:
# Columns considered irrelevant for the exported dataset.
irrelevant_columns = [
    "event_date",
    "join_field",
    "join_key_value",
    "is_global",
    "corazon_surrogate_id",
    "notification_label_id",
]
export_df = finished_df.drop(columns=irrelevant_columns)

## Fixing the event_name column:

In [None]:
# Normalise the event name "notification_receive" to "notification_received"
# (exact matches only).
# The result is assigned back instead of calling replace(..., inplace=True) on
# the column selection: that in-place call operates on an intermediate object,
# triggers a FutureWarning in pandas 2.x and leaves export_df unchanged once
# Copy-on-Write is enabled.
export_df["event_name"] = export_df["event_name"].replace(
    "notification_receive", "notification_received"
)

In [None]:
# Normalise the event name "notification_open" to "notification_opened"
# (exact matches only).
# The result is assigned back instead of calling replace(..., inplace=True) on
# the column selection: that in-place call operates on an intermediate object,
# triggers a FutureWarning in pandas 2.x and leaves export_df unchanged once
# Copy-on-Write is enabled.
export_df["event_name"] = export_df["event_name"].replace(
    "notification_open", "notification_opened"
)

## Feature engineering

In [None]:
# Scale the raw timestamps by 1000 and parse them as nanoseconds since the epoch
# (presumably the raw values are microseconds — TODO confirm).
# Note: re-running this cell on an already-converted column is not supported.
event_timestamp_ns = export_df["event_timestamp"] * 1000
export_df["event_timestamp"] = pd.to_datetime(event_timestamp_ns, unit="ns")

In [None]:
# Flag per event:
#   NaN -> the event is "notification_received" (nothing viewed yet)
#   1   -> the event happened before the alert's closed_at
#   0   -> otherwise (including when the comparison with closed_at is False)
export_df["notif_viewed_ontime"] = [
    np.nan
    if event_name == "notification_received"
    else (1 if event_time < closed_time else 0)
    for event_name, event_time, closed_time in zip(
        export_df["event_name"],
        export_df["event_timestamp"],
        export_df["closed_at"],
    )
]

In [None]:
# Keep only rows that have an on-time flag, renumber the index from 0,
# and store the flag as an integer now that no NaN remains.
export_df = export_df.dropna(subset=["notif_viewed_ontime"]).reset_index(drop=True)
export_df["notif_viewed_ontime"] = export_df["notif_viewed_ontime"].astype(int)

In [None]:
# Reaction time in minutes:
#   - events flagged as viewed on time are measured from the alert's created_at,
#   - all other events are measured from the alert's closed_at.
# total_seconds() is used on purpose: Timedelta.seconds only returns the
# seconds *component* (0..86399) and silently drops whole days, which gives
# wrong values for gaps longer than 24 hours and for negative gaps.
viewed_on_time = export_df["notif_viewed_ontime"] == 1
since_created = export_df["event_timestamp"] - export_df["created_at"]
since_closed = export_df["event_timestamp"] - export_df["closed_at"]
elapsed = since_created.where(viewed_on_time, since_closed)
export_df["reaction_time"] = elapsed.dt.total_seconds() / 60

In [None]:
# Binary engagement flag derived from the event name:
# 1 for opened / view-alternatives / share, 0 for received / dismiss.
opened_flag_by_event = {
    "notification_opened": 1,
    "notification_view_alternatives": 1,
    "notification_share": 1,
    "notification_received": 0,
    "notification_dismiss": 0,
}
# Event names outside the mapping become NaN, so the integer cast raises for them.
export_df["opened"] = export_df["event_name"].map(opened_flag_by_event)
export_df["opened"] = export_df["opened"].astype(int)

## Exporting the cleaned df to a file, so that later work can skip the cleaning part.

In [None]:
# Renumber the rows 0..n-1 and discard the previous index before exporting.
export_df = export_df.reset_index(drop=True)

In [None]:
# Write the cleaned frame as column-oriented JSON.
cleaned_json_path = "../data/cleaned_data.json"
export_df.to_json(cleaned_json_path, orient="columns")

In [None]:
!pip3 install pyarrow
import pyarrow.feather as feather
feather.write_feather(export_df, "../data/cleaned_data.feather")

In [None]:
# Reload the cleaned frame from the Feather file.
cleaned_feather_path = "../data/cleaned_data.feather"
export_df = feather.read_feather(cleaned_feather_path)