In [2]:
import pandas as pd
import numpy as np
import plotly as px
import plotly.express as px
from typing import List, Tuple, Dict, Any, Optional
import seaborn as sns
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
import os
import plotly.graph_objects as go
from scipy.spatial import Delaunay

# Bin edges applied (via pd.cut) to a session's maximum cumulative time.
# The final edge is a catch-all upper bound.
SESSION_TIME_BINS = [30, 35, 40, 45, 50, 60, 70, 90, 120, 180, 1_000_000]

# Bin edges applied (via pd.cut) to a user's maximum session index:
# one edge per value from 1 to 20, then two large catch-all edges.
USER_SESSION_BINS = list(range(1, 21)) + [10000, 1_000_000]

# Prediction columns to evaluate; the trailing integer is parsed as the window size.
MODEL_LIST = [f'LSTM SEQ {window}' for window in (1, 10, 20, 30, 40)]

In [3]:
# Directory the evaluation parquet inputs are read from and the metric CSVs are written to.
CORE_PATH = 'auroc_eval_data'

In [4]:
def generate_sess(df, models, func, target, bins=None):
    """Score each model per session-length bin and experiment.

    Every row is assigned the maximum ``cum_session_time_raw`` of its
    (``user_id``, ``session_30_raw``) group, that maximum is binned with
    ``pd.cut``, the model columns are melted to long form, and ``func`` is
    evaluated once per (session_bin, model, experiment) group.

    The caller's ``df`` is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``user_id``, ``session_30_raw``, ``cum_session_time_raw``,
        ``label``, ``experiment`` and every column named in ``models``.
    models : list of str
        Prediction columns to score. Each name must end with a space followed
        by an integer, which is parsed into the ``Window`` column.
    func : callable
        Metric called as ``func(labels, values)`` when ``target == 'auroc'``
        and as ``func(labels, values, zero_division=1)`` otherwise.
    target : str
        Name of the output metric column. Any value other than ``'auroc'``
        causes predictions to be thresholded at 0.5 before scoring.
    bins : sequence of numbers, optional
        Bin edges handed to ``pd.cut``; each interval is labelled with its
        left edge. Defaults to the module-level ``SESSION_TIME_BINS``.

    Returns
    -------
    pandas.DataFrame
        Columns ``session_bin``, ``model``, ``experiment``, ``<target>`` and
        ``Window``.

    Notes
    -----
    Rows whose maximum session time falls outside ``bins`` get a missing bin
    and are removed by the ``dropna`` step, together with any row that has a
    missing label, experiment or prediction.
    """
    if bins is None:
        bins = SESSION_TIME_BINS

    print('Identifying max session time')
    # transform keeps one value per input row, so no merge back onto df is needed.
    max_session_time = df.groupby(['user_id', 'session_30_raw'])['cum_session_time_raw'].transform('max')

    print('Binning sessions')
    session_bin = pd.cut(max_session_time, bins=bins, labels=bins[:-1])
    # Column selection yields a new frame, so the caller's df is left untouched.
    # `.values` makes the assignment positional (safe when df has a duplicated index).
    df_session_time = df[['label', 'experiment'] + list(models)].assign(session_bin=session_bin.values)

    print('Sessions binned: performing melt')
    df_session_time_melted = df_session_time.melt(
        id_vars=['label', 'session_bin', 'experiment'],
    )

    print('Sessions melted: dropping na')
    df_session_time_melted = df_session_time_melted.dropna()
    df_session_time_melted = df_session_time_melted.rename(columns={'variable': 'model'})

    if target != 'auroc':
        print('Converting to binary')
        # Vectorised equivalent of "1 if x > 0.5 else 0"; NaNs were dropped above.
        df_session_time_melted['value'] = (df_session_time_melted['value'] > 0.5).astype(int)

    print('Grouping by session bin, model, and experiment')
    if target == 'auroc':
        print('Performing auroc')
        metric_kwargs = {}
    else:
        print(f'Performing {target}: zero division set to 1')
        metric_kwargs = {'zero_division': 1}

    scored = (
        df_session_time_melted
        .groupby(['session_bin', 'model', 'experiment'])
        .apply(lambda grp: func(grp['label'], grp['value'], **metric_kwargs))
        .reset_index()
    )

    print('Performing final metadata operations')
    # The unnamed result of apply() becomes column 0 after reset_index().
    scored = scored.rename(columns={0: target})
    scored['Window'] = scored['model'].apply(lambda x: int(x.split(' ')[-1]))
    return scored

def generate_plat(df, models, func, target, bins=None, max_bin=20):
    """Score each model per platform-usage bin and experiment.

    Every row is assigned the maximum ``session_30_raw`` of its user, that
    maximum is binned with ``pd.cut``, bin labels are capped at ``max_bin``,
    the model columns are melted to long form, and ``func`` is evaluated once
    per (user_bin, model, experiment) group.

    The caller's ``df`` is not modified (earlier versions added
    ``max_sessions`` and ``user_bin`` columns to it as a side effect).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``user_id``, ``session_30_raw``, ``label``,
        ``experiment`` and every column named in ``models``.
    models : list of str
        Prediction columns to score. Each name must end with a space followed
        by an integer, which is parsed into the ``Window`` column.
    func : callable
        Metric called as ``func(labels, values)`` when ``target == 'auroc'``
        and as ``func(labels, values, zero_division=1)`` otherwise.
    target : str
        Name of the output metric column. Any value other than ``'auroc'``
        causes predictions to be thresholded at 0.5 before scoring.
    bins : sequence of numbers, optional
        Bin edges handed to ``pd.cut``; each interval is labelled with its
        left edge. Defaults to the module-level ``USER_SESSION_BINS``.
    max_bin : number, optional
        Upper cap applied to the bin labels (default 20), so every label
        above it is folded into a single top bin.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_bin``, ``model``, ``experiment``, ``<target>`` and
        ``Window``.

    Notes
    -----
    Users whose maximum session index falls outside ``bins`` get a missing
    bin and are removed by the ``dropna`` step. ``pd.cut`` intervals are
    right-closed, so a value equal to the first edge is outside the bins.
    NOTE(review): with the default edges starting at 1 this excludes users
    whose maximum ``session_30_raw`` is exactly 1 -- confirm that is intended.
    """
    if bins is None:
        bins = USER_SESSION_BINS

    print('Binning users based on platform time')
    max_sessions = df.groupby(['user_id'])['session_30_raw'].transform('max')

    user_bin = pd.cut(max_sessions, bins=bins, labels=bins[:-1])
    # Fold every label above the cap into the top bin.
    user_bin = user_bin.apply(lambda x: min(max_bin, x))

    # Column selection yields a new frame, so the caller's df is left untouched.
    # `.values` makes the assignment positional (safe when df has a duplicated index).
    df_user_time = df[['label', 'experiment'] + list(models)].assign(user_bin=user_bin.values)

    print('Performing melt')
    df_user_time = df_user_time.melt(
        id_vars=['label', 'user_bin', 'experiment']
    )

    print('Dropping na and renaming')
    df_user_time = df_user_time.dropna()
    df_user_time = df_user_time.rename(columns={'variable': 'model'})

    if target != 'auroc':
        print('Converting to binary')
        # Vectorised equivalent of "1 if x > 0.5 else 0"; NaNs were dropped above.
        df_user_time['value'] = (df_user_time['value'] > 0.5).astype(int)

    print('Grouping by user bin, model, and experiment')
    if target == 'auroc':
        print('Performing auroc')
        metric_kwargs = {}
    else:
        print(f'Performing {target}: zero division set to 1')
        metric_kwargs = {'zero_division': 1}

    scored = (
        df_user_time
        .groupby(['user_bin', 'model', 'experiment'])
        .apply(lambda grp: func(grp['label'], grp['value'], **metric_kwargs))
        .reset_index()
    )

    print('Performing final metadata operations')
    # The unnamed result of apply() becomes column 0 after reset_index().
    scored = scored.rename(columns={0: target})
    scored['Window'] = scored['model'].apply(lambda x: int(x.split(' ')[-1]))
    return scored

In [5]:
def plot_contours(df, xtitle, ytitle, ztitle, target_var, save_suffix, title):
    """Render a metric as a coloured 3-D mesh, save it as a PNG and display it.

    The mesh vertices are the rows of ``df``; triangles come from a Delaunay
    triangulation of the (x, y) coordinates only, and each vertex is coloured
    by the ``target_var`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the coordinate and metric columns.
    xtitle, ytitle, ztitle : str
        Names of the columns used for the x, y and z coordinates. They select
        data only; the axis titles shown on the figure are fixed strings.
    target_var : str
        Name of the column used as the mesh intensity (colour).
    save_suffix : str
        File stem of the PNG written under ``lstm_plots/metric_graphs/granular/``.
    title : str
        Currently unused: the call that would set the figure title is
        commented out below.

    Notes
    -----
    The output directory is created if it does not exist yet.
    NOTE(review): the x-axis title is always "Session Minutes", even when
    ``xtitle`` is a different column (e.g. ``'user_bin'``) -- confirm whether
    the label should follow the plotted column.
    """
    x_var, y_var, z_var = df[xtitle], df[ytitle], df[ztitle]
    intensity = df[target_var]

    # Triangulate in the x/y plane; each simplex row holds three vertex indices.
    plane_points = np.vstack([x_var, y_var]).T
    I, J, K = Delaunay(plane_points).simplices.T

    fig = go.Figure(go.Mesh3d(x=x_var, y=y_var, z=z_var, i=I, j=J, k=K, intensity=intensity, colorscale='thermal', alphahull=.5))
    fig.update_layout(
        height=800,
        width=1200,
        showlegend=True,
        margin=dict(l=5, r=5, t=5, b=5),
        scene=dict(
            xaxis_title="Session Minutes",
            yaxis_title="Window Size",
            zaxis_title="Experiment",
        ),
    )

    # fig.update_layout(title=title)
    output_path = f'lstm_plots/metric_graphs/granular/{save_suffix}.png'
    # write_image does not create missing folders, so make sure the target exists.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    # NOTE(review): the export is 800 wide x 1200 high while the layout above is
    # 1200 wide x 800 high -- confirm the swapped dimensions are intentional.
    fig.write_image(output_path, width=800, height=1200, scale=3)

    fig.show()
    

In [6]:
# Load the window-30 evaluation frame from CORE_PATH.
# The window-10 and window-20 loads below are currently disabled.
# df_auc_10 = pd.read_parquet(os.path.join(CORE_PATH, 'data_for_auc_window_10_eval'))
# df_auc_20 = pd.read_parquet(os.path.join(CORE_PATH, 'data_for_auc_window_20_eval'))
df_auc_30 = pd.read_parquet(os.path.join(CORE_PATH, 'data_for_auc_window_30_eval'))

# Disabled: tagging each frame with the experiment it came from.
# df_auc_10['experiment'] = 10
# df_auc_20['experiment'] = 20
# df_auc_30['experiment'] = 30

# NOTE(review): `total_auc_df` is only built by the commented-out line below, yet a
# later cell passes it to generate_plat. On a fresh kernel that call raises NameError
# unless this block is re-enabled -- TODO confirm whether the concat should be restored.
# total_auc_df = pd.concat([df_auc_10, df_auc_20, df_auc_30])

In [8]:
# One row per (user, session) holding the largest cumulative session time observed.
df_auc_30_user_session_max = (
    df_auc_30
    .groupby(['user_id', 'session_30_raw'], as_index=False)['cum_session_time_raw']
    .max()
)

In [10]:
# Fraction of (user, session) rows whose maximum cumulative session time exceeds 60.
df_auc_30_user_session_max[df_auc_30_user_session_max['cum_session_time_raw'] > 60].shape[0] / df_auc_30_user_session_max.shape[0]

0.1441476144489415

In [11]:
# Distinct user ids present in the per-session maxima table.
unique_users = df_auc_30_user_session_max['user_id'].unique()

In [12]:
# Fraction of users with at least one session whose maximum cumulative time exceeds 60.
df_auc_30_user_session_max[df_auc_30_user_session_max['cum_session_time_raw'] > 60]['user_id'].unique().shape[0] / unique_users.shape[0]

0.19013357524947602

In [6]:
# print('Generating session time auroc')
# eval_auc_sess_glob = generate_sess(total_auc_df, MODEL_LIST, roc_auc_score, 'auroc')
# print('Generating session time precision')
# eval_prec_sess_glob = generate_sess(total_auc_df, MODEL_LIST, precision_score, 'precision')
# print('Generating session time recall')
# eval_rec_sess_glob = generate_sess(total_auc_df, MODEL_LIST, recall_score, 'recall')



In [24]:
# Compute AUROC, recall and precision per platform-usage bin for every model in MODEL_LIST.
# NOTE(review): `total_auc_df` is not assigned by any active code in the visible cells
# (its construction is commented out in the load cell), so this cell depends on
# leftover kernel state -- confirm and restore the definition.
print('Generating platform time auroc')
eval_auc_plat_glob = generate_plat(total_auc_df, MODEL_LIST, roc_auc_score, 'auroc')
print('Generating platform time recall')
eval_rec_plat_glob = generate_plat(total_auc_df, MODEL_LIST, recall_score, 'recall')
print('Generating platform time precision')
eval_prec_plat_glob = generate_plat(total_auc_df, MODEL_LIST, precision_score, 'precision')

Generating platform time auroc
Binning users based on platform time
Performing melt
Dropping na and renaming
Grouping by user bin, model, and experiment
Performing auroc
Performing final metadata operations
Generating platform time recall
Binning users based on platform time
Performing melt
Dropping na and renaming
Converting to binary
Grouping by user bin, model, and experiment
Performing recall: zero division set to 1
Performing final metadata operations
Generating platform time precision
Binning users based on platform time
Performing melt
Dropping na and renaming
Converting to binary
Grouping by user bin, model, and experiment
Performing precision: zero division set to 1
Performing final metadata operations


In [41]:
# Persist the platform-time metric tables under CORE_PATH; the session-time writes are disabled.
# to_csv is called without index=False, so the row index is written as an extra leading column.
# eval_auc_sess_glob.to_csv(os.path.join(CORE_PATH, 'eval_auc_sess_glob.csv'))
eval_auc_plat_glob.to_csv(os.path.join(CORE_PATH, 'eval_auc_plat_glob.csv'))
# eval_prec_sess_glob.to_csv(os.path.join(CORE_PATH, 'eval_prec_sess_glob.csv'))
eval_prec_plat_glob.to_csv(os.path.join(CORE_PATH, 'eval_prec_plat_glob.csv'))
# eval_rec_sess_glob.to_csv(os.path.join(CORE_PATH, 'eval_rec_sess_glob.csv'))
eval_rec_plat_glob.to_csv(os.path.join(CORE_PATH, 'eval_rec_plat_glob.csv'))

In [42]:
# Reload the session-time metric tables from CSV.
# NOTE(review): the statements that would write these three files are commented out in
# this notebook, so the files must already exist on disk from an earlier run.
eval_auc_sess_glob = pd.read_csv(os.path.join(CORE_PATH, 'eval_auc_sess_glob.csv'))
eval_prec_sess_glob = pd.read_csv(os.path.join(CORE_PATH, 'eval_prec_sess_glob.csv'))
eval_rec_sess_glob = pd.read_csv(os.path.join(CORE_PATH, 'eval_rec_sess_glob.csv'))

In [43]:
# AUROC by session-time bin, window size and experiment.
plot_contours(
    df=eval_auc_sess_glob,
    xtitle='session_bin',
    ytitle='Window',
    ztitle='experiment',
    target_var='auroc',
    save_suffix='eval_auroc_sess_glob',
    title='ROC | Session Time',
)

In [38]:
# AUROC by platform-usage bin, window size and experiment.
plot_contours(
    df=eval_auc_plat_glob,
    xtitle='user_bin',
    ytitle='Window',
    ztitle='experiment',
    target_var='auroc',
    save_suffix='eval_auroc_plat_glob',
    title='ROC | Platform Time',
)

In [13]:
# plot_contours(
#     eval_prec_sess_glob,
#     'session_bin',
#     'Window',
#     'experiment',
#     'precision',
#     'eval_prec_sess_glob',
#     'Precision | Session Time'
# )

In [39]:
# Precision by platform-usage bin, window size and experiment.
plot_contours(
    df=eval_prec_plat_glob,
    xtitle='user_bin',
    ytitle='Window',
    ztitle='experiment',
    target_var='precision',
    save_suffix='eval_prec_plat_glob',
    title='Precision | Platform Time',
)

In [14]:
# plot_contours(
#     eval_rec_sess_glob,
#     'session_bin',
#     'Window',
#     'experiment',
#     'recall',
#     'eval_rec_sess_glob',
#     'Recall | Session Time'
# )


In [40]:
# Recall by platform-usage bin, window size and experiment.
plot_contours(
    df=eval_rec_plat_glob,
    xtitle='user_bin',
    ytitle='Window',
    ztitle='experiment',
    target_var='recall',
    save_suffix='eval_rec_plat_glob',
    title='Recall | Platform Time',
)

In [29]:

# Write the per-split AUROC tables next to the granular plots.
# NOTE(review): eval_auc_sess, test_auc_sess, eval_auc_plat and test_auc_plat are not
# assigned by any cell visible in this notebook, so this cell relies on leftover
# kernel state -- confirm where they are produced.
eval_auc_sess.to_csv('lstm_plots/metric_graphs/granular/eval_auc_sess.csv')
test_auc_sess.to_csv('lstm_plots/metric_graphs/granular/test_auc_sess.csv')
eval_auc_plat.to_csv('lstm_plots/metric_graphs/granular/eval_auc_plat.csv')
test_auc_plat.to_csv('lstm_plots/metric_graphs/granular/test_auc_plat.csv')

In [15]:
# plot_contours(
#     eval_auc_sess,
#     'session_bin',
#     'Window',
#     'experiment',
#     'auroc',
#     'eval_auroc_sess',
#    'Evaluation AUROC | Session Size'
# )


In [16]:
# plot_contours(
#     test_auc_sess,
#     'session_bin',
#     'Window',
#     'experiment',
#     'auroc',
#     'test_auroc_sess',
#     'Test AUROC | Session Size'
# )

In [19]:
# print('identifying evaluation precision session')
# eval_prec_sess = generate_sess(eval_auc_df, MODEL_LIST, precision_score, 'precision')
# print('identifying test precision session')
# test_prec_sess = generate_sess(test_auc_df, MODEL_LIST, precision_score, 'precision')

# print('identifying evaluation precision platform')
# eval_prec_plat = generate_plat(eval_auc_df, MODEL_LIST, precision_score, 'precision')
# print('identifying test precision platform')
# test_prec_plat = generate_plat(test_auc_df, MODEL_LIST, precision_score, 'precision')


identifying evaluation precision session
Identifying max session time
Binning sessions
Sessions binned: performing melt
Sessions melted: dropping na
Converting to binary
Grouping by session bin, model, and experiment
Performing precision: zero division set to 1
Performing final metadata operations
identifying test precision session
Identifying max session time
Binning sessions
Sessions binned: performing melt
Sessions melted: dropping na
Converting to binary
Grouping by session bin, model, and experiment
Performing precision: zero division set to 1
Performing final metadata operations
identifying evaluation precision platform
Binning users based on platform time
Performing melt
Dropping na and renaming
Converting to binary
Grouping by user bin, model, and experiment
Performing precision: zero division set to 1
Performing final metadata operations
identifying test precision platform
Binning users based on platform time
Performing melt
Dropping na and renaming
Converting to binary
Groupi

In [30]:
# eval_prec_sess.to_csv('lstm_plots/metric_graphs/granular/eval_prec_sess.csv')
# test_prec_sess.to_csv('lstm_plots/metric_graphs/granular/test_prec_sess.csv')
# eval_prec_plat.to_csv('lstm_plots/metric_graphs/granular/eval_prec_plat.csv')
# test_prec_plat.to_csv('lstm_plots/metric_graphs/granular/test_prec_plat.csv')

In [20]:
# plot_contours(
#     eval_prec_sess.copy(),
#     'session_bin',
#     'Window',
#     'experiment',
#     'precision',
#     'eval_prec_sess',
#     'Evaluation Precision | Session Size'
# )

In [21]:
# plot_contours(
#     test_prec_sess.copy(),
#     'session_bin',
#     'Window',
#     'experiment',
#     'precision',
#     'test_prec_sess',
#     'Test Precision | Session Size'
# )

In [24]:
# print('identifying evaluation recall session')
# eval_rec_sess = generate_sess(eval_auc_df, MODEL_LIST, recall_score, 'recall')
# print('identifying test recall session')
# test_rec_sess = generate_sess(test_auc_df, MODEL_LIST, recall_score, 'recall')

# print('identifying evaluation recall platform')
# eval_rec_plat = generate_plat(eval_auc_df, MODEL_LIST, recall_score, 'recall')
# print('identifying test recall platform')
# test_rec_plat = generate_plat(test_auc_df, MODEL_LIST, recall_score, 'recall')



identifying evaluation recall session
Identifying max session time
Binning sessions
Sessions binned: performing melt
Sessions melted: dropping na
Converting to binary
Grouping by session bin, model, and experiment
Performing recall: zero division set to 1
Performing final metadata operations
identifying test recall session
Identifying max session time
Binning sessions
Sessions binned: performing melt
Sessions melted: dropping na
Converting to binary
Grouping by session bin, model, and experiment
Performing recall: zero division set to 1
Performing final metadata operations
identifying evaluation recall platform
Binning users based on platform time
Performing melt
Dropping na and renaming
Converting to binary
Grouping by user bin, model, and experiment
Performing recall: zero division set to 1
Performing final metadata operations
identifying test recall platform
Binning users based on platform time
Performing melt
Dropping na and renaming
Converting to binary
Grouping by user bin, model

In [31]:
# eval_rec_sess.to_csv('lstm_plots/metric_graphs/granular/eval_rec_sess.csv')
# test_rec_sess.to_csv('lstm_plots/metric_graphs/granular/test_rec_sess.csv')
# eval_rec_plat.to_csv('lstm_plots/metric_graphs/granular/eval_rec_plat.csv')
# test_rec_plat.to_csv('lstm_plots/metric_graphs/granular/test_rec_plat.csv') 

In [25]:
# plot_contours(
#     eval_rec_sess.copy(),
#     'session_bin',
#     'Window',
#     'experiment',
#     'recall',
#     'eval_rec_sess',
#     'Evaluation Recall | Session Size'
# )


In [26]:
# plot_contours(
#     test_rec_sess.copy(),
#     'session_bin',
#     'Window',
#     'experiment',
#     'recall',
#     'test_rec_sess',
#     'Test Recall | Session Size'
# )