In [1]:
#
# The MIT License (MIT)

# Copyright (c) 2021, NVIDIA CORPORATION

# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#

## Library packages

In [2]:
import  pandas as pd 
import numpy as np
# import xgboost as xgb
import argparse
import os 
import json
import random
import time

from sigir_ecom_challenge_code.evaluation_sigir import cart_abandonment_metric
from sigir_ecom_challenge_code.submission.uploader import upload_submission

import sklearn.metrics as metrics
import matplotlib.pyplot as plt 

import yaml 
import glob
from sklearn.metrics import f1_score


In [3]:
# Function for display dataframe info quickly
def di(df, rows=5):
    display(df.shape)
    display(df.head(rows))



### Data paths

In [4]:
preds_path = '/workspace/SIGIR-ecom-data-challenge/script/eda/submitted_predictions/'

In [5]:
i = 0
preds_files = []
for root, dirs, files in os.walk(preds_path):
     for file in files:
            if 'ensemble_predictions' in file:
#                 print(i+1, os.path.join(root, file))
                preds_files.append(os.path.join(root, file))
                
# all_preds = pd.concat([pd.read_parquet(file) for file in preds_files])

In [6]:
i = 0
preds = pd.read_parquet(preds_files[0]).rename(columns={"predictions": "preds_"+str(i)})
preds = preds.set_index('session_id_hash')
for file in preds_files[1:]:
    df = pd.read_parquet(file)
    preds = preds.merge(df[['session_id_hash', 'predictions']].rename(columns={"predictions": "preds_"+str(i+1)}).set_index('session_id_hash'), left_index=True, right_index=True)
    i+=1

In [7]:
def pred_to_level(x):
    if x>=thres:
        return 1
    else:
        return 0

In [8]:
preds.head(2)

Unnamed: 0_level_0,preds_0,nb_after_add-last,preds_1,preds_2,preds_3,preds_4,preds_5,preds_6,preds_7,preds_8,preds_9,preds_10,preds_11,preds_12,preds_13,preds_14,preds_15,preds_16
session_id_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
4ff0745a026ef4fdd17e15dd88bafba67b40e5edb2133a1e2a29c75f0919083d,1.9e-05,0.0,0.035373,1e-06,0.041376,0.060514,0.001236,0.126883,0.000188,0.007995,0.005289,0.001119,0.16082,0.141987,0.011216,0.03661,0.038104,0.061674
4ff2c98c26c25b62f7b6d601f4d3571f74a85c901b5598bf17911f357fef7f29,0.013812,2.0,0.019685,0.002678,0.0261,0.016511,0.040106,0.295859,0.008736,0.066712,0.060427,0.037367,0.175184,0.160285,0.033929,0.012173,0.02121,0.065882


In [9]:
preds = preds.reset_index()
preds = preds[['session_id_hash', 'nb_after_add-last', 'preds_0', 'preds_1', 'preds_2',
       'preds_3', 'preds_4', 'preds_5', 'preds_6', 'preds_7', 'preds_8',
       'preds_9', 'preds_10', 'preds_11', 'preds_12', 'preds_13', 'preds_14', 'preds_15']]
preds.head(2)

Unnamed: 0,session_id_hash,nb_after_add-last,preds_0,preds_1,preds_2,preds_3,preds_4,preds_5,preds_6,preds_7,preds_8,preds_9,preds_10,preds_11,preds_12,preds_13,preds_14,preds_15
0,4ff0745a026ef4fdd17e15dd88bafba67b40e5edb2133a...,0.0,1.9e-05,0.035373,1e-06,0.041376,0.060514,0.001236,0.126883,0.000188,0.007995,0.005289,0.001119,0.16082,0.141987,0.011216,0.03661,0.038104
1,4ff2c98c26c25b62f7b6d601f4d3571f74a85c901b5598...,2.0,0.013812,0.019685,0.002678,0.0261,0.016511,0.040106,0.295859,0.008736,0.066712,0.060427,0.037367,0.175184,0.160285,0.033929,0.012173,0.02121


In [10]:
for col in preds.columns[2:]:
    c_df = preds[preds[col]>0.5]
#     print(col, c_df[c_df['nb_after_add-last']==2].shape)

In [11]:
lbs = [3.634640474, 3.633469479, 3.634615097, 3.633670376, 3.632208409, 3.628215283, 3.634738797, 
       3.625526504, 3.626060329, 3.626461076, 
       3.63, 3.63, 3.63, 3.63, 3.63, 3.63 ]
# Just setting lb as 3.63 for some model for filtering them out. 3.63 doesn't reflect their true lb score.

In [12]:
t_cols = ['session_id_hash', 'nb_after_add-last']
i = 0
for col in preds.columns[2:]:
    if lbs[i]>3.63:
        # Keeping score over 3.63
        t_cols.append(col)
    i+=1
t_df = preds[t_cols]
t_df.head(2)

Unnamed: 0,session_id_hash,nb_after_add-last,preds_0,preds_1,preds_2,preds_3,preds_4,preds_6
0,4ff0745a026ef4fdd17e15dd88bafba67b40e5edb2133a...,0.0,1.9e-05,0.035373,1e-06,0.041376,0.060514,0.126883
1,4ff2c98c26c25b62f7b6d601f4d3571f74a85c901b5598...,2.0,0.013812,0.019685,0.002678,0.0261,0.016511,0.295859


In [13]:
thres = 0.5
print('nb_after_add | Rows  | DLRM 18 | DLRM 22 | DLRM 28 | balanced | XGB 13 WT | XGB 10 NT |  ')
print('---------------------------------------------------------------------------------------')
for val in np.unique(t_df['nb_after_add-last'].values):
    c_df = t_df[t_df['nb_after_add-last']==val].reset_index(drop=True)
    print('%12r | %5r | %7r | %7r | %8r | %9r | %9r | %7r |'
          %(val, c_df.shape[0], c_df[c_df['preds_1']>=thres].shape[0],  c_df[c_df['preds_2']>=thres].shape[0], 
            c_df[c_df['preds_0']>=thres].shape[0], c_df[c_df['preds_3']>=thres].shape[0],
           c_df[c_df['preds_4']>=thres].shape[0], c_df[c_df['preds_6']>=thres].shape[0]))
    print()
print('%12r | %5r | %0.5f | %0.5f | %0.6f | %0.6f  | %0.6f  | %0.5f |'
          %('LB Score', 'NA', lbs[1], lbs[2], lbs[0], lbs[3], lbs[4], lbs[6] ))

nb_after_add | Rows  | DLRM 18 | DLRM 22 | DLRM 28 | balanced | XGB 13 WT | XGB 10 NT |  
---------------------------------------------------------------------------------------
         0.0 | 26921 |       1 |      43 |      106 |        22 |        16 |     102 |

         2.0 |  8254 |       2 |      45 |       65 |         0 |        34 |      66 |

         4.0 |  4725 |      16 |      42 |       64 |         0 |        11 |      51 |

         6.0 |  3331 |      51 |      40 |       69 |        48 |        19 |      44 |

         8.0 |  2478 |      43 |      20 |       49 |        28 |        18 |      44 |

        10.0 |  2002 |      51 |      19 |       43 |        30 |        15 |      35 |

  'LB Score' |  'NA' | 3.63347 | 3.63462 | 3.634640 | 3.633670  | 3.632208  | 3.63474 |


### Mean preds value

In [14]:
fName = 'Te_GE_mean_values_05_lb_greater_63_DLRM28_Balanced_XGB10_XGB13.parquet'
t_df['predictions'] = t_df[t_df.columns[2:]].mean(axis=1)
thres = 0.5
t_df['predictions'] = t_df['predictions'].apply(pred_to_level)
t_df[['session_id_hash', 'predictions', 'nb_after_add-last']].to_parquet(fName, index=False)
t_df[t_df['predictions']>0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  t_df['predictions'] = t_df[t_df.columns[2:]].mean(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  t_df['predictions'] = t_df['predictions'].apply(pred_to_level)


Unnamed: 0,session_id_hash,nb_after_add-last,preds_0,preds_1,preds_2,preds_3,preds_4,preds_6,predictions
33,49c0552adee7f7d3f368aac77e71e1eb9763dbb3a31242...,6.0,0.932199,0.588894,0.490192,0.597320,0.624779,0.516676,1
1131,120b25673f877132f9891560a00838aeb8c9359ff910c5...,2.0,0.880691,0.149207,0.629292,0.194120,0.669184,0.509797,1
1187,399abac015e4320499b164629391576e3bdae814c859d0...,6.0,0.966857,0.542023,0.508697,0.412153,0.531172,0.590181,1
2757,45d4c96cbe580c2ea04b342178f565a8b7e62a96b49758...,6.0,0.983746,0.283141,0.679955,0.329565,0.630533,0.405900,1
2878,4ca7fbfe096fa72878d25f6b4b62381d5d4326d6e0fe47...,10.0,0.895151,0.510215,0.640882,0.525157,0.575686,0.555091,1
...,...,...,...,...,...,...,...,...,...
44613,e971258dfaf37a77fa86da849665624be847530e4c7dfb...,8.0,0.891221,0.328121,0.461412,0.418700,0.628962,0.493450,1
46893,e69cfa8ca4135f4d081e609777033c84bd8c71500ecba8...,6.0,0.960147,0.339147,0.637630,0.378800,0.601379,0.581422,1
47327,da299113611f08839dd26ce4f44271b0832e14e30eb024...,6.0,0.994733,0.280884,0.694771,0.359082,0.443402,0.515787,1
47431,edf9dec9f928a2bd6a37ae42336448e7ab7eb7e20caed8...,4.0,0.747973,0.579953,0.554240,0.437925,0.489723,0.579347,1


### Voting with different 1's

In [15]:
t_df = t_df[['session_id_hash', 'nb_after_add-last', 'preds_0', 'preds_3', 'preds_4', 'preds_6']]
# t_df.head()

In [16]:
thres = 0.5
for col in t_df.columns[2:]:
    t_df[col] = t_df[col].apply(pred_to_level)
preds.head()

Unnamed: 0,session_id_hash,nb_after_add-last,preds_0,preds_1,preds_2,preds_3,preds_4,preds_5,preds_6,preds_7,preds_8,preds_9,preds_10,preds_11,preds_12,preds_13,preds_14,preds_15
0,4ff0745a026ef4fdd17e15dd88bafba67b40e5edb2133a...,0.0,1.9e-05,0.035373,1e-06,0.041376,0.060514,0.001236,0.126883,0.000188,0.007995,0.005289,0.001119,0.16082,0.141987,0.011216,0.03661,0.038104
1,4ff2c98c26c25b62f7b6d601f4d3571f74a85c901b5598...,2.0,0.013812,0.019685,0.002678,0.0261,0.016511,0.040106,0.295859,0.008736,0.066712,0.060427,0.037367,0.175184,0.160285,0.033929,0.012173,0.02121
2,4ff2e475315650ab1c9937325d68771ae2c23f135a3d57...,0.0,3.6e-05,0.03078,1e-06,0.043173,0.060514,0.000996,0.105027,0.000191,0.007538,0.005031,0.001086,0.16082,0.141987,0.01114,0.028344,0.031471
3,4ff30d014d3c6a3e17ff5bb6eaec683e32eafdc394e6dc...,8.0,0.013703,0.253431,0.008262,0.128816,0.046368,0.044089,0.356488,0.011791,0.071774,0.071292,0.069119,0.237375,0.303918,0.207004,0.068987,0.117613
4,1715ee238300b8bbdd4d852351fda66aaaf5a1c059b614...,6.0,0.11254,0.08939,0.114204,0.048144,0.118846,0.386277,0.429219,0.070843,0.289962,0.352367,0.412753,0.320492,0.291576,0.364998,0.020571,0.036096


In [17]:
for col in t_df.columns[2:]:
    print(t_df[col].sum())

396
128
113
342


In [18]:
ones = []
for k in range(len(t_df)):
    sums = t_df[t_df.columns[2:]].iloc[k].sum()
    ones.append(sums)
t_df['ones'] = ones

In [19]:
# Creating prediction file with different number of 1's prediction. 
# Change the value 
num_ones = 3
t_df['predictions'] = 0
for k in range(len(t_df)):
    if t_df['ones'][k]>num_ones:
        t_df.at[k, 'predictions'] = 1

### Creating prediction files with 1's prediction

In [20]:
fName = 'GE_majority_1s_05_lb_greater_63_DLRM28_Balanced_XGB10_XGB13.parquet'
t_df['predictions'] = t_df[t_df.columns[2:]].mean(axis=1)
thres = 0.5
t_df['predictions'] = t_df['predictions'].apply(pred_to_level)
t_df[['session_id_hash', 'predictions', 'nb_after_add-last']].to_parquet(fName, index=False)
t_df[t_df['predictions']>0].head(2)

Unnamed: 0,session_id_hash,nb_after_add-last,preds_0,preds_3,preds_4,preds_6,ones,predictions
13,27e8b68a8cccd0d0df0d640b935743100fc418dca1a171...,2.0,0,0,1,1,2,1
33,49c0552adee7f7d3f368aac77e71e1eb9763dbb3a31242...,6.0,1,1,1,1,4,1


In [23]:
def generate_submission_file(test_predictions_df, threshold, data_path): 
    #load json file 
    with open(os.path.join(data_path, "intention_test_phase_2.json")) as json_file:
        # read the test cases from the provided file
        test_queries = json.load(json_file)
        test_df = pd.json_normalize(test_queries, 'query', 'nb_after_add')
        test_df = test_df.drop_duplicates('session_id_hash')
#         test_df = test_df[test_df['nb_after_add']<10].reset_index().drop(columns=['index'])
        
        
    assert len(test_predictions_df) == len(test_df)
    #merge predictions frame and provided test_df to insure same order of sessions 
    test_df = test_df.merge(test_predictions_df, on='session_id_hash', how='left')
    
    
    preds = (test_df.predictions.values > threshold).reshape(-1).astype(int).tolist()
    print("Number of purchases predicted in test set is: %s" %np.sum(preds))
    
    # Convert to required prediction format
    preds = [{'label':pred} for pred in preds]
    
    
    local_prediction_file = "{}_{}.json".format(
        'gspmoreira_gmail.com'.replace("@", "_"), round(time.time() * 1000)
    )
    
    local_prediction_file_path = os.path.join(
        '/workspace/SIGIR-ecom-data-challenge/script/xgboost', local_prediction_file
    )
    print("Generating JSON file with predictions")
    with open(local_prediction_file_path, "w") as fp:
        json.dump(preds, fp, indent=2)

    return local_prediction_file_path

In [24]:
Data_PATH = '/workspace/SIGIR-ecom-data-challenge/data/coveo_task2_v3_balanced_phase2/xgboost_data'
final_sub = t_df[['session_id_hash', 'predictions', 'nb_after_add-last']]
generate_submission_file(final_sub, 0.5, Data_PATH)

Number of purchases predicted in test set is: 191
Generating JSON file with predictions


'/workspace/SIGIR-ecom-data-challenge/script/xgboost/gspmoreira_gmail.com_1625686420863.json'

In [25]:
# local_prediction_file_path = generate_submission_file(final_sub, 0.5, Data_PATH)
local_prediction_file_path = '/workspace/SIGIR-ecom-data-challenge/script/xgboost/gspmoreira_gmail.com_1625686420863.json'


In [26]:
print(f"************* Uploading the submission file *************")
TASK = "cart"  # 'rec' or 'cart'
upload_submission(local_file=local_prediction_file_path, task=TASK)

************* Uploading the submission file *************
Starting submission at 2021-07-07 19:34:12.282033...


All done at 2021-07-07 19:34:13.014168: see you, space cowboy!
