In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

from tqdm import tqdm

from multiprocessing import Pool

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [2]:
from pandas_profiling import ProfileReport
# Lift the column limit so displayed DataFrames show every column.
pd.set_option("display.max_columns", None)

# Construcción de las secuencias

### Organización de las sesiones por fecha

In [26]:
# Read the two training CSVs (purchases first, then sessions) into DataFrames.
train_purchases, train_sessions = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Data/dressipi_recsys2022/train_purchases.csv',
        '../Data/dressipi_recsys2022/train_sessions.csv',
    )
)

In [29]:
# Inspect the raw sessions frame (the rendered output is truncated by pandas).
train_sessions

Unnamed: 0,session_id,item_id,date
0,3,9655,2020-12-18 21:25:00.373
1,3,9655,2020-12-18 21:19:48.093
2,13,15654,2020-03-13 19:35:27.136
3,18,18316,2020-08-26 19:18:30.833
4,18,2507,2020-08-26 19:16:31.211
...,...,...,...
4743815,4440001,20409,2020-10-30 23:37:20.658
4743816,4440001,14155,2020-10-30 23:31:56.607
4743817,4440001,14303,2020-10-30 23:36:17.934
4743818,4440001,27852,2020-10-30 23:39:55.186


In [30]:
# Check the dtype of the `date` column as loaded from the CSV.
train_sessions.date

0          2020-12-18 21:25:00.373
1          2020-12-18 21:19:48.093
2          2020-03-13 19:35:27.136
3          2020-08-26 19:18:30.833
4          2020-08-26 19:16:31.211
                    ...           
4743815    2020-10-30 23:37:20.658
4743816    2020-10-30 23:31:56.607
4743817    2020-10-30 23:36:17.934
4743818    2020-10-30 23:39:55.186
4743819    2020-10-30 23:27:37.873
Name: date, Length: 4743820, dtype: object

In [34]:
# Convert the `date` column to pandas datetimes so it can be ordered chronologically.
train_sessions["date"] = pd.to_datetime(train_sessions["date"])

In [32]:
# Check the dtype of the `date` column after the datetime conversion.
train_sessions.date

0         2020-12-18 21:25:00.373
1         2020-12-18 21:19:48.093
2         2020-03-13 19:35:27.136
3         2020-08-26 19:18:30.833
4         2020-08-26 19:16:31.211
                    ...          
4743815   2020-10-30 23:37:20.658
4743816   2020-10-30 23:31:56.607
4743817   2020-10-30 23:36:17.934
4743818   2020-10-30 23:39:55.186
4743819   2020-10-30 23:27:37.873
Name: date, Length: 4743820, dtype: datetime64[ns]

In [35]:
# Order every interaction chronologically. The sorted frame is rebound to the
# same name instead of being mutated with `inplace=True`, so the cell reads as
# an explicit "input -> output" step.
train_sessions = train_sessions.sort_values(by='date')
train_sessions

Unnamed: 0,session_id,item_id,date
4005618,3747794,11234,2020-01-01 00:00:01.359
3699175,3458777,3868,2020-01-01 00:00:21.440
4005616,3747794,9214,2020-01-01 00:00:22.294
4649622,4350716,10861,2020-01-01 00:00:48.505
4005614,3747794,27092,2020-01-01 00:02:14.538
...,...,...,...
2007607,1876524,18969,2021-05-31 23:41:37.372
3927982,3674124,2267,2021-05-31 23:41:49.965
2007606,1876524,11226,2021-05-31 23:42:01.479
3927983,3674124,23412,2021-05-31 23:43:02.747


In [41]:
# Distinct session ids in ascending order, plus a quick sanity print of the
# first ten ids and of how many sessions there are.
sessions_ids = sorted(train_sessions.session_id.unique())
print(sessions_ids[:10])
print(len(sessions_ids))

[3, 13, 18, 19, 24, 28, 31, 36, 42, 44]
1000000


## Implementación mononúcleo

## Implementación multinúcleo

In [139]:
def generate_secuence(session_id):
    """Build the training sequence for a single session.

    Reads the notebook-level ``train_purchases`` and ``train_sessions``
    DataFrames.

    Args:
        session_id: Value matched against the ``session_id`` column of both
            frames.

    Returns:
        A 4-tuple ``(session_id, item_id_purchase, items_ids_history,
        items_ids_history_dates)``. ``item_id_purchase`` is the ``item_id`` of
        the first ``train_purchases`` row for the session. The two lists hold
        the ``item_id`` values and the ``date`` values (cast to ``str``) of the
        session's rows in ``train_sessions``, ordered by ``date``.

    Raises:
        IndexError: If ``train_purchases`` has no row for ``session_id``.
    """
    purchase_item_ids = train_purchases.loc[
        train_purchases.session_id == session_id, 'item_id'
    ].values
    if len(purchase_item_ids) == 0:
        raise IndexError(f'no purchase found for session_id {session_id!r}')
    item_id_purchase = purchase_item_ids[0]

    # Scan the sessions frame once and reuse the slice for both output lists
    # instead of re-evaluating the same boolean mask twice.
    session_rows = train_sessions.loc[train_sessions.session_id == session_id]
    # Order explicitly rather than depending on `train_sessions` having been
    # sorted by another cell. The sort is stable, so rows of an already
    # sorted frame keep exactly their current order.
    session_rows = session_rows.sort_values(by='date', kind='mergesort')

    items_ids_history = list(session_rows.item_id)
    items_ids_history_dates = list(session_rows.date.astype(str))

    return session_id, item_id_purchase, items_ids_history, items_ids_history_dates

In [140]:
%%time
# Fan the per-session work out over a pool of 24 worker processes; `map`
# returns the results in the same order as `sessions_ids`.
# NOTE(review): the workers read the notebook-level DataFrames used by
# `generate_secuence` — presumably this relies on a fork-based start method; confirm.
with Pool(24) as my_pool:
    result_sequences = my_pool.map(generate_secuence, sessions_ids)

CPU times: user 42.8 s, sys: 1.72 s, total: 44.5 s
Wall time: 26min 34s


In [141]:
# Transpose the list of per-session 4-tuples into four parallel tuples, one per field.
sessions_ids_result, target_item_ids, historical_items_ids, historical_items_ids_dates = zip(*result_sequences)

In [147]:
# Peek at the first five session ids.
sessions_ids_result[0:5]

(3, 13, 18, 19, 24)

In [148]:
# Peek at the purchased (target) item id of the first five sessions.
target_item_ids[0:5]

(15085, 18626, 24911, 12534, 13226)

In [149]:
# Peek at the item-id history of the first five sessions.
historical_items_ids[0:5]

([9655, 9655],
 [15654],
 [4026, 2507, 18316],
 [19896,
  27937,
  12804,
  25772,
  6341,
  23687,
  8316,
  8281,
  8813,
  1755,
  27638,
  20033,
  25555,
  4385,
  18936,
  6704,
  8268],
 [2927, 2927, 16064, 11662, 434, 18539, 10414, 28075, 18476])

In [151]:
# Peek at the matching date strings of the first five sessions.
historical_items_ids_dates[0:5]

(['2020-12-18 21:19:48.093', '2020-12-18 21:25:00.373'],
 ['2020-03-13 19:35:27.136'],
 ['2020-08-26 19:15:47.232',
  '2020-08-26 19:16:31.211',
  '2020-08-26 19:18:30.833'],
 ['2020-11-02 16:30:36.378',
  '2020-11-02 16:30:48.207',
  '2020-11-02 16:31:05.749',
  '2020-11-02 16:31:18.543',
  '2020-11-02 16:34:33.794',
  '2020-11-02 16:35:46.685',
  '2020-11-02 16:36:10.084',
  '2020-11-02 16:38:04.484',
  '2020-11-02 16:39:36.928',
  '2020-11-02 16:40:12.193',
  '2020-11-02 16:41:36.202',
  '2020-11-02 16:42:02.287',
  '2020-11-02 16:43:04.022',
  '2020-11-02 16:44:56.169',
  '2020-11-02 16:47:17.523',
  '2020-11-02 16:47:36.119',
  '2020-11-02 16:48:39.343'],
 ['2020-02-26 17:22:48.903',
  '2020-02-26 17:23:04.840',
  '2020-02-26 17:26:05.585',
  '2020-02-26 17:26:29.178',
  '2020-02-26 17:27:11.894',
  '2020-02-26 18:18:48.152',
  '2020-02-26 18:21:51.891',
  '2020-02-26 18:22:34.101',
  '2020-02-26 18:24:32.770'])

In [160]:
# One row per session: its id, the purchased item, and the two parallel
# history lists (item ids and their date strings).
secuencias_entrenamiento_df = pd.DataFrame(dict(
    session_id=sessions_ids_result,
    item_id_salida=target_item_ids,
    secuencia_items_ids=historical_items_ids,
    secuencia_fechas=historical_items_ids_dates,
))

secuencias_entrenamiento_df

Unnamed: 0,session_id,item_id_salida,secuencia_items_ids,secuencia_fechas
0,3,15085,"[9655, 9655]","[2020-12-18 21:19:48.093, 2020-12-18 21:25:00...."
1,13,18626,[15654],[2020-03-13 19:35:27.136]
2,18,24911,"[4026, 2507, 18316]","[2020-08-26 19:15:47.232, 2020-08-26 19:16:31...."
3,19,12534,"[19896, 27937, 12804, 25772, 6341, 23687, 8316...","[2020-11-02 16:30:36.378, 2020-11-02 16:30:48...."
4,24,13226,"[2927, 2927, 16064, 11662, 434, 18539, 10414, ...","[2020-02-26 17:22:48.903, 2020-02-26 17:23:04...."
...,...,...,...,...
999995,4439986,2915,"[23502, 13914, 13403, 19310, 12373, 27733]","[2021-05-13 11:46:56.606, 2021-05-13 11:48:21...."
999996,4439990,8786,"[22093, 17429, 1389, 1131, 4062, 26011, 4028, ...","[2020-08-22 11:38:48.785, 2020-08-22 11:39:20...."
999997,4439994,21630,[25357],[2020-11-27 20:08:37.262]
999998,4439999,16962,"[6007, 8502, 13056, 2173, 8433, 1299, 15853]","[2020-11-27 10:52:12.577, 2020-11-27 10:53:09...."


In [161]:
# Persist the training sequences; index=False leaves the row index out of the file.
# NOTE(review): the two list-valued columns are presumably written as their
# string repr and will need parsing when the CSV is read back — confirm.
secuencias_entrenamiento_df.to_csv('../Data/secuencias_entrenamiento.csv', index=False)

# Secuencias de test para leaderboard

In [3]:
# Load the leaderboard test sessions and display the raw frame.
leaderboard_sessions = pd.read_csv('../Data/dressipi_recsys2022/test_leaderboard_sessions.csv')
leaderboard_sessions

Unnamed: 0,session_id,item_id,date
0,26,19185,2021-06-16 09:53:54.158
1,200,17089,2021-06-25 12:23:40.811
2,200,17089,2021-06-25 12:24:36.631
3,200,8060,2021-06-25 12:24:41.677
4,200,4758,2021-06-25 12:24:50.692
...,...,...,...
229349,4439653,25955,2021-06-11 10:22:57.47
229350,4439653,12179,2021-06-11 10:23:00.663
229351,4439757,2078,2021-06-30 11:42:15.073
229352,4439757,2078,2021-06-30 11:43:13.725


In [4]:
# Parse `date` into datetimes and order the interactions chronologically.
# Written as a non-mutating chain (no `inplace=True`, no in-place column
# assignment) so the step has one explicit input and one explicit output.
leaderboard_sessions = (
    leaderboard_sessions
    .assign(date=lambda df: pd.to_datetime(df["date"]))
    .sort_values(by='date')
)
leaderboard_sessions

Unnamed: 0,session_id,item_id,date
88990,1719459,18156,2021-06-01 00:01:01.019
193239,3740715,12205,2021-06-01 00:01:36.571
135216,2614780,8060,2021-06-01 00:01:36.689
88991,1719459,21138,2021-06-01 00:01:37.836
182243,3540851,23315,2021-06-01 00:02:28.161
...,...,...,...
26443,502146,12179,2021-06-30 23:47:57.863
60446,1164531,25699,2021-06-30 23:48:34.124
60447,1164531,19245,2021-06-30 23:48:41.642
60448,1164531,17574,2021-06-30 23:48:51.731


In [5]:
# Distinct leaderboard session ids in ascending order, plus a quick sanity
# print of the first ten ids and of how many sessions there are.
sessions_ids_test_learderboard = sorted(leaderboard_sessions.session_id.unique())
print(sessions_ids_test_learderboard[:10])
print(len(sessions_ids_test_learderboard))

[26, 200, 205, 495, 521, 587, 721, 810, 886, 1178]
50000


In [9]:
def generate_secuence_test_leaderboard(session_id):
    """Build the item/date sequence of one leaderboard test session.

    Reads the notebook-level ``leaderboard_sessions`` DataFrame.

    Args:
        session_id: Value matched against the ``session_id`` column.

    Returns:
        A 3-tuple ``(session_id, items_ids_history, items_ids_history_dates)``
        with the ``item_id`` values and the ``date`` values (cast to ``str``)
        of the session's rows, ordered by ``date``. Both lists are empty when
        no row matches.
    """
    # Scan the frame once and reuse the slice for both output lists instead of
    # re-evaluating the same boolean mask twice.
    session_rows = leaderboard_sessions.loc[leaderboard_sessions.session_id == session_id]
    # Order explicitly rather than depending on the frame having been sorted by
    # another cell. The sort is stable, so rows of an already sorted frame
    # keep exactly their current order.
    session_rows = session_rows.sort_values(by='date', kind='mergesort')

    items_ids_history = list(session_rows.item_id)
    items_ids_history_dates = list(session_rows.date.astype(str))

    return session_id, items_ids_history, items_ids_history_dates

In [10]:
%%time
# Fan the per-session work out over a pool of 24 worker processes; `map`
# returns the results in the same order as `sessions_ids_test_learderboard`.
# NOTE(review): the workers read the notebook-level `leaderboard_sessions`
# frame — presumably this relies on a fork-based start method; confirm.
with Pool(24) as my_pool:
    result_sequences_test_leaderboard = my_pool.map(generate_secuence_test_leaderboard, sessions_ids_test_learderboard)

CPU times: user 415 ms, sys: 91.2 ms, total: 506 ms
Wall time: 2.47 s


In [11]:
# Transpose the list of per-session 3-tuples into three parallel tuples, one per field.
sessions_ids_result_test_leaderboard, historical_items_ids_test_leaderboard, historical_items_ids_dates_test_leaderboard = zip(*result_sequences_test_leaderboard)

In [12]:
# One row per leaderboard session: its id and the two parallel history lists
# (item ids and their date strings).
secuencias_test_leaderboard_df = pd.DataFrame(dict(
    session_id=sessions_ids_result_test_leaderboard,
    secuencia_items_ids=historical_items_ids_test_leaderboard,
    secuencia_fechas=historical_items_ids_dates_test_leaderboard,
))

secuencias_test_leaderboard_df

Unnamed: 0,session_id,secuencia_items_ids,secuencia_fechas
0,26,[19185],[2021-06-16 09:53:54.158]
1,200,"[17089, 17089, 8060, 4758]","[2021-06-25 12:23:40.811, 2021-06-25 12:24:36...."
2,205,[8194],[2021-06-11 00:28:07.058]
3,495,[6853],[2021-06-14 22:13:06.741]
4,521,[26471],[2021-06-19 13:50:03.090]
...,...,...,...
49995,4439446,"[20770, 20770, 21396]","[2021-06-11 17:16:12.243, 2021-06-11 17:16:19...."
49996,4439458,"[11715, 26499]","[2021-06-09 04:52:02.785, 2021-06-09 04:54:45...."
49997,4439550,[19086],[2021-06-02 17:42:40.481]
49998,4439653,"[4813, 14075, 14075, 4813, 12179, 2769, 16479,...","[2021-06-11 10:19:35.472, 2021-06-11 10:19:39...."


In [13]:
# Persist the leaderboard sequences; index=False leaves the row index out of the file.
# NOTE(review): the two list-valued columns are presumably written as their
# string repr and will need parsing when the CSV is read back — confirm.
secuencias_test_leaderboard_df.to_csv('../Data/secuencias_test_leaderboard_df.csv', index=False)

# Secuencias de test para final

In [14]:
# Load the final test sessions, parse `date` into datetimes and order the
# interactions chronologically. Written as a single non-mutating chain (no
# `inplace=True`), so re-running the cell always starts from the CSV.
final_sessions = (
    pd.read_csv('../Data/dressipi_recsys2022/test_final_sessions.csv')
    .assign(date=lambda df: pd.to_datetime(df["date"]))
    .sort_values(by='date')
)
final_sessions

Unnamed: 0,session_id,item_id,date
64022,1261706,25389,2021-06-01 00:00:09.399
109254,2136231,4981,2021-06-01 00:00:16.520
64023,1261706,21310,2021-06-01 00:00:31.451
31384,621029,11226,2021-06-01 00:00:33.528
121673,2377643,12901,2021-06-01 00:00:34.844
...,...,...,...
79023,1557119,366,2021-06-30 23:20:46.049
47094,920628,13095,2021-06-30 23:20:59.132
176665,3475398,289,2021-06-30 23:22:19.131
54146,1051350,11693,2021-06-30 23:32:32.921


In [15]:
# Distinct final-test session ids in ascending order, plus a quick sanity
# print of the first ten ids and of how many sessions there are.
sessions_ids_test_final = sorted(final_sessions.session_id.unique())
print(sessions_ids_test_final[:10])
print(len(sessions_ids_test_final))

[61, 96, 185, 224, 285, 400, 580, 660, 663, 792]
50000


In [16]:
def generate_secuence_test_final(session_id):
    """Build the item/date sequence of one final test session.

    Reads the notebook-level ``final_sessions`` DataFrame.

    Args:
        session_id: Value matched against the ``session_id`` column.

    Returns:
        A 3-tuple ``(session_id, items_ids_history, items_ids_history_dates)``
        with the ``item_id`` values and the ``date`` values (cast to ``str``)
        of the session's rows, ordered by ``date``. Both lists are empty when
        no row matches.
    """
    # Scan the frame once and reuse the slice for both output lists instead of
    # re-evaluating the same boolean mask twice.
    session_rows = final_sessions.loc[final_sessions.session_id == session_id]
    # Order explicitly rather than depending on the frame having been sorted by
    # another cell. The sort is stable, so rows of an already sorted frame
    # keep exactly their current order.
    session_rows = session_rows.sort_values(by='date', kind='mergesort')

    items_ids_history = list(session_rows.item_id)
    items_ids_history_dates = list(session_rows.date.astype(str))

    return session_id, items_ids_history, items_ids_history_dates

In [17]:
%%time
# Fan the per-session work out over a pool of 24 worker processes; `map`
# returns the results in the same order as `sessions_ids_test_final`.
# NOTE(review): the workers read the notebook-level `final_sessions` frame —
# presumably this relies on a fork-based start method; confirm.
with Pool(24) as my_pool:
    result_sequences_test_final = my_pool.map(generate_secuence_test_final, sessions_ids_test_final)

CPU times: user 380 ms, sys: 112 ms, total: 492 ms
Wall time: 2.45 s


In [18]:
# Transpose the list of per-session 3-tuples into three parallel tuples, one per field.
sessions_ids_result_test_final, historical_items_ids_test_final, historical_items_ids_dates_test_final = zip(*result_sequences_test_final)

In [19]:
# One row per final-test session: its id and the two parallel history lists
# (item ids and their date strings).
secuencias_test_final_df = pd.DataFrame(dict(
    session_id=sessions_ids_result_test_final,
    secuencia_items_ids=historical_items_ids_test_final,
    secuencia_fechas=historical_items_ids_dates_test_final,
))

secuencias_test_final_df

Unnamed: 0,session_id,secuencia_items_ids,secuencia_fechas
0,61,[27088],[2021-06-01 08:12:39.664]
1,96,"[11693, 18298, 4738, 495, 6871]","[2021-06-19 17:48:05.227, 2021-06-19 17:49:08...."
2,185,"[17618, 21330, 21330, 17618, 4983]","[2021-06-07 15:53:21.640, 2021-06-07 15:53:29...."
3,224,"[24665, 11917]","[2021-06-14 10:31:39.990, 2021-06-14 16:03:12...."
4,285,[15073],[2021-06-29 15:33:39.601]
...,...,...,...
49995,4439646,"[1883, 23252, 20208, 6786]","[2021-06-19 18:41:05.529, 2021-06-19 18:41:48...."
49996,4439648,[7154],[2021-06-14 08:03:19.024]
49997,4439675,[23067],[2021-06-01 12:21:07.959]
49998,4439868,[26085],[2021-06-16 22:18:27.509]


In [20]:
# Persist the final-test sequences; index=False leaves the row index out of the file.
# NOTE(review): the two list-valued columns are presumably written as their
# string repr and will need parsing when the CSV is read back — confirm.
secuencias_test_final_df.to_csv('../Data/secuencias_test_final_df.csv', index=False)