In [50]:
import pandas as pd
import numpy as np

In [51]:
df_test = pd.read_csv('test.csv')

In [52]:
def explode_position_scalable(df_in, col_expl, pipe='|'):
    """Explode column col_expl of array type into multiple rows."""
    
    n = 60000  #chunk row size
    list_df = [df_in[i:i+n] for i in range(0,df_in.shape[0],n)]
    # Handle the first alone to have the correct headers
    df_x = explode_position(list_df[0], col_expl, flag_conversion=False, pipe_type=pipe)
    df_x = df_x.rename(columns={col_expl:'item_id'})
    df_x.to_csv('exploded.csv', index=False)
    list_df.pop(0)
    for df_x in list_df:
        df_x = explode_position(df_x, col_expl, flag_conversion=False, pipe_type=pipe)
        df_x = df_x.rename(columns={col_expl:'item_id'})
        df_x.to_csv('exploded.csv', mode='a', header=False, index=False)

    df_out = pd.read_csv('exploded.csv')

    os.remove('exploded.csv')
    return df_out
  
def explode_position(df_in, col_expl, flag_conversion = True, pipe_type='|'):
    """Explode column col_expl of array type into multiple rows."""

    df = df_in.copy()
    if(pipe_type== '|'):
        df.loc[:, col_expl] = df[col_expl].apply(string_to_array)
    elif(pipe_type==' '):
        df.loc[:, col_expl] = df[col_expl].apply(space_to_array)
    df.loc[:, 'position'] = df[col_expl].apply(get_position)
    df_out = pd.DataFrame(
        {col: np.repeat(df[col].values,
                        df[col_expl].str.len())
         for col in df.columns.drop(col_expl)}
    )
    df_out.loc[:, col_expl] = np.concatenate(df[col_expl].values)
    df_out.loc[:, 'position'] = np.concatenate(df['position'].values)
    if(flag_conversion):
        df_out.loc[:, col_expl] = df_out[col_expl].apply(int)

    return df_out
  
def string_to_array(s):
    """Convert pipe separated string to array."""

    if isinstance(s, str):
        out = s.split("|")
    elif math.isnan(s):
        out = []
    else:
        raise ValueError("Value must be either string of nan")
    return out
  
def get_position(l):
    pos = []
    for i in range(0,len(l)):
        pos.append(i)
    return pos

def create_recent_index(df_orig, grouped=False):
    # distinct_hotel = group.reference.drop_duplicates().values
    # dict = {}
    # counter = 0
    # for x in distinct_hotel:
    #     dict[x] = counter
    #     counter += 1
    df_list_int = df_orig.groupby('session_id').apply(lambda x: get_list_session_interactions(x)).reset_index(name='all_interactions')
    df_list_int = df_list_int[['session_id', 'all_interactions']]
    if(grouped):
        return df_list_int
    df_orig = (df_orig.merge(df_list_int, left_on=['session_id'], right_on=['session_id'], how="left"))
    #del df_orig['all_interactions']
    return df_orig
  
def get_list_session_interactions(group):
    group = group[~group['reference'].isnull()]
    group = group[group['action_type'] == 'clickout item']
    group.loc[:,'reference'] = group['reference'].apply(str)
    list_values = group.reference.drop_duplicates()
    joined = " ".join(list_values)
    return joined

def get_submission_target(df):
    """Identify target rows with missing click outs."""

    mask = df["reference"].isnull() & (df["action_type"] == "clickout item")
    df_out = df[mask]

    return df_out

In [26]:
def get_clickouts(df_test):
    df_test['step_max'] = df_test.groupby(['user_id'])['step'].transform(max)
    df_clickout = df_test[(df_test['step_max'] == df_test['step']) & (df_test['action_type'] == 'clickout item')]
    del df_clickout['step_max']
    return df_clickout

In [27]:
mask = (df_test["action_type"] == "clickout item") | (df_test["action_type"] == "interaction item rating") | (df_test["action_type"] == "interaction item image") | (df_test["action_type"] == "interaction item deals")
df_test = df_test[mask]

In [28]:
df_recent = create_recent_index(df_test)
df_recent

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices,all_interactions
0,03P4VFKK12UO,325fafb5fa450,1541107435,17,interaction item image,65685,US,"Bakersfield, USA",desktop,,,,65685
1,03P4VFKK12UO,325fafb5fa450,1541107435,18,interaction item image,65685,US,"Bakersfield, USA",desktop,,,,65685
2,03P4VFKK12UO,325fafb5fa450,1541107457,19,interaction item image,65685,US,"Bakersfield, USA",desktop,,,,65685
3,03P4VFKK12UO,325fafb5fa450,1541107457,20,interaction item image,65685,US,"Bakersfield, USA",desktop,,,,65685
4,03P4VFKK12UO,325fafb5fa450,1541107467,21,interaction item image,65685,US,"Bakersfield, USA",desktop,,,,65685
5,03P4VFKK12UO,325fafb5fa450,1541107467,22,interaction item image,65685,US,"Bakersfield, USA",desktop,,,,65685
6,03P4VFKK12UO,325fafb5fa450,1541107467,23,interaction item image,65685,US,"Bakersfield, USA",desktop,,,,65685
7,03P4VFKK12UO,325fafb5fa450,1541107467,24,interaction item image,65685,US,"Bakersfield, USA",desktop,,,,65685
8,03P4VFKK12UO,325fafb5fa450,1541107467,25,interaction item image,65685,US,"Bakersfield, USA",desktop,,,,65685
9,03P4VFKK12UO,325fafb5fa450,1541107467,26,interaction item image,65685,US,"Bakersfield, USA",desktop,,,,65685


In [29]:
def complete_prediction(df_test_nation):
    df_test_nation['item_recommendations'] = df_test_nation.apply(lambda x: fill_recs(x.impressions), axis=1)
    #print('No prediction for #' + str(df_missed.shape[0]) + 'items')
    df_out_nation = df_test_nation[['user_id', 'session_id', 'timestamp','step', 'item_recommendations']]
    return df_out_nation

def list_to_space_string(l):
    """Return a space separated string from a list"""
    s = " ".join(l)
    return s

def fill_recs(imp):
    l = imp.split('|')
    return list_to_space_string(l)

In [30]:
def complete_prediction_recent(df):
    df['item_recommendations'] = df.apply(lambda x: concat_interactions(x), axis=1)
    df_out = df[['user_id', 'session_id', 'timestamp','step', 'item_recommendations']]
    return df_out

def concat_interactions(row):
    spl_int = row.all_interactions.split(' ')
    spl_int = spl_int[::-1]
    spl_imp = row.impressions.split('|')
    result = []
    for i in spl_int:
        result.append(str(i))
    for i in spl_imp:
        if(str(i) not in result):
            result.append(str(i))
    return " ".join(result)

In [31]:
df_sub = get_submission_target(df_recent)

In [32]:
df_predicted = df_sub[df_sub['all_interactions'] != '']
df_not_predicted = df_sub[df_sub['all_interactions'] == '']

In [12]:
df_not_predicted = complete_prediction(df_not_predicted)
df_not_predicted.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,user_id,session_id,timestamp,step,item_recommendations
39,06S61EKCW1JY,22b9deb2da8f7,1541075258,1,6721 6724 40109 147227 80983 6719 40718 119405...
81,0NN8D1GWBW1F,84316dcbfb8e3,1541042711,1,9164598 1400686 2216204 2817576 4775794 568427...
84,0O1CTEP95YKQ,af793494b5f84,1541113537,22,69086 8533 925367 58949 61049 8581 8517 77520 ...
87,0T819ODEIRA4,7643893620070,1541085062,7,162011 52319 5170834 9332768 6398694 1830581 5...
88,0VNR91BTRLCP,0c086494b8b9b,1541062145,1,1204316 2140652 107674 1072440 2555490 2181166...


In [13]:
df_predicted = complete_prediction_recent(df_predicted)
df_predicted

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,user_id,session_id,timestamp,step,item_recommendations
38,03P4VFKK12UO,325fafb5fa450,1541107538,55,65685 1306936 56482 2842358 6881276 63259 6539...
80,09L0Y03JYTAC,91de23da01b00,1541034697,34,8561 8564 8568 8586 8588 102080 8613 55677 570...
107,0X3OMZZH37ZS,c0cdade1ed014,1541085912,8,434856 1556793 3206344 49682 1358429 658821 10...
138,2WEN1MLMALJM,d89f82d2b2538,1541069999,9,3508086 1632525 4050606 8404976 9169816 194295...
157,3WK1Z5XPGA43,48125fbfd5e9e,1541086351,15,3382114 4581320 3971456 1249659 5714694 829771...
281,5BR973YOX5QD,a24a3d2fb4543,1541081113,115,1708467 479176 1496061 513366 1627077 514881 8...
285,6IZO9K585LAZ,c3f91fb58c16f,1541085832,3,5750456 1544573 2391846 4898712 4029298 461989...
290,6U61SOZBFVB4,5bbc6b805c641,1541094213,5,112407 7204 7206 7215 15967 3566294 1892715 72...
406,6Z0VOF0I5B42,38bd410cd9d0a,1541062840,116,341471 49033 92964 3569682 107998 15238 107827...
425,75X35VP7UP3N,8d3ead9d73f28,1541084431,26,5708958 473691 4493756 10045128 103557 5788644...


In [14]:
df_out = pd.concat([df_predicted, df_not_predicted], ignore_index=True, sort=False)
#df_out.to_csv('sub_rule_based.csv')

In [15]:
df_out.head()

Unnamed: 0,user_id,session_id,timestamp,step,item_recommendations
0,03P4VFKK12UO,325fafb5fa450,1541107538,55,65685 1306936 56482 2842358 6881276 63259 6539...
1,09L0Y03JYTAC,91de23da01b00,1541034697,34,8561 8564 8568 8586 8588 102080 8613 55677 570...
2,0X3OMZZH37ZS,c0cdade1ed014,1541085912,8,434856 1556793 3206344 49682 1358429 658821 10...
3,2WEN1MLMALJM,d89f82d2b2538,1541069999,9,3508086 1632525 4050606 8404976 9169816 194295...
4,3WK1Z5XPGA43,48125fbfd5e9e,1541086351,15,3382114 4581320 3971456 1249659 5714694 829771...


In [48]:
import pandas as pd
import numpy as np
df_gt = pd.read_csv('gt.csv')
mask = (df_gt["action_type"] == "clickout item") | (df_gt["action_type"] == "interaction item rating") | (df_gt["action_type"] == "interaction item image") | (df_gt["action_type"] == "interaction item deals")
df_gt = df_gt[mask]
df_gt.head()

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
16,03P4VFKK12UO,325fafb5fa450,1541107435,17,interaction item image,65685,US,"Bakersfield, USA",desktop,,,
17,03P4VFKK12UO,325fafb5fa450,1541107435,18,interaction item image,65685,US,"Bakersfield, USA",desktop,,,
18,03P4VFKK12UO,325fafb5fa450,1541107457,19,interaction item image,65685,US,"Bakersfield, USA",desktop,,,
19,03P4VFKK12UO,325fafb5fa450,1541107457,20,interaction item image,65685,US,"Bakersfield, USA",desktop,,,
20,03P4VFKK12UO,325fafb5fa450,1541107467,21,interaction item image,65685,US,"Bakersfield, USA",desktop,,,


In [34]:
df_gt = df_gt[['user_id', 'session_id', 'timestamp', 'step', 'reference']]
df_gt.head()

Unnamed: 0,user_id,session_id,timestamp,step,reference
16,03P4VFKK12UO,325fafb5fa450,1541107435,17,65685
17,03P4VFKK12UO,325fafb5fa450,1541107435,18,65685
18,03P4VFKK12UO,325fafb5fa450,1541107457,19,65685
19,03P4VFKK12UO,325fafb5fa450,1541107457,20,65685
20,03P4VFKK12UO,325fafb5fa450,1541107467,21,65685


In [35]:
df_pred_cleaned = df_predicted[['user_id', 'session_id', 'timestamp', 'step', 'all_interactions']]
df_pred_cleaned.head()

Unnamed: 0,user_id,session_id,timestamp,step,all_interactions
38,03P4VFKK12UO,325fafb5fa450,1541107538,55,65685
84,0O1CTEP95YKQ,af793494b5f84,1541113537,22,63051 2851635
87,0T819ODEIRA4,7643893620070,1541085062,7,5170834
107,0X3OMZZH37ZS,c0cdade1ed014,1541085912,8,1358429
138,2WEN1MLMALJM,d89f82d2b2538,1541069999,9,3508086 4050606


In [36]:
df_merged = (df_pred_cleaned.merge(df_gt, left_on=['user_id', 'session_id', 'timestamp', 'step'], right_on=['user_id', 'session_id', 'timestamp', 'step'], how="left"))
df_merged.head()

Unnamed: 0,user_id,session_id,timestamp,step,all_interactions,reference
0,03P4VFKK12UO,325fafb5fa450,1541107538,55,65685,1320460
1,0O1CTEP95YKQ,af793494b5f84,1541113537,22,63051 2851635,8602
2,0T819ODEIRA4,7643893620070,1541085062,7,5170834,5170834
3,0X3OMZZH37ZS,c0cdade1ed014,1541085912,8,1358429,434856
4,2WEN1MLMALJM,d89f82d2b2538,1541069999,9,3508086 4050606,8404976


In [39]:
df_merged['result'] = df_merged.apply(lambda x: 1 if (str(x.reference) in x.all_interactions) else 0,axis=1)
df_merged.head()

Unnamed: 0,user_id,session_id,timestamp,step,all_interactions,reference,result
0,03P4VFKK12UO,325fafb5fa450,1541107538,55,65685,1320460,0
1,0O1CTEP95YKQ,af793494b5f84,1541113537,22,63051 2851635,8602,0
2,0T819ODEIRA4,7643893620070,1541085062,7,5170834,5170834,1
3,0X3OMZZH37ZS,c0cdade1ed014,1541085912,8,1358429,434856,0
4,2WEN1MLMALJM,d89f82d2b2538,1541069999,9,3508086 4050606,8404976,0


In [44]:
tot_prediction = df_merged.shape[0]
correct = df_merged['result'].sum()
print('Tutte: ' + str(tot_prediction))
print('Azzeccate: ' + str(correct))
print('Percentuale: ' + str(correct/tot_prediction))

Tutte: 48898
Azzeccate: 16854
Percentuale: 0.34467667389259277


In [43]:
df_merged['result'].sum()

16854

In [49]:
df_gt[df_gt['user_id'] == '6Z0VOF0I5B42']

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
408,6Z0VOF0I5B42,38bd410cd9d0a,1541061247,1,interaction item image,15238,FR,"Honfleur, France",mobile,,,
409,6Z0VOF0I5B42,38bd410cd9d0a,1541061247,2,interaction item image,15238,FR,"Honfleur, France",mobile,,,
410,6Z0VOF0I5B42,38bd410cd9d0a,1541061250,3,interaction item image,15238,FR,"Honfleur, France",mobile,,,
411,6Z0VOF0I5B42,38bd410cd9d0a,1541061250,4,interaction item image,15238,FR,"Honfleur, France",mobile,,,
412,6Z0VOF0I5B42,38bd410cd9d0a,1541061250,5,interaction item image,15238,FR,"Honfleur, France",mobile,,,
413,6Z0VOF0I5B42,38bd410cd9d0a,1541061252,6,interaction item image,15238,FR,"Honfleur, France",mobile,,,
414,6Z0VOF0I5B42,38bd410cd9d0a,1541061254,7,interaction item image,15238,FR,"Honfleur, France",mobile,,,
415,6Z0VOF0I5B42,38bd410cd9d0a,1541061256,8,interaction item image,15238,FR,"Honfleur, France",mobile,,,
416,6Z0VOF0I5B42,38bd410cd9d0a,1541061256,9,interaction item image,15238,FR,"Honfleur, France",mobile,,,
417,6Z0VOF0I5B42,38bd410cd9d0a,1541061256,10,interaction item image,15238,FR,"Honfleur, France",mobile,,,


In [45]:
df_merged.head(100)

Unnamed: 0,user_id,session_id,timestamp,step,all_interactions,reference,result
0,03P4VFKK12UO,325fafb5fa450,1541107538,55,65685,1320460,0
1,0O1CTEP95YKQ,af793494b5f84,1541113537,22,63051 2851635,8602,0
2,0T819ODEIRA4,7643893620070,1541085062,7,5170834,5170834,1
3,0X3OMZZH37ZS,c0cdade1ed014,1541085912,8,1358429,434856,0
4,2WEN1MLMALJM,d89f82d2b2538,1541069999,9,3508086 4050606,8404976,0
5,3CB5DVB0CC0S,776a6ed9cf3b7,1541075304,2,8186812,8186812,1
6,3WK1Z5XPGA43,48125fbfd5e9e,1541086351,15,9049620,4581320,0
7,4GVU9AT4RVBW,b42f4cd99975e,1541110335,8,42161 1890831 33166,33166,1
8,6Z0VOF0I5B42,38bd410cd9d0a,1541062840,116,107827 92964 49033,341471,0
9,75X35VP7UP3N,8d3ead9d73f28,1541084431,26,5708958 473691 4493756 103557 5788644 5167086,5708958,1
