In [25]:
import pandas as pd
import numpy as np

In [26]:
df_test = pd.read_csv('test.csv')

In [27]:
def explode_position_scalable(df_in, col_expl, pipe='|'):
    """Explode column col_expl of array type into multiple rows."""
    
    n = 60000  #chunk row size
    list_df = [df_in[i:i+n] for i in range(0,df_in.shape[0],n)]
    # Handle the first alone to have the correct headers
    df_x = explode_position(list_df[0], col_expl, flag_conversion=False, pipe_type=pipe)
    df_x = df_x.rename(columns={col_expl:'item_id'})
    df_x.to_csv('exploded.csv', index=False)
    list_df.pop(0)
    for df_x in list_df:
        df_x = explode_position(df_x, col_expl, flag_conversion=False, pipe_type=pipe)
        df_x = df_x.rename(columns={col_expl:'item_id'})
        df_x.to_csv('exploded.csv', mode='a', header=False, index=False)

    df_out = pd.read_csv('exploded.csv')

    os.remove('exploded.csv')
    return df_out
  
def explode_position(df_in, col_expl, flag_conversion = True, pipe_type='|'):
    """Explode column col_expl of array type into multiple rows."""

    df = df_in.copy()
    if(pipe_type== '|'):
        df.loc[:, col_expl] = df[col_expl].apply(string_to_array)
    elif(pipe_type==' '):
        df.loc[:, col_expl] = df[col_expl].apply(space_to_array)
    df.loc[:, 'position'] = df[col_expl].apply(get_position)
    df_out = pd.DataFrame(
        {col: np.repeat(df[col].values,
                        df[col_expl].str.len())
         for col in df.columns.drop(col_expl)}
    )
    df_out.loc[:, col_expl] = np.concatenate(df[col_expl].values)
    df_out.loc[:, 'position'] = np.concatenate(df['position'].values)
    if(flag_conversion):
        df_out.loc[:, col_expl] = df_out[col_expl].apply(int)

    return df_out
  
def string_to_array(s):
    """Convert pipe separated string to array."""

    if isinstance(s, str):
        out = s.split("|")
    elif math.isnan(s):
        out = []
    else:
        raise ValueError("Value must be either string of nan")
    return out
  
def get_position(l):
    pos = []
    for i in range(0,len(l)):
        pos.append(i)
    return pos

def create_recent_index(df_orig, grouped=False):
    # distinct_hotel = group.reference.drop_duplicates().values
    # dict = {}
    # counter = 0
    # for x in distinct_hotel:
    #     dict[x] = counter
    #     counter += 1
    df_list_int = df_orig.groupby('session_id').apply(lambda x: get_list_session_interactions(x)).reset_index(name='all_interactions')
    df_list_int = df_list_int[['session_id', 'all_interactions']]
    if(grouped):
        return df_list_int
    df_orig = (df_orig.merge(df_list_int, left_on=['session_id'], right_on=['session_id'], how="left"))
    #del df_orig['all_interactions']
    return df_orig
  
def get_list_session_interactions(group):
    group = group[~group['reference'].isnull()]
    group.loc[:,'reference'] = group['reference'].apply(str)
    list_values = group.reference.drop_duplicates()
    joined = " ".join(list_values)
    return joined

def get_submission_target(df):
    """Identify target rows with missing click outs."""

    mask = df["reference"].isnull() & (df["action_type"] == "clickout item")
    df_out = df[mask]

    return df_out

In [28]:
mask = (df_test["action_type"] == "clickout item") | (df_test["action_type"] == "interaction item rating") | (df_test["action_type"] == "interaction item image") | (df_test["action_type"] == "interaction item deals")
df_test = df_test[mask]

In [29]:
df_recent = create_recent_index(df_test)
df_recent

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices,all_interactions
0,03P4VFKK12UO,325fafb5fa450,1541107435,17,interaction item image,65685,US,"Bakersfield, USA",desktop,,,,65685
1,03P4VFKK12UO,325fafb5fa450,1541107435,18,interaction item image,65685,US,"Bakersfield, USA",desktop,,,,65685
2,03P4VFKK12UO,325fafb5fa450,1541107457,19,interaction item image,65685,US,"Bakersfield, USA",desktop,,,,65685
3,03P4VFKK12UO,325fafb5fa450,1541107457,20,interaction item image,65685,US,"Bakersfield, USA",desktop,,,,65685
4,03P4VFKK12UO,325fafb5fa450,1541107467,21,interaction item image,65685,US,"Bakersfield, USA",desktop,,,,65685
5,03P4VFKK12UO,325fafb5fa450,1541107467,22,interaction item image,65685,US,"Bakersfield, USA",desktop,,,,65685
6,03P4VFKK12UO,325fafb5fa450,1541107467,23,interaction item image,65685,US,"Bakersfield, USA",desktop,,,,65685
7,03P4VFKK12UO,325fafb5fa450,1541107467,24,interaction item image,65685,US,"Bakersfield, USA",desktop,,,,65685
8,03P4VFKK12UO,325fafb5fa450,1541107467,25,interaction item image,65685,US,"Bakersfield, USA",desktop,,,,65685
9,03P4VFKK12UO,325fafb5fa450,1541107467,26,interaction item image,65685,US,"Bakersfield, USA",desktop,,,,65685


In [30]:
df_sub = get_submission_target(df_recent)

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices,all_interactions
38,03P4VFKK12UO,325fafb5fa450,1541107538,55,clickout item,,US,"Bakersfield, USA",desktop,Very Good Rating|5 Star|4 Star|Hotel|Motel|Res...,1306936|56482|2842358|6881276|65685|63259|6539...,178|104|110|94|57|96|46|61|48|35|50|38|59|44|5...,65685
39,06S61EKCW1JY,22b9deb2da8f7,1541075258,1,clickout item,,FR,"Alicante, Spain",tablet,,6721|6724|40109|147227|80983|6719|40718|119405...,36|45|99|48|49|50|49|56|40|63|35|95|40|78|132|...,
80,09L0Y03JYTAC,91de23da01b00,1541034697,34,clickout item,,US,"Las Vegas, USA",mobile,,8561|8564|8568|8586|8588|102080|8613|55677|570...,17|37|29|35|97|23|31|62|31|82|52|170|109|97|12...,8561
81,0NN8D1GWBW1F,84316dcbfb8e3,1541042711,1,clickout item,,IN,"Bengaluru, India",tablet,,9164598|1400686|2216204|2817576|4775794|568427...,26|22|17|9|21|16|23|24|22|19|79|24|20|19|24|21...,
84,0O1CTEP95YKQ,af793494b5f84,1541113537,22,clickout item,,CO,"Las Vegas, USA",mobile,Breakfast Included|Sort by Price,69086|8533|925367|58949|61049|8581|8517|77520|...,59|66|78|86|87|90|90|92|93|98|99|100|106|108|1...,63051 2851635
87,0T819ODEIRA4,7643893620070,1541085062,7,clickout item,,CH,"Mühldorf am Inn, Germany",desktop,,162011|52319|5170834|9332768|6398694|1830581|5...,109|110|109|90|94|149|85|108|115|91|90|79|75|1...,5170834
88,0VNR91BTRLCP,0c086494b8b9b,1541062145,1,clickout item,,BE,"Voeren, Belgium",mobile,,1204316|2140652|107674|1072440|2555490|2181166...,100|84|49|80|53|100|77|99|111|95|99|82|174|75|...,
107,0X3OMZZH37ZS,c0cdade1ed014,1541085912,8,clickout item,,NL,"Souillac, France",desktop,,1556793|434856|3206344|49682|1358429|658821|10...,64|83|65|82|63|71|96|105|60|87|66|62|85|65|97|...,434856 1358429
108,1GHT8M98OWVP,c0b179075aebb,1541088730,1,clickout item,,SE,"Hudiksvall, Sweden",desktop,,2467772|226906|41426|226911|7110436|7095524|26...,96|159|145|105|145|144|155|41,
126,1V0LB3004RD7,7e95047bee063,1541076518,2,clickout item,,AU,"Bowen, Australia",mobile,,3940430|2208330|2730322|1424951|2597232|259525...,60|100|75|82|164|63|65|72|67|82|103|85|69|49|8...,


In [80]:
df_predicted = df_sub[df_sub['all_interactions'] != '']
df_not_predicted = df_sub[df_sub['all_interactions'] == '']

In [75]:
df_not_predicted = complete_prediction(df_not_predicted)
df_not_predicted.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,user_id,session_id,timestamp,step,item_recommendations
39,06S61EKCW1JY,22b9deb2da8f7,1541075258,1,6721 6724 40109 147227 80983 6719 40718 119405...
81,0NN8D1GWBW1F,84316dcbfb8e3,1541042711,1,9164598 1400686 2216204 2817576 4775794 568427...
88,0VNR91BTRLCP,0c086494b8b9b,1541062145,1,1204316 2140652 107674 1072440 2555490 2181166...
108,1GHT8M98OWVP,c0b179075aebb,1541088730,1,2467772 226906 41426 226911 7110436 7095524 26...
126,1V0LB3004RD7,7e95047bee063,1541076518,2,3940430 2208330 2730322 1424951 2597232 259525...


In [41]:
def complete_prediction(df_test_nation):
    df_test_nation['item_recommendations'] = df_test_nation.apply(lambda x: fill_recs(x.impressions), axis=1)
    #print('No prediction for #' + str(df_missed.shape[0]) + 'items')
    df_out_nation = df_test_nation[['user_id', 'session_id', 'timestamp','step', 'item_recommendations']]
    return df_out_nation

def list_to_space_string(l):
    """Return a space separated string from a list"""
    s = " ".join(l)
    return s

def fill_recs(imp):
    l = imp.split('|')
    return list_to_space_string(l)

In [79]:
def complete_prediction_recent(df):
    df['item_recommendations'] = df.apply(lambda x: concat_interactions(x), axis=1)
    df_out = df[['user_id', 'session_id', 'timestamp','step', 'item_recommendations']]
    return df_out

def concat_interactions(row):
    spl_int = row.all_interactions.split(' ')
    spl_imp = row.impressions.split('|')
    result = []
    for i in spl_int:
        result.append(str(i))
    for i in spl_imp:
        if(str(i) not in result):
            result.append(str(i))
    return " ".join(result)

In [81]:
df_predicted = complete_prediction_recent(df_predicted)
df_predicted

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,user_id,session_id,timestamp,step,item_recommendations
38,03P4VFKK12UO,325fafb5fa450,1541107538,55,65685 1306936 56482 2842358 6881276 63259 6539...
80,09L0Y03JYTAC,91de23da01b00,1541034697,34,8561 8564 8568 8586 8588 102080 8613 55677 570...
84,0O1CTEP95YKQ,af793494b5f84,1541113537,22,63051 2851635 69086 8533 925367 58949 61049 85...
87,0T819ODEIRA4,7643893620070,1541085062,7,5170834 162011 52319 9332768 6398694 1830581 5...
107,0X3OMZZH37ZS,c0cdade1ed014,1541085912,8,434856 1358429 1556793 3206344 49682 658821 10...
138,2WEN1MLMALJM,d89f82d2b2538,1541069999,9,3508086 4050606 1632525 8404976 9169816 194295...
143,3CB5DVB0CC0S,776a6ed9cf3b7,1541075304,2,8186812 1979173 1552971 2297072 10029684 19218...
157,3WK1Z5XPGA43,48125fbfd5e9e,1541086351,15,9049620 3382114 4581320 3971456 1249659 571469...
163,4GVU9AT4RVBW,b42f4cd99975e,1541110335,8,42161 1890831 33166 42268 42104 42068 16382 42...
281,5BR973YOX5QD,a24a3d2fb4543,1541081113,115,514881 1627077 513366 1496061 479176 1708467 8...


In [None]:
df_out = pd.concat([df_predicted, ], ignore_index=True, sort=False)
