In [85]:
import pandas as pd
import numpy as np
import os
from surprise import Dataset, Reader, SVD, accuracy, Reader,SVDpp
from surprise.model_selection import cross_validate,GridSearchCV
import datetime

In [86]:
# Load the three competition tables (paths are relative to the notebook's
# working directory): user/challenge interactions for train and test, plus
# the challenge metadata table.
train, challenge, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        './Data/train.csv',
        './Data/challenge_data.csv',
        './Data/test.csv',
    )
)

In [87]:
# Preview the last rows of the training interactions.
train.tail()

Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge
903911,113839_9,113839,9,CI24992
903912,113839_10,113839,10,CI28586
903913,113839_11,113839,11,CI28610
903914,113839_12,113839,12,CI26388
903915,113839_13,113839,13,CI28630


In [88]:
from category_encoders.hashing import HashingEncoder

# --- Feature engineering on the challenge metadata table --------------------

# Missing submission counts are imputed with the median of the observed ones.
challenge['total_submissions'] = challenge['total_submissions'].fillna(
    challenge['total_submissions'].median()
)

# One-hot encode the low-cardinality categoricals; the first level of each is
# dropped and the source column is removed once its dummies are attached.
cols_to_one_hot = ['programming_language', 'author_gender']

for c in cols_to_one_hot:
    dummies = pd.get_dummies(challenge[c], prefix=c, drop_first=True)
    challenge = pd.concat([challenge, dummies], axis=1).drop(c, axis=1)

# Missing series ids are replaced by a fixed id. NaN never compares equal to
# anything (including itself), so the gap has to be detected with
# fillna()/isna() rather than `== np.nan`.
# NOTE(review): 'SI2652' is presumably the most common series - confirm.
challenge['challenge_series_ID'] = challenge['challenge_series_ID'].fillna('SI2652')

# Age of each challenge in whole days. "Now" is taken once so every row is
# measured against the same instant; the value still depends on the run date.
challenge['publish_date'] = pd.to_datetime(challenge['publish_date'])
challenge['how_old'] = (
    pd.Timestamp(datetime.datetime.today()) - challenge['publish_date']
).dt.days

challenge = challenge.drop(
    ['publish_date', 'author_ID', 'author_org_ID', 'category_id'], axis=1
)

# Hash the series id into 32 columns.
challenge = HashingEncoder(cols=['challenge_series_ID'], n_components=32).fit_transform(challenge)

In [5]:
# Keep only the (user, item, rating-like) columns. The explicit copy makes
# `test` an independent frame, so the later in-place assignment to
# test['challenge_sequence'] does not write into a slice of the loaded table.
test = test[['user_id', 'challenge', 'challenge_sequence']].copy()

In [6]:
# Reverse the 1..13 sequence positions (1 -> 13, 2 -> 12, ..., 13 -> 1).
recode = {position: 14 - position for position in range(1, 14)}


In [7]:
# Replace each sequence position by its reversed value from `recode`;
# positions that are not keys of `recode` become NaN.
train = train.assign(challenge_sequence=train['challenge_sequence'].map(recode))

In [8]:
# Wrap the train interactions as a Surprise dataset: the columns are passed in
# (user, item, rating) order, with the recoded sequence used as the rating on
# a 1..13 scale.
reader = Reader(rating_scale=(1, 13))
data = Dataset.load_from_df(
    train.loc[:, ['user_id', 'challenge', 'challenge_sequence']],
    reader,
)

In [None]:
# Disabled hyper-parameter search for SVD++: the code below is wrapped in a
# string literal, so none of it is executed when the cell runs.
'''param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}
gs = GridSearchCV(SVDpp, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])'''

In [34]:
# Train SVD++ on every train interaction (no hold-out split is made here).
full_trainset = data.build_full_trainset()

algo = SVDpp(reg_all=0.4, lr_all=0.005, n_epochs=10)
algo.fit(full_trainset)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x7f121fb5ffd0>

In [38]:
# User, item vecs extracted from model

# Rows of algo.pu / algo.qi are addressed by Surprise's *inner* ids. The
# fitted trainset is asked for the inner -> raw translation instead of
# assuming that inner ids follow the order of first appearance in `train`.
fitted_trainset = algo.trainset

id2user = {
    inner_uid: fitted_trainset.to_raw_uid(inner_uid)
    for inner_uid in fitted_trainset.all_users()
}
id2challenge = {
    inner_iid: fitted_trainset.to_raw_iid(inner_iid)
    for inner_iid in fitted_trainset.all_items()
}

# Latent factor vectors keyed by raw user id / raw challenge id.
# (Comprehensions keep the loop variable local, so the builtin `id` is not
# overwritten for the rest of the notebook.)
user_vecs = {id2user[inner_uid]: vec for inner_uid, vec in enumerate(algo.pu)}
item_vecs = {id2challenge[inner_iid]: vec for inner_iid, vec in enumerate(algo.qi)}

In [40]:
# One column per user / challenge, one row per latent factor.
userdf, itemdf = (pd.DataFrame(vectors) for vectors in (user_vecs, item_vecs))
print(userdf.shape, itemdf.shape)

(20, 69532) (20, 5348)


In [13]:
# Scores are the dot products of the user and item latent vectors:
# (users x factors) @ (factors x items) -> (users x items).
scores = np.array(userdf).T @ np.array(itemdf)

In [14]:
# Label the score matrix: one column per challenge id, then append the raw
# user id of each row as a trailing 'user_id' column.
scoredf = pd.DataFrame(scores, columns=itemdf.columns.tolist())
scoredf['user_id'] = userdf.columns.tolist()

In [15]:
# Drop the intermediate dicts, the raw score matrix and the fitted model from
# the namespace; the DataFrames built from them (userdf, itemdf, scoredf) stay.
del user_vecs,item_vecs,scores,algo

In [66]:
# Preview the score table: one column per challenge id plus the trailing
# 'user_id' column.
scoredf.head()

Unnamed: 0,CI23714,CI23855,CI24917,CI23663,CI23933,CI25135,CI23975,CI25126,CI24915,CI24957,...,CI28672,CI26373,CI28667,CI28664,CI25931,CI28657,CI28611,CI25967,CI27413,user_id
0,-0.154752,-0.190409,-0.312791,-0.480181,-0.236941,-0.106825,-0.466419,-0.003752,-0.141227,-0.00198,...,-0.011542,0.04964,0.060211,0.004813,0.004069,0.011366,-0.112581,-0.001927,0.026454,4576
1,-0.208036,-0.215908,-0.279098,-0.12804,-0.12984,-0.182648,-0.244329,-0.216447,-0.177333,-0.007434,...,0.07888,-0.001055,0.016805,-0.03485,0.013813,-0.007991,-0.018552,-0.068079,0.030854,4580
2,1.277583,0.850359,1.152478,0.878967,0.75591,0.823475,0.751756,0.685226,0.782328,-0.122566,...,-0.016654,-0.01984,0.045139,0.049607,-0.096285,0.116673,0.008976,0.215022,0.013566,4581
3,-0.029228,0.004789,-0.001619,0.064269,0.046579,-0.087337,0.05673,-0.130914,-0.007173,-0.11695,...,-0.046073,-0.023402,-0.002217,-0.021489,-0.034333,0.021543,-0.047488,0.028377,-0.000671,4582
4,-0.190503,-0.163927,-0.203107,-0.155737,-0.234652,-0.128566,-0.252554,-0.148412,-0.197548,-0.06476,...,0.04204,0.010675,-0.030556,0.033043,-0.013079,-0.037954,-0.117103,0.008401,0.034584,4585


In [16]:
# Fit a second, independent SVD++ model on the test interactions.
# NOTE(review): `reader` and `recode` are rebound here with the 1..10 variants,
# and the mapping below is applied to `test` in place, so re-running this cell
# (or the earlier train recode cell afterwards) changes the result.
# NOTE(review): this model is trained separately from the train-set model, so
# its latent factors are presumably not in the same space - confirm that
# comparing vectors across the two models is intended.
reader = Reader(rating_scale=(1, 10))
recode = {position: 11 - position for position in range(1, 11)}

test['challenge_sequence'] = test['challenge_sequence'].map(recode)

testdata = Dataset.load_from_df(
    test.loc[:, ['user_id', 'challenge', 'challenge_sequence']],
    reader,
)

algo = SVDpp(reg_all=0.4, lr_all=0.005, n_epochs=10)
algo.fit(testdata.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x7f12fc8689e8>

In [17]:
# Inner id -> raw id maps for the model fitted on the test interactions,
# taken from that model's own trainset.
test_trainset = algo.trainset

testid2user = {
    inner_uid: test_trainset.to_raw_uid(inner_uid)
    for inner_uid in test_trainset.all_users()
}
testid2challenge = {
    inner_iid: test_trainset.to_raw_iid(inner_iid)
    for inner_iid in test_trainset.all_items()
}

# Latent vectors keyed by raw *test* ids. The test maps must be used here:
# the train maps (id2user / id2challenge) number a different set of users and
# challenges and would attach the wrong labels to these vectors.
user_vecs = {testid2user[inner_uid]: vec for inner_uid, vec in enumerate(algo.pu)}
item_vecs = {testid2challenge[inner_iid]: vec for inner_iid, vec in enumerate(algo.qi)}

# One column per test user / challenge, one row per latent factor.
testuserdf = pd.DataFrame(user_vecs)
testitemdf = pd.DataFrame(item_vecs)

del user_vecs, item_vecs, algo

In [42]:
# Nearest-neighbour search: for every test user we look up the closest train
# user by L2 distance between latent vectors.

import faiss

n_gpu = 1
print('Number of available GPUs: %d    Using: %d' % (faiss.get_num_gpus(), n_gpu))

# Cloner options used when the CPU index is moved onto the GPU(s).
co = faiss.GpuMultipleClonerOptions()
co.shard = True

# Exact (flat) L2 index; userdf has one row per latent factor, so its row
# count is the vector dimension.
cpu_index = faiss.IndexFlatL2(userdf.shape[0])
gpu_index = faiss.index_cpu_to_all_gpus(cpu_index, co=co, ngpu=n_gpu)

Number of available GPUs: 1    Using: 1


In [44]:
# Cast both vector tables to single precision before they are handed to faiss.
userdf, testuserdf = (frame.astype('float32') for frame in (userdf, testuserdf))

In [45]:
import time

print('Adding dataset to index...')

# Empty the index first so that re-running this cell does not append the same
# vectors a second time (which would make search return positions that have
# no entry in id2user).
gpu_index.reset()

t0 = time.time()

# One row per train user, stored as contiguous float32.
gpu_index.add(np.ascontiguousarray(np.transpose(np.array(userdf)), dtype=np.float32))

elapsed = time.time() - t0
print('Building index took %.2f seconds' % (elapsed))

Adding dataset to index...
Building index took 0.02 seconds


In [48]:
def find_testuser_trainfriends(dat):
    """Return, for each row of ``dat``, the position of its nearest vector in ``gpu_index``.

    Parameters
    ----------
    dat : 2-D array-like, one query vector per row. The row length has to
        match the dimension ``gpu_index`` was created with.

    Returns
    -------
    list
        One index position per query row, in query order. Positions refer to
        the order in which vectors were added to ``gpu_index``. An empty
        ``dat`` gives an empty list.
    """
    # The query width comes from `dat` itself rather than from a notebook
    # global, and all rows are searched in a single batched call.
    queries = np.ascontiguousarray(dat, dtype=np.float32)
    if queries.shape[0] == 0:
        return []
    _, nearest = gpu_index.search(queries, k=1)
    return list(nearest[:, 0])

def find_testuser_testfriends(dat):
    """Return, for each row of ``dat``, the position of its nearest vector in ``gpu_index``.

    NOTE(review): despite the name, this searches the same module-level
    ``gpu_index`` as ``find_testuser_trainfriends``; no separate index is
    used here. Confirm whether a dedicated test-user index was intended.

    Parameters
    ----------
    dat : 2-D array-like, one query vector per row. The row length has to
        match the dimension ``gpu_index`` was created with.

    Returns
    -------
    list
        One index position per query row, in query order. An empty ``dat``
        gives an empty list.
    """
    # The query width comes from `dat` itself rather than from a notebook
    # global, and all rows are searched in a single batched call.
    queries = np.ascontiguousarray(dat, dtype=np.float32)
    if queries.shape[0] == 0:
        return []
    _, nearest = gpu_index.search(queries, k=1)
    return list(nearest[:, 0])

In [50]:
# Pair every test user with its nearest train user; recommendations are later
# drawn from that neighbour's scores.

test_train_friends = find_testuser_trainfriends(np.array(testuserdf).T)

# Query position -> raw test user id, index position -> raw train user id.
test2trainfriends = {
    testid2user[test_pos]: id2user[train_pos]
    for test_pos, train_pos in enumerate(test_train_friends)
}

In [52]:
# Making final predictions: for each test user, take the three best-scored
# challenges of the nearest train user that the test user has not already
# attempted.

challenge_cols = scoredf.columns.drop('user_id')

# Row label of each train user in scoredf, looked up once instead of scanning
# the whole 'user_id' column for every test user.
row_of_user = dict(zip(scoredf['user_id'], scoredf.index))

# Challenges each test user already has in `test`, as sets for O(1) lookups.
# NOTE(review): filtering is per test user (their own history); confirm that
# this, rather than excluding every challenge seen anywhere in `test`, is the
# intended rule.
seen_by_user = {}
for uid, challenge_id in zip(test['user_id'], test['challenge']):
    seen_by_user.setdefault(uid, set()).add(challenge_id)

userscores = {}

for te_user, tr_user in test2trainfriends.items():
    # Numeric scores of the neighbour's row (the frame itself must not be
    # iterated: iterating a DataFrame yields its column names).
    row_scores = np.asarray(scoredf.loc[row_of_user[tr_user], challenge_cols], dtype=float)
    ranked = challenge_cols[np.argsort(row_scores)[::-1]]  # best score first

    seen = seen_by_user.get(te_user, set())
    challenge_list = []
    for candidate in ranked:
        if candidate in seen:
            continue
        challenge_list.append(candidate)
        if len(challenge_list) == 3:
            break
    userscores[te_user] = challenge_list

KeyboardInterrupt: 

In [None]:
# Long-format result: one row per (user, recommendation). Rows are built
# pairwise so both columns always have the same length, even when a user has
# fewer than three recommendations.
# NOTE(review): the suffix is the recommendation rank (1, 2, 3); confirm this
# matches the sequence numbers expected in the submission file.
resdf = pd.DataFrame(
    [
        (str(user) + '_' + str(rank), challenge_id)
        for user, picks in userscores.items()
        for rank, challenge_id in enumerate(picks, start=1)
    ],
    columns=['user_sequence', 'challenge'],
)

resdf.head()