In [14]:
%pip install "scikit-surprise==1.1.3"

Note: you may need to restart the kernel to use updated packages.


In [15]:
import numpy as np
import pandas as pd
from collections import defaultdict
from surprise import Dataset, Reader, KNNWithMeans, accuracy
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split
from sklearn.model_selection import train_test_split as tts
from surprise.model_selection import KFold
from surprise import SVD, NMF
from surprise import KNNBaseline

np.random.seed(42)

In [16]:
df = pd.read_csv('train_joke_df.csv')
df = df[df["Rating"] != 99.0] 


df.head(5)

Unnamed: 0,UID,JID,Rating
0,18029,6,-1.26
1,3298,64,-4.17
2,3366,58,0.92
3,12735,92,3.69
4,11365,38,-6.6


In [17]:
# сделаем сортировку и перепишем index
df = df.sort_values(by=['UID', 'JID'])
df = df.reset_index(drop=True)

In [18]:
# создадим на основе набора данных
# поднабор, который требуется для библиотеки Surprise

# указываем минимальный и максимальный рейтинги
reader = Reader(rating_scale=(-10, 10))

# передаём набор, указывая последовательность колонок: user (raw) ids, item (raw) ids, ratings
# для Surprise - это обязательно
data = Dataset.load_from_df(df[['UID', 'JID', 'Rating']], reader)

In [84]:
trainset_data = data.build_full_trainset()

# сделаем разделение на обучающую и тестовую выборку
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [21]:
# определим набор данных для GridSearchCV
params = {'n_epochs': [120], 'lr_all': [0.0005], 'reg_all': [0.23]}       

grid_search = GridSearchCV(SVD, params, measures=['RMSE'])    
grid_search.fit(data)  

In [22]:
algo = grid_search.best_estimator['rmse']
algo.fit(trainset)    
predictions = algo.test(testset)
accuracy.rmse(predictions)

RMSE: 4.0382


4.038207909282675

In [23]:
def get_top_n(predictions, n=5):
    """Определят Топ-N рекомендаций

    Args:
        predictions(list of Prediction objects): Списко рекомендаций, из алгоритма Surprise
        n(int): Кол-во топ рекомендаций

    Returns:
        Словарь пользователь - список рекомендакиций для пользователей
        [(raw item id, rating estimation), ...]
    """

    # Предикт для каждого пользователя
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Сортировка предикта (по пользователям)
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n 

In [111]:
def create_ans_df():
    testset = pd.read_csv('test_joke_df_nofactrating.csv', index_col=0)
    predictions = test[['UID', 'JID']].apply(lambda x: algo.predict(x[0], x[1], verbose=False),axis = 1)
    top_n = get_top_n(predictions, n=10)

    arr_ans = []
    df_test = pd.DataFrame(columns=['UID', 'ANS'])
    for uid, user_ratings in top_n.items():
        recs = [iid for (iid, _) in user_ratings]
        line = list([[recs[0]], recs])
        ans = pd.DataFrame({'UID': uid, 'ANS': [line]})
        df_test = pd.concat([df_test, ans])
        arr_ans.append(line)


    return df_test

In [112]:
ans_df = create_ans_df()

In [113]:
ans_df.head(5)

Unnamed: 0,UID,ANS
0,11228,"[[29], [29, 49, 35, 100, 66, 5, 56, 39, 97, 59]]"
0,21724,"[[50], [50, 35, 69, 49, 44, 56, 60, 47, 8, 7]]"
0,16782,"[[15], [15, 8, 31, 100, 62, 42, 56, 40, 66, 45]]"
0,12105,"[[66], [66, 48, 42, 34, 15, 13]]"
0,14427,"[[30], [30, 28, 70, 43, 45, 2, 64, 26, 44, 54]]"


In [119]:
test_df = ans_df
test_df.reset_index(drop=True, inplace=True)
test_df['ANS'] = test_df['ANS'].astype('object')
test_df.to_csv('baseline_2_part.csv')

In [86]:
test = pd.read_csv('test_joke_df_nofactrating.csv', index_col=0)
test.head(5)

Unnamed: 0_level_0,UID,JID
InteractionID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,11228,39
1,21724,85
2,16782,56
3,12105,42
4,14427,2


In [87]:
test['Rating'] = test[['UID', 'JID']].apply(lambda x: algo.predict(x[0], x[1], verbose=False).est,
                                                      axis = 1)
                                                      


In [88]:
# формирование файла для отправки в Kaggle
test['Rating'].to_frame().to_csv('baseline.csv')