**Описание данных**

Файл `train_joke_df.csv` содержит:
- UID - id пользователей
- JID - id шуток, которые 
- Rating - рейтинг шутки, который проставил пользователь 


Рейтинг имеет значение от -10.00 до 10.00. Могут встречаться значения 99.00, но это обозначает Null (нет рейтинга от пользователя).

Метрика для оценки [RMSE](https://www.codecamp.ru/blog/how-to-interpret-rmse/)

Минимальный RMSE: `4.2238`



In [5]:
%pip install "scikit-surprise==1.1.3"
%pip install "xlrd==2.0.1"

Collecting scikit-surprise==1.1.3
  Using cached scikit-surprise-1.1.3.tar.gz (771 kB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py): started
  Building wheel for scikit-surprise (setup.py): finished with status 'done'
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp39-cp39-win_amd64.whl size=1142334 sha256=edeac306569a63fffe6e7354575b03b60bd6a29a01e89362b0c414ed208059d3
  Stored in directory: c:\users\sasha\appdata\local\pip\cache\wheels\c6\3a\46\9b17b3512bdf283c6cb84f59929cdd5199d4e754d596d22784
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### Import

In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
from surprise import Dataset, Reader, KNNWithMeans, accuracy, SVDpp
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split
from sklearn.model_selection import train_test_split as tts
from surprise.model_selection import KFold

np.random.seed(42)

### Базовые функции для скоринга и получения рекомендаций

In [2]:
def get_num_user_ratings(uid):
    try:
        return len(trainset.ur[trainset.to_inner_uid(uid)])
    except ValueError: # пользователя не было во время обучения (новый, отправить на стартовые рекомендации)
        return 0
    
def get_num_item_ratings(iid):
    try: 
        return len(trainset.ir[trainset.to_inner_iid(iid)])
    except ValueError:
        return 0
    
def get_top_n(predictions, n=5):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n    

### Загрузка и обработка данных

In [3]:
df = pd.read_csv('train_joke_df.csv')

df.head(5)

Unnamed: 0,UID,JID,Rating
0,18029,6,-1.26
1,3298,64,-4.17
2,3366,58,0.92
3,12735,92,3.69
4,11365,38,-6.6


In [4]:
df = df.sort_values(by=['UID', 'JID'])
df = df.reset_index(drop=True)

In [5]:
reader = Reader(rating_scale=(-10, 10))
data = Dataset.load_from_df(df[['UID', 'JID', 'Rating']], reader)

In [6]:
trainset_data = data.build_full_trainset()
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

### Обучение модели

In [8]:
sim_options = {
    "name": ["msd", "cosine"], # способы оценки похожести (в GridSearch)
    "min_support": [1, 2],     # минимальное кол-во общих пользоватлей с данной шуткой
    "user_based": [False],     # поиск "похожести" будет на основе шуток, а не пользователей
}

param_grid = {"sim_options": sim_options}

gs = GridSearchCV(KNNBaseline, param_grid, measures=["rmse", "mae"], cv=2)
gs.fit(data)
     
# результат
print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x22c4f586490>

In [21]:
algo = gs.best_estimator['rmse']
algo.fit(trainset)
predictions = algo.test(testset)
accuracy.rmse(predictions)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 3.1658


3.1658331408404443

In [9]:
predictions = svd.test(testset)
accuracy.rmse(predictions)

RMSE: 4.8093


4.809291632393172

### Тестирование и результаты

In [22]:
uid = 1  
iid = 1
pred = algo.predict(uid, iid, r_ui=-7.82, verbose=True)

user: 1          item: 1          r_ui = -7.82   est = -3.05   {'actual_k': 40, 'was_impossible': False}


In [23]:
uid = 24983
iid = 62
pred = algo.predict(uid, iid, r_ui=-0.29, verbose=True)

user: 24983      item: 62         r_ui = -0.29   est = 5.22   {'actual_k': 40, 'was_impossible': False}


### Обзор рекомендаций

In [24]:
trainset = algo.trainset

predictions_df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])

predictions_df['№ кол-во пользовательских рейтингов'] = predictions_df.uid.apply(get_num_user_ratings)
predictions_df['№ кол-во рейтингов элементов'] = predictions_df.iid.apply(get_num_item_ratings)
predictions_df['error'] = abs(predictions_df.est - predictions_df.rui)

best_predictions = predictions_df.sort_values(by='error')[:10]
worst_predictions = predictions_df.sort_values(by='error')[-10:]

In [25]:
best_predictions.head(5)

Unnamed: 0,uid,iid,rui,est,details,№ кол-во пользовательских рейтингов,№ кол-во рейтингов элементов,error
1047740,11935,64,0.881123,0.881124,"{'actual_k': 40, 'was_impossible': False}",42,11146,1e-06
561033,23286,90,0.881123,0.881118,"{'actual_k': 40, 'was_impossible': False}",64,6204,6e-06
720711,4287,59,0.881123,0.881131,"{'actual_k': 40, 'was_impossible': False}",64,11579,8e-06
427699,8140,73,0.881123,0.881134,"{'actual_k': 40, 'was_impossible': False}",55,5458,1e-05
904360,22225,57,0.881123,0.881135,"{'actual_k': 25, 'was_impossible': False}",38,10208,1.1e-05


In [26]:
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

top_n = get_top_n(predictions)

a=0
for uid, user_ratings in top_n.items():
    a+=1
    print(uid, [iid for (iid, _) in user_ratings])
    
    if a==10:
        break

19208 [89, 29, 62, 54, 35]
8671 [32, 27, 49, 68, 72]
6037 [89, 36, 27, 49, 72]
3233 [50, 36, 27, 31, 48]
3449 [50, 89, 53, 49, 72]
10032 [32, 35, 29, 53, 49]
5774 [89, 32, 29, 68, 69]
23392 [89, 36, 62, 35, 29]
3039 [89, 36, 62, 27, 53]
17395 [50, 89, 27, 35, 29]


### Для отправки на тестирование

In [10]:
test = pd.read_csv('test_joke_df_nofactrating.csv', index_col=0)
test.head(5)

Unnamed: 0_level_0,UID,JID
InteractionID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,11228,39
1,21724,85
2,16782,56
3,12105,42
4,14427,2


In [11]:
test['Rating'] = test[['UID', 'JID']].apply(lambda x: svd.predict(x[0], x[1], verbose=False).est,
                                                      axis = 1)

In [12]:
test['Rating'].to_frame().head(5)

Unnamed: 0_level_0,Rating
InteractionID,Unnamed: 1_level_1
0,4.222388
1,-8.12776
2,-1.41854
3,6.361503
4,0.560133


In [13]:
test['Rating'].to_frame().to_csv('submission1.csv')