In [2]:
import pandas as pd
import numpy as np
from lightfm import LightFM
from lightfm.evaluation import precision_at_k, auc_score
from scipy import sparse
from datetime import datetime
from catboost import CatBoostRegressor

In [3]:
als_train_u0_m0 = pd.read_csv('./ALS/ratings_train_with_actor_ALS_predictions_u_0_m_0.csv')
als_train_u0_m1 = pd.read_csv('./ALS/ratings_train_with_actor_ALS_predictions_u_0_m_1.csv')
als_train_u0_m2 = pd.read_csv('./ALS/ratings_train_with_actor_ALS_predictions_u_0_m_2.csv')
als_train_u1_m0 = pd.read_csv('./ALS/ratings_train_with_actor_ALS_predictions_u_1_m_0.csv')
als_train_u1_m1 = pd.read_csv('./ALS/ratings_train_with_actor_ALS_predictions_u_1_m_1.csv')
als_train_u1_m2 = pd.read_csv('./ALS/ratings_train_with_actor_ALS_predictions_u_1_m_2.csv')
als_train_u2_m0 = pd.read_csv('./ALS/ratings_train_with_actor_ALS_predictions_u_2_m_0.csv')
als_train_u2_m1 = pd.read_csv('./ALS/ratings_train_with_actor_ALS_predictions_u_2_m_1.csv')
als_train_u2_m2 = pd.read_csv('./ALS/ratings_train_with_actor_ALS_predictions_u_2_m_2.csv')

als_test = pd.read_csv('./ALS/ratings_dev_with_actor_ALS_predictions.csv')
als_final_test = pd.read_csv('./ALS/ratings_test_with_actor_ALS_predictions.csv')

In [4]:
als_train = pd.concat([als_train_u0_m0, als_train_u0_m1, als_train_u0_m2,
                       als_train_u1_m0, als_train_u1_m1, als_train_u1_m2,
                       als_train_u2_m0, als_train_u2_m1, als_train_u2_m2],
                      ignore_index=True)

In [5]:
print(als_train.shape)
print(als_test.shape)
print(als_train.head(2))
print(als_test.head(2))
print(als_final_test.shape)
print(als_final_test.head(2))

(9964326, 7)
(1168200, 7)
   userId  movieId   timestamp  rating  user_movie_ALS  user_genre_ALS  \
0   12186     1923   940188307     4.0        3.808532        3.607751   
1   33014     4749  1106468166     3.0        2.197552        3.060517   

   user_actor_ALS  
0        3.698722  
1        2.165514  
   userId  movieId   timestamp  rating  user_movie_ALS  user_genre_ALS  \
0   42239     1245  1329334058     5.0        4.510603        4.660654   
1   11201     5784  1300834741     4.0        2.953015        3.836220   

   user_actor_ALS  
0        4.581913  
1        3.343717  
(2615279, 6)
   userId  movieId   timestamp  user_movie_ALS  user_genre_ALS  user_actor_ALS
0  117987    59615  1216381557        3.569524        3.787850        3.930970
1   93630     1608   948080513        3.513056        3.661569        3.385922


In [6]:
def rmse_asymmetric(rating, prediction):
    bad_border = 4
    
    if rating < bad_border <= prediction:
        penalty_coefficient = 2.0
    else:
        penalty_coefficient = 1.0
    
    return penalty_coefficient*(rating - prediction)**2


def vector_rmse_asymmetric(df, true_col, pred_col, bad_border=4):
    data = np.array(df[[true_col, pred_col]].values)
    mask_1 = data[:, 0] < bad_border
    mask_2 = data[:, 1] >= bad_border
    penalty = data[mask_1 & mask_2]
    not_penalty = data[~mask_1 & mask_2]
    result = np.mean(np.concatenate((2 * (penalty[:, 0] - penalty[:, 1])  ** 2,
                                    (not_penalty[:, 0] - not_penalty[:, 1])  ** 2), 
                                    axis=None))
    return result

In [7]:
print(vector_rmse_asymmetric(als_train, 'rating', 'user_movie_ALS'))
print(vector_rmse_asymmetric(als_train, 'rating', 'user_genre_ALS'))
print(vector_rmse_asymmetric(als_train, 'rating', 'user_actor_ALS'))

  from ipykernel import kernelapp as app


0.6331189926247842
1.0729481938117227
0.7162128806061268


In [8]:
train_data = als_train[['user_movie_ALS', 'user_genre_ALS', 'user_actor_ALS']]
train_target = als_train['rating']

test_data = als_test[['user_movie_ALS', 'user_genre_ALS', 'user_actor_ALS']]
test_target = als_test['rating']

In [30]:
bad_border = 4
mask_1 = np.array(train_target.values) < bad_border
weights = np.ones(len(train_data))
weights[mask_1] = 2

In [31]:
weights

array([1., 2., 2., ..., 1., 1., 1.])

In [36]:
model = CatBoostRegressor(iterations=1000)

In [37]:
model.fit(train_data, train_target, verbose=True, sample_weight=weights)

0:	learn: 3.3032480	total: 931ms	remaining: 15m 30s
1:	learn: 3.2102149	total: 1.9s	remaining: 15m 49s
2:	learn: 3.1200715	total: 2.84s	remaining: 15m 44s
3:	learn: 3.0329479	total: 3.8s	remaining: 15m 45s
4:	learn: 2.9485340	total: 4.72s	remaining: 15m 39s
5:	learn: 2.8669138	total: 5.67s	remaining: 15m 40s
6:	learn: 2.7878849	total: 6.59s	remaining: 15m 35s
7:	learn: 2.7113708	total: 7.5s	remaining: 15m 29s
8:	learn: 2.6374130	total: 8.45s	remaining: 15m 30s
9:	learn: 2.5658725	total: 9.43s	remaining: 15m 33s
10:	learn: 2.4966011	total: 10.4s	remaining: 15m 33s
11:	learn: 2.4296346	total: 11.4s	remaining: 15m 37s
12:	learn: 2.3649595	total: 12.3s	remaining: 15m 34s
13:	learn: 2.3023200	total: 13.3s	remaining: 15m 35s
14:	learn: 2.2417700	total: 14.2s	remaining: 15m 31s
15:	learn: 2.1832717	total: 15.2s	remaining: 15m 35s
16:	learn: 2.1267316	total: 16.1s	remaining: 15m 33s
17:	learn: 2.0721445	total: 17.1s	remaining: 15m 33s
18:	learn: 2.0194496	total: 18.1s	remaining: 15m 32s
19:	le

154:	learn: 0.7941153	total: 2m 26s	remaining: 13m 17s
155:	learn: 0.7940756	total: 2m 27s	remaining: 13m 16s
156:	learn: 0.7940389	total: 2m 28s	remaining: 13m 15s
157:	learn: 0.7940029	total: 2m 29s	remaining: 13m 15s
158:	learn: 0.7939681	total: 2m 30s	remaining: 13m 14s
159:	learn: 0.7939366	total: 2m 31s	remaining: 13m 12s
160:	learn: 0.7939069	total: 2m 31s	remaining: 13m 12s
161:	learn: 0.7938773	total: 2m 32s	remaining: 13m 10s
162:	learn: 0.7938502	total: 2m 33s	remaining: 13m 9s
163:	learn: 0.7938234	total: 2m 34s	remaining: 13m 9s
164:	learn: 0.7937972	total: 2m 35s	remaining: 13m 8s
165:	learn: 0.7937735	total: 2m 36s	remaining: 13m 7s
166:	learn: 0.7937517	total: 2m 37s	remaining: 13m 6s
167:	learn: 0.7937314	total: 2m 38s	remaining: 13m 5s
168:	learn: 0.7937115	total: 2m 39s	remaining: 13m 6s
169:	learn: 0.7936907	total: 2m 40s	remaining: 13m 5s
170:	learn: 0.7936728	total: 2m 41s	remaining: 13m 4s
171:	learn: 0.7936547	total: 2m 42s	remaining: 13m 3s
172:	learn: 0.793638

305:	learn: 0.7931670	total: 5m 11s	remaining: 11m 45s
306:	learn: 0.7931665	total: 5m 12s	remaining: 11m 44s
307:	learn: 0.7931656	total: 5m 13s	remaining: 11m 43s
308:	learn: 0.7931649	total: 5m 14s	remaining: 11m 43s
309:	learn: 0.7931639	total: 5m 15s	remaining: 11m 42s
310:	learn: 0.7931627	total: 5m 16s	remaining: 11m 41s
311:	learn: 0.7931618	total: 5m 17s	remaining: 11m 40s
312:	learn: 0.7931610	total: 5m 18s	remaining: 11m 39s
313:	learn: 0.7931600	total: 5m 19s	remaining: 11m 38s
314:	learn: 0.7931590	total: 5m 20s	remaining: 11m 37s
315:	learn: 0.7931579	total: 5m 21s	remaining: 11m 36s
316:	learn: 0.7931570	total: 5m 22s	remaining: 11m 35s
317:	learn: 0.7931561	total: 5m 23s	remaining: 11m 34s
318:	learn: 0.7931554	total: 5m 24s	remaining: 11m 33s
319:	learn: 0.7931545	total: 5m 26s	remaining: 11m 32s
320:	learn: 0.7931535	total: 5m 27s	remaining: 11m 32s
321:	learn: 0.7931522	total: 5m 28s	remaining: 11m 31s
322:	learn: 0.7931513	total: 5m 29s	remaining: 11m 30s
323:	learn

456:	learn: 0.7930674	total: 8m 4s	remaining: 9m 35s
457:	learn: 0.7930670	total: 8m 5s	remaining: 9m 34s
458:	learn: 0.7930665	total: 8m 6s	remaining: 9m 33s
459:	learn: 0.7930661	total: 8m 7s	remaining: 9m 32s
460:	learn: 0.7930656	total: 8m 8s	remaining: 9m 31s
461:	learn: 0.7930652	total: 8m 9s	remaining: 9m 30s
462:	learn: 0.7930647	total: 8m 10s	remaining: 9m 29s
463:	learn: 0.7930644	total: 8m 12s	remaining: 9m 28s
464:	learn: 0.7930640	total: 8m 13s	remaining: 9m 27s
465:	learn: 0.7930637	total: 8m 14s	remaining: 9m 26s
466:	learn: 0.7930631	total: 8m 15s	remaining: 9m 25s
467:	learn: 0.7930627	total: 8m 16s	remaining: 9m 24s
468:	learn: 0.7930623	total: 8m 17s	remaining: 9m 23s
469:	learn: 0.7930619	total: 8m 18s	remaining: 9m 22s
470:	learn: 0.7930616	total: 8m 19s	remaining: 9m 21s
471:	learn: 0.7930611	total: 8m 21s	remaining: 9m 20s
472:	learn: 0.7930606	total: 8m 22s	remaining: 9m 19s
473:	learn: 0.7930602	total: 8m 23s	remaining: 9m 18s
474:	learn: 0.7930597	total: 8m 24

608:	learn: 0.7930139	total: 10m 55s	remaining: 7m
609:	learn: 0.7930137	total: 10m 56s	remaining: 6m 59s
610:	learn: 0.7930133	total: 10m 57s	remaining: 6m 58s
611:	learn: 0.7930129	total: 10m 58s	remaining: 6m 57s
612:	learn: 0.7930126	total: 11m	remaining: 6m 56s
613:	learn: 0.7930123	total: 11m 1s	remaining: 6m 55s
614:	learn: 0.7930119	total: 11m 2s	remaining: 6m 54s
615:	learn: 0.7930116	total: 11m 3s	remaining: 6m 53s
616:	learn: 0.7930113	total: 11m 5s	remaining: 6m 52s
617:	learn: 0.7930111	total: 11m 6s	remaining: 6m 51s
618:	learn: 0.7930108	total: 11m 7s	remaining: 6m 50s
619:	learn: 0.7930106	total: 11m 8s	remaining: 6m 49s
620:	learn: 0.7930103	total: 11m 9s	remaining: 6m 48s
621:	learn: 0.7930100	total: 11m 10s	remaining: 6m 47s
622:	learn: 0.7930096	total: 11m 11s	remaining: 6m 46s
623:	learn: 0.7930092	total: 11m 13s	remaining: 6m 45s
624:	learn: 0.7930090	total: 11m 14s	remaining: 6m 44s
625:	learn: 0.7930087	total: 11m 15s	remaining: 6m 43s
626:	learn: 0.7930083	tota

759:	learn: 0.7929739	total: 13m 44s	remaining: 4m 20s
760:	learn: 0.7929736	total: 13m 45s	remaining: 4m 19s
761:	learn: 0.7929734	total: 13m 47s	remaining: 4m 18s
762:	learn: 0.7929732	total: 13m 48s	remaining: 4m 17s
763:	learn: 0.7929730	total: 13m 49s	remaining: 4m 16s
764:	learn: 0.7929728	total: 13m 50s	remaining: 4m 15s
765:	learn: 0.7929725	total: 13m 52s	remaining: 4m 14s
766:	learn: 0.7929722	total: 13m 53s	remaining: 4m 13s
767:	learn: 0.7929719	total: 13m 54s	remaining: 4m 12s
768:	learn: 0.7929717	total: 13m 55s	remaining: 4m 11s
769:	learn: 0.7929714	total: 13m 56s	remaining: 4m 9s
770:	learn: 0.7929712	total: 13m 58s	remaining: 4m 8s
771:	learn: 0.7929709	total: 13m 59s	remaining: 4m 7s
772:	learn: 0.7929707	total: 14m	remaining: 4m 6s
773:	learn: 0.7929705	total: 14m 1s	remaining: 4m 5s
774:	learn: 0.7929703	total: 14m 3s	remaining: 4m 4s
775:	learn: 0.7929700	total: 14m 4s	remaining: 4m 3s
776:	learn: 0.7929698	total: 14m 5s	remaining: 4m 2s
777:	learn: 0.7929696	tota

910:	learn: 0.7929445	total: 16m 50s	remaining: 1m 38s
911:	learn: 0.7929443	total: 16m 51s	remaining: 1m 37s
912:	learn: 0.7929441	total: 16m 52s	remaining: 1m 36s
913:	learn: 0.7929439	total: 16m 54s	remaining: 1m 35s
914:	learn: 0.7929438	total: 16m 55s	remaining: 1m 34s
915:	learn: 0.7929436	total: 16m 56s	remaining: 1m 33s
916:	learn: 0.7929435	total: 16m 58s	remaining: 1m 32s
917:	learn: 0.7929432	total: 16m 59s	remaining: 1m 31s
918:	learn: 0.7929431	total: 17m	remaining: 1m 29s
919:	learn: 0.7929430	total: 17m 1s	remaining: 1m 28s
920:	learn: 0.7929429	total: 17m 2s	remaining: 1m 27s
921:	learn: 0.7929426	total: 17m 4s	remaining: 1m 26s
922:	learn: 0.7929424	total: 17m 5s	remaining: 1m 25s
923:	learn: 0.7929423	total: 17m 6s	remaining: 1m 24s
924:	learn: 0.7929421	total: 17m 8s	remaining: 1m 23s
925:	learn: 0.7929419	total: 17m 9s	remaining: 1m 22s
926:	learn: 0.7929418	total: 17m 10s	remaining: 1m 21s
927:	learn: 0.7929416	total: 17m 11s	remaining: 1m 20s
928:	learn: 0.7929415

<catboost.core.CatBoostRegressor at 0x183817e05c0>

In [38]:
catboost_predict = model.predict(test_data)

In [39]:
als_test['catboost_predict'] = catboost_predict

In [40]:
print(vector_rmse_asymmetric(als_test, 'rating', 'catboost_predict'))

0.6194681123003011


In [57]:
als_test['mean_predict'] = np.mean(test_data.values, axis=1)

In [59]:
print(vector_rmse_asymmetric(als_test, 'rating', 'mean_predict'))

0.6468945556568858


  from ipykernel import kernelapp as app


In [60]:
als_test['median_predict'] = np.median(test_data.values, axis=1)

  r = func(a, **kwargs)


In [61]:
print(vector_rmse_asymmetric(als_test, 'rating', 'median_predict'))

0.6599487127769957


  from ipykernel import kernelapp as app


In [62]:
final_test_results = model.predict(als_final_test[['user_movie_ALS', 'user_genre_ALS', 'user_actor_ALS']])

In [63]:
als_final_test['prediction'] = final_test_results

In [64]:
als_final_test.to_csv('./prediction.csv')

In [66]:
import pickle

In [76]:
with open('./saved_catboost_model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [77]:
model.feature_importances_

array([95.07619207,  1.04074984,  3.88305809])