# Initialization

In [1]:
try:
    # Only this import decides whether we are on Colab. Keeping the rest of
    # the Colab setup out of the `try` means a failed Drive mount or install
    # raises instead of being mistaken for "running locally".
    from google.colab import drive, runtime
except ImportError:
    # Executing locally: google.colab is not installed.
    using_colab = False
    root_dir = 'D:/OpenClassrooms/projet_9'
else:
    # Executing in Colab: mount Google Drive, where the project lives.
    drive.mount('/content/gdrive', force_remount=True)
    using_colab = True
    root_dir = '/content/gdrive/MyDrive/oc_projet_9'

    # Plain-Python spelling of `%pip install scikit-surprise`, so the package
    # is installed into the environment of the running kernel.
    # The saved output of this cell shows scikit-surprise 1.1.3 being installed.
    get_ipython().run_line_magic('pip', 'install scikit-surprise')

Mounted at /content/gdrive
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 KB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp38-cp38-linux_x86_64.whl size=3366452 sha256=6d0e8fcd41a3cc4a7bf1678263fdf6542070700f432847fbd6241b1542b7224a
  Stored in directory: /root/.cache/pip/wheels/af/db/86/2c18183a80ba05da35bf0fb7417aac5cddbd93bcb1b92fd3ea
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


In [2]:
import os

import numpy as np
import pandas as pd

if using_colab:
    import gdrive.MyDrive.oc_projet_9.scripts.project_9_functions as pf
else:
    import project_9_functions as pf

from IPython.display import clear_output

In [3]:
if using_colab:
    # colab paths
    clicks_sample_path = 'clicks_sample.csv'
    articles_metadata_path = 'articles_metadata.csv'
    articles_embeddings_path = 'articles_embeddings.pickle'
    clicks_dir_path = 'clicks/'
    clicks_file = 'clicks_hour_'

    zip_path = root_dir + '/data/' + 'news-portal-user-interactions-by-globocom.zip'
    zip_clicks_path = 'clicks.zip'
    path_list = [clicks_sample_path, articles_metadata_path, articles_embeddings_path, clicks_dir_path]

    already_unziped = True
    for path in path_list:
        if not os.path.exists(path):
            print(path)
            already_unziped = False

    if already_unziped:
        print('data already unziped')
    else:
        !unzip $zip_path
        !unzip $zip_clicks_path
        clear_output()


else:
    # local paths
    clicks_sample_path = root_dir + '/data/' + 'clicks_sample.csv'
    articles_metadata_path = root_dir + '/data/' + 'articles_metadata.csv'
    articles_embeddings_path = root_dir + '/data/' + 'articles_embeddings.pickle'
    clicks_dir_path = root_dir + '/data/clicks/'
    clicks_file = 'clicks_hour_'

In [4]:
# Quick-run switch. When True, the embeddings cell keeps only the first
# `n_samples` rows; the flag is also forwarded to pf.get_clicks_df (its exact
# effect there is defined in project_9_functions, not in this notebook).
test_mode = False

### articles metadata

In [5]:
# Read the article metadata CSV into a DataFrame; the bare name on the last
# line renders the frame as the cell output.
articles_metadata = pd.read_csv(articles_metadata_path)
articles_metadata

Unnamed: 0,article_id,category_id,created_at_ts,publisher_id,words_count
0,0,0,1513144419000,0,168
1,1,1,1405341936000,0,189
2,2,1,1408667706000,0,250
3,3,1,1408468313000,0,230
4,4,1,1407071171000,0,162
...,...,...,...,...,...
364042,364042,460,1434034118000,0,144
364043,364043,460,1434148472000,0,463
364044,364044,460,1457974279000,0,177
364045,364045,460,1515964737000,0,126


### clicks

In [6]:
# Build the clicks table with the project helper. It receives test_mode plus
# both the sample CSV path and the hourly-files directory; presumably
# test_mode selects which of the two is read - confirm in project_9_functions.
# NOTE(review): the saved output of this cell looks sample-sized while later
# cells report ~3M merged rows - re-run top to bottom before sharing.
clicks = pf.get_clicks_df(test_mode, clicks_sample_path, clicks_dir_path)
clicks

Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
0,0,1506825423271737,1506825423000,2,157541,1506826828020,4,3,20,1,20,2
1,0,1506825423271737,1506825423000,2,68866,1506826858020,4,3,20,1,20,2
2,1,1506825426267738,1506825426000,2,235840,1506827017951,4,1,17,1,16,2
3,1,1506825426267738,1506825426000,2,96663,1506827047951,4,1,17,1,16,2
4,2,1506825435299739,1506825435000,2,119592,1506827090575,4,1,17,1,24,2
...,...,...,...,...,...,...,...,...,...,...,...,...
2564,10051,1508211372158328,1508211372000,2,84911,1508211557302,4,3,2,1,25,1
2565,322896,1508211376302329,1508211376000,2,30760,1508211672520,4,1,17,1,25,2
2566,322896,1508211376302329,1508211376000,2,157507,1508211702520,4,1,17,1,25,2
2567,123718,1508211379189330,1508211379000,2,234481,1508211513583,4,3,2,1,25,2


### articles embeddings

In [7]:
# Number of embedding rows kept when test_mode is on.
n_samples = 1000

# NOTE(review): allow_pickle=True lets np.load unpickle the file, which can
# execute arbitrary code - only load embeddings from a trusted source.
articles_embeddings = np.load(articles_embeddings_path, allow_pickle=True)
articles_embeddings = articles_embeddings[:n_samples, :] if test_mode else articles_embeddings

# Sanity check: array shape and the Python type of a single element.
print(articles_embeddings.shape)
print(type(articles_embeddings[0, 0]))

(364047, 250)
<class 'numpy.float32'>


### surprise

In [8]:
# Attach article metadata to every click (pandas' default inner join on the
# article id), then keep only the columns used for the recommender.
df = clicks.merge(
    articles_metadata,
    left_on='click_article_id',
    right_on='article_id',
)[['user_id', 'category_id', 'article_id']]
df

Unnamed: 0,user_id,category_id,article_id
0,0,281,157541
1,20,281,157541
2,44,281,157541
3,45,281,157541
4,76,281,157541
...,...,...,...
2988176,195186,1,2221
2988177,75658,399,271117
2988178,217129,9,20204
2988179,217129,136,70196


In [9]:
to_keep = ['user_id', 'article_id']

# One row per (user, article) pair; `n_clicks` is how many rows of `df`
# share that pair.
X = df.groupby(to_keep).size().reset_index(name='n_clicks')
X

Unnamed: 0,user_id,article_id,n_clicks
0,0,68866,1
1,0,87205,1
2,0,87224,1
3,0,96755,1
4,0,157541,1
...,...,...,...
2950705,322894,168401,1
2950706,322895,63746,1
2950707,322895,289197,1
2950708,322896,30760,1


In [10]:
from surprise import Dataset, Reader

# The click count plays the role of the rating; the scale runs from 0 up to
# the largest count observed in X.
max_clicks = X['n_clicks'].max()
click_range = (0, max_clicks)
print(click_range)

# X's three columns (user_id, article_id, n_clicks) are handed to Surprise
# as-is.
reader = Reader(rating_scale=click_range)
data = Dataset.load_from_df(df=X, reader=reader)

(0, 33)


In [11]:
from surprise.model_selection import train_test_split, cross_validate

# Hold out 20% of the (user, article, n_clicks) rows for validation.
# The seed is fixed so the split - and every metric computed from it - is
# the same after Restart & Run All.
train, val = train_test_split(data, test_size=0.2, random_state=42)

# `train` is a Surprise Trainset (size in n_ratings); `val` is a plain list.
print('train:', train.n_ratings)
print('  val:', len(val))

train: 2360568
  val: 590142


### baseline

In [12]:
# Baseline to compare the SVD model against: Surprise's NormalPredictor
# (from the random_pred module), fitted on the training split.
from surprise.prediction_algorithms.random_pred import NormalPredictor
model_baseline = NormalPredictor()
# The bare fit() call also displays the fitted estimator as the cell output.
model_baseline.fit(train)

<surprise.prediction_algorithms.random_pred.NormalPredictor at 0x7fb6e7b10cd0>

In [13]:
from surprise import accuracy

# Score the baseline on the held-out split; accuracy.rmse prints the value
# and returns it.
pred_baseline = model_baseline.test(val)
loss_baseline = accuracy.rmse(pred_baseline, verbose=True)

RMSE: 0.1963


### SVD model

In [14]:
from surprise import SVD

# SVD starts from randomly initialised factors; fixing the seed makes the
# fitted model and its RMSE reproducible across runs.
model = SVD(random_state=42)
# The bare fit() call also displays the fitted estimator as the cell output.
model.fit(train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fb6e7bb1a00>

In [15]:
# Score the SVD model on the same held-out split as the baseline;
# accuracy.rmse prints the value and returns it.
pred = model.test(val)
loss = accuracy.rmse(pred, verbose=True)

RMSE: 0.1470


In [None]:
# Look at one prediction in detail. Each prediction unpacks into the user
# id, item id, true rating, estimated rating and a details dict.
preds = model.test(val)
first_prediction = preds[0]
print(first_prediction)
uid, iid, r_ui, est, details = first_prediction

user: 129096     item: 293218     r_ui = 1.00   est = 1.01   {'was_impossible': False}


### cross validation

In [16]:
from surprise import SVD
from surprise.model_selection import cross_validate

# Cross-validate a fresh SVD instance rather than `model`: cross_validate
# refits the estimator it is given on each fold, which would leave `model`
# trained on a CV fold instead of on `train`.
cv_results = cross_validate(SVD(), data, measures=['RMSE'], cv=5, verbose=True)
cv_results

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.1393  0.1410  0.1293  0.1300  0.1420  0.1363  0.0055  
Fit time          37.05   38.79   38.21   38.11   40.03   38.44   0.97    
Test time         6.46    6.58    5.94    6.56    6.83    6.47    0.29    


{'test_rmse': array([0.1393092 , 0.14095016, 0.1292859 , 0.12998094, 0.14201357]),
 'fit_time': (37.05434823036194,
  38.788525104522705,
  38.206127643585205,
  38.10946249961853,
  40.029484033584595),
 'test_time': (6.456169366836548,
  6.577569484710693,
  5.938835382461548,
  6.5642430782318115,
  6.825873613357544)}

### grid search

In [17]:
from surprise.model_selection import GridSearchCV

# 3 x 3 grid over the global learning rate and regularisation of SVD,
# scored by RMSE with 3-fold cross validation.
param_grid = dict(
    lr_all=[0.001, 0.01, 0.1],
    reg_all=[0, 0.4, 0.8],
)
gs = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3)
gs.fit(data)

# Best RMSE first, then the parameter combination that achieved it.
for best in (gs.best_score, gs.best_params):
    print(best["rmse"])

0.13219454564556693
{'lr_all': 0.01, 'reg_all': 0.8}


In [21]:
# Set to True to release the Colab runtime once the notebook has finished.
kill_session = False

# `runtime` is only imported when running on Colab (first cell), so the
# Colab check comes first.
if using_colab:
    if kill_session:
        # kill colab session
        runtime.unassign()