# Initialization

In [None]:
# Detect the execution environment and set the project root accordingly.
# Only a failed import of google.colab means "running locally"; any other error
# (for example a failed Drive mount) is left to surface instead of silently
# switching to the local configuration.
try:
    # executing in Colab
    from google.colab import drive, runtime
except ImportError:
    # executing locally
    using_colab = False
    root_dir = 'D:/OpenClassrooms/projet_9'
else:
    # mount Google Drive, where the project files are read from
    drive.mount('/content/gdrive', force_remount=True)
    using_colab = True
    root_dir = '/content/gdrive/MyDrive/oc_projet_9'

Mounted at /content/gdrive


In [None]:
if using_colab:
    import gdrive.MyDrive.oc_projet_9.scripts.project_9_functions as pf
else:
    import project_9_functions as pf

import os

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import clear_output

In [None]:
# Define where the data files are read from in each environment.
# NOTE(review): clicks_file is only defined in the Colab branch; any later use
# while running locally would raise a NameError — confirm whether it is needed.
if using_colab:
    # Colab: the data files are addressed with paths relative to the working directory
    clicks_sample_path = 'clicks_sample.csv'
    articles_metadata_path = 'articles_metadata.csv'
    articles_embeddings_path = 'articles_embeddings.pickle'
    clicks_dir_path = 'clicks/'
    clicks_file = 'clicks_hour_'

    # main archive stored under the project root, and a second archive unzipped after it
    # (presumably clicks.zip is extracted from the main archive — verify)
    zip_path = root_dir + '/data/' + 'news-portal-user-interactions-by-globocom.zip'
    zip_clicks_path = 'clicks.zip'
    path_list = [clicks_sample_path, articles_metadata_path, articles_embeddings_path, clicks_dir_path]

    # the data counts as extracted only if every expected path exists;
    # each missing path is printed
    already_unziped = True
    for path in path_list:
        if not os.path.exists(path):
            print(path)
            already_unziped = False

    if already_unziped:
        print('data already unziped')
    else:
        # extract both archives with the shell, then clear the cell output
        !unzip $zip_path
        !unzip $zip_clicks_path
        clear_output()


else:
    # local: every data file lives under <root_dir>/data/
    clicks_sample_path = root_dir + '/data/' + 'clicks_sample.csv'
    articles_metadata_path = root_dir + '/data/' + 'articles_metadata.csv'
    articles_embeddings_path = root_dir + '/data/' + 'articles_embeddings.pickle'
    clicks_dir_path = root_dir + '/data/clicks/'

data already unziped


In [None]:
# Global switch read by the cells below: it is passed to the data-loading helpers
# and the slow experimental cells only run when it is True.
test_mode = False

### clicks

In [None]:
# Load the click logs through the project helper, which receives test_mode, the
# sample-file path and the clicks directory (presumably test_mode selects the
# sample file — verify in project_9_functions).
clicks = pf.get_clicks_df(test_mode, clicks_sample_path, clicks_dir_path)
clicks

Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
0,0,1506825423271737,1506825423000,2,157541,1506826828020,4,3,20,1,20,2
1,0,1506825423271737,1506825423000,2,68866,1506826858020,4,3,20,1,20,2
2,1,1506825426267738,1506825426000,2,235840,1506827017951,4,1,17,1,16,2
3,1,1506825426267738,1506825426000,2,96663,1506827047951,4,1,17,1,16,2
4,2,1506825435299739,1506825435000,2,119592,1506827090575,4,1,17,1,24,2
...,...,...,...,...,...,...,...,...,...,...,...,...
2564,10051,1508211372158328,1508211372000,2,84911,1508211557302,4,3,2,1,25,1
2565,322896,1508211376302329,1508211376000,2,30760,1508211672520,4,1,17,1,25,2
2566,322896,1508211376302329,1508211376000,2,157507,1508211702520,4,1,17,1,25,2
2567,123718,1508211379189330,1508211379000,2,234481,1508211513583,4,3,2,1,25,2


### articles embeddings

In [None]:
# Load the article embeddings through the project helper and report their shape.
embeddings = pf.get_embeddings(articles_embeddings_path, test_mode)
print(embeddings.shape)

(364047, 250)


### cosine similarity

In [None]:
from numpy.linalg import norm

# Two toy vectors: A is a single-row matrix (shape (1, 6)), B is 1-D (shape (6,)),
# so the result below is an array holding one value.
A = np.array([[2, 1, 2, 3, 2, 9]])
B = np.array([3, 4, 2, 4, 5, 5])

# cosine similarity = dot product divided by the product of the norms
dot_product = A @ B
norm_product = norm(A) * norm(B)
cosine = dot_product / norm_product
print("Cosine Similarity:", cosine)

Cosine Similarity: [0.81885047]


In [None]:
# Pairwise cosine similarity between the rows of two identical binary matrices.
# Entry [r, c] of the result compares row r of A with row c of B.
A = [
    [0, 1, 0, 0, 1],
    [0, 0, 1, 1, 1],
    [1, 1, 0, 1, 0],
]

# B holds the same rows as A
B = [list(row) for row in A]

similarities = cosine_similarity(A, B)
print(similarities.shape, '\n')
print(similarities)

(3, 3) 

[[1.         0.40824829 0.40824829]
 [0.40824829 1.         0.33333333]
 [0.40824829 0.33333333 1.        ]]


In [None]:
# Same comparison with a single query vector: A has one row, so the result has
# one row and one column per row of B.
A = [[0, 0, 1, 1, 1]]

B = [
    [0, 1, 0, 0, 1],
    [0, 0, 1, 1, 1],
    [1, 1, 0, 1, 0],
]

similarities = cosine_similarity(A, B)
print(similarities.shape, '\n')
print(similarities)

(1, 3) 

[[0.40824829 1.         0.33333333]]


In [None]:
# Can't use this method on the full embeddings: the pairwise similarities array
# would have one row and one column per article, which is too big.
# The call is kept inside a string literal so that it is not executed; being the
# last expression of the cell, the string itself is what the cell displays.
'''
similarities = cosine_similarity(embeddings, embeddings)
print(similarities.shape)
'''

'\nsimilarities = cosine_similarity(embeddings, embeddings)\nprint(similarities.shape)\n'

In [None]:
# Get the "n" elements most similar to the embedding of index "i".
i = 0
n = 5

# similarity of article i against every article -> shape (1, nb_articles)
res = cosine_similarity(embeddings[i,:].reshape(1, -1), embeddings)
print(res.shape, '\n')

# argsort is ascending, so the last n columns are the n most similar articles
# (least similar of them first; article i itself comes last)
top_n_indices = np.argsort(res, axis=1)[:,-n:]
# take_along_axis keeps the (1, n) shape; plain fancy indexing with a 2-D index
# (res[:, top_n_indices]) would add an extra dimension
top_n_values = np.take_along_axis(res, top_n_indices, axis=1)

print(top_n_indices)
print(top_n_values)

(1, 364047) 

[[77974 77610 77965 77608     0]]
[[[0.8822697  0.8859947  0.8881016  0.890307   0.99999994]]]


In [None]:
# Rank the n articles most similar to the last article seen, most similar first.
n = 5
last_seen = 509

res = cosine_similarity(embeddings[last_seen,:].reshape(1, -1), embeddings)
print(res.shape, '\n')
sorted_indices = np.argsort(res, axis=1)
# reverse along the column axis only: np.flip without axis reverses every axis,
# which would also swap the rows as soon as there is more than one query
top_n_indices = np.flip(sorted_indices[:,-n:], axis=1)
# take_along_axis keeps the (1, n) shape instead of adding an extra dimension
top_n_values = np.take_along_axis(res, top_n_indices, axis=1)

# The most recommended article is always itself because its similarity is 1.
# Don't forget to exclude the article when computing similarities.
print(top_n_indices)
print('\ncosine_similarity:\n', np.round(top_n_values, 3))

(1, 364047) 

[[   509   8953    468 190524   1519]]

cosine_similarity:
 [[[1.    0.813 0.783 0.781 0.775]]]


In [None]:
# Same ranking, delegated to the project helper.
# NOTE(review): in the recorded outputs, the indices returned here that are above
# last_seen are one lower than in the manual ranking (8952 vs 8953, 190523 vs
# 190524), while 468 is unchanged. Presumably the helper removes the query row
# before ranking and does not map indices back — verify in project_9_functions.

n = 5
method = 'last_seen'
params = {'last_seen': 509}

recs = pf.get_recommended_indices(n, embeddings, method, params)
print(recs.shape)
print(recs[0,:])

(1, 5)
[  8952    468 190523   1518   2971]


### get recommended indices for all articles

In [None]:
# Get recommended indices for all articles and save them in a file.
# The file will be used when calling the API.
%%time

if test_mode:
    n = 5
    method = 'last_seen'
    params = {}

    recs_list = []
    nb_articles = embeddings.shape[0]
    for i in range(nb_articles):
        params['last_seen'] = i
        recs_i = pf.get_recommended_indices(n, embeddings, method, params)
        recs_list.append(recs_i)

    recs_array = np.concatenate(recs_list, axis=0)
    print(recs_array.shape)

else:
    print('too slow need optimization')

too slow need optimization
CPU times: user 0 ns, sys: 243 µs, total: 243 µs
Wall time: 77 µs


In [None]:
def extract_with_indices(x, idx):
    """Pick, for every row of ``x``, the columns listed in ``idx``.

    Args:
        x: 2-D array to read from.
        idx: integer column indices, broadcast against a column vector of row
            numbers; with shape (x.shape[0], k), row r of the result is
            x[r, idx[r]].

    Returns:
        Array of the selected values, shape (x.shape[0], k) for a 2-D ``idx``.
    """
    # column vector of row numbers, so it broadcasts against idx row by row
    row_selector = np.arange(x.shape[0]).reshape(-1, 1)
    return x[row_selector, idx]

In [None]:
# Test to optimize the per-article loop (too slow): rank a whole chunk of
# articles per cosine_similarity call instead of one article at a time.
#
# Similarities are computed against the full embeddings matrix, so the returned
# indices are row indices of `embeddings`. Only the similarity of an article with
# itself is masked out; other articles of the same chunk stay eligible.

if test_mode:
    n = 5
    chunk_size = 450

    nb_articles = embeddings.shape[0]
    # every article needs at least n candidates other than itself
    assert n <= nb_articles - 1, 'n too big, must be <= nb_articles - 1'

    recs_list = []
    for i in range(0, nb_articles, chunk_size):
        # index of the last row of the chunk (the final chunk may be shorter)
        j = min(i + chunk_size, nb_articles) - 1
        print(f'{i} - {j}'.rjust(2*(1 + len(str(nb_articles)))), '\n')

        chunk_indices = np.arange(i, j+1)
        embedding_rows = embeddings[i:j+1,:]
        print(f'rows: {embedding_rows.shape}  '.ljust(18))

        # row of the chunk printed as a sanity check (clamped because the last
        # chunk may hold a single row)
        test = min(1, embedding_rows.shape[0] - 1)

        res = cosine_similarity(embedding_rows, embeddings)
        # an article must not be recommended for itself: mask its own column
        res[np.arange(chunk_indices.size), chunk_indices] = -np.inf
        print(f'res: {res.shape}  '.ljust(18), res[test,:n])

        # argpartition finds the n best columns per row without a full sort ...
        top_n_indices = np.argpartition(res,-n)[:,-n:]
        top_n_values = extract_with_indices(res, top_n_indices)

        # ... then only those n candidates are sorted, most similar first
        sorted_top_n_indices = extract_with_indices(top_n_indices, np.flip(np.argsort(top_n_values), axis=1))
        print(f'top: {sorted_top_n_indices.shape}  '.ljust(18), sorted_top_n_indices[test,:])

        sorted_top_n_values = extract_with_indices(res, sorted_top_n_indices)
        print(f'val: {sorted_top_n_values.shape}  '.ljust(18), sorted_top_n_values[test,:])

        # keep the ranked indices (not the unordered argpartition output)
        recs_list.append(sorted_top_n_indices)
        print('\n', '-'*10)

    recs = np.concatenate(recs_list, axis=0)
    print(f'recs: {recs.shape}')

In [None]:
# optimized code in a function
%%time

n = 20
method = 'last_seen'
params = {}

if test_mode:
    chunk_size = 300
else:
    chunk_size = 2500

recs = pf.get_all_recommendations(n, embeddings, method, params, chunk_size)
print(recs.shape)

(364047, 20)
CPU times: user 1h 37s, sys: 1min 37s, total: 1h 2min 15s
Wall time: 35min 25s


In [None]:
# Set to True to write the recommendations array to disk.
save_recs = False

# The file name records n; test runs get a "_test" suffix.
file_name = f'recs_idx_{n}'
if test_mode:
    file_name += '_test'
recs_file_path = f'{root_dir}/data/{file_name}.npy'
print(recs_file_path)

if save_recs:
    # write the array, then read it back and print the shape of what was loaded
    np.save(recs_file_path, recs, allow_pickle=True)
    test_load = np.load(recs_file_path)
    print(test_load.shape)

In [None]:
# Set to True to kill the Colab session once the notebook has run.
# Nothing happens when running locally: `runtime` is only imported in Colab.
kill_session = False

if kill_session and using_colab:
    # kill colab session
    runtime.unassign()