# Collaborative filtering

In [1]:
#### import json
import pickle
import datetime as dt
from itertools import chain
from collections import Counter

import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix, csr_matrix

from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

# Reproducibility: seed NumPy's global random generator.
np.random.seed(16)

# Display settings: two decimals for NumPy and pandas, and no limit on the
# number of DataFrame columns shown.
np.set_printoptions(precision=2)
pd.set_option('display.precision', 2)
pd.set_option('display.max_columns', None)

In [2]:
# Execute the companion notebook so that its definitions become available here.
# The helpers called further down (create_dfs, mean_avg_precision, write_submit)
# are not defined in this notebook, so they presumably come from it — TODO confirm.
%run Functions.ipynb

In [4]:
# Variables
# True: evaluation run (MAP is computed against df_test further down);
# False: submission run (write_submit is called instead).
is_test = True

sample_data = None  # 100_000

# Cut-off date used to decide which content is still available; it depends on the run mode.
filter_date = dt.date(2021, 3, 10) if is_test else dt.date(2021, 4, 1)

# Category names; they match the one-hot category columns shown in the df preview.
categories_list = [
    'accion', 'animacion', 'animales', 'aventura', 'belico', 'biografia',
    'ciencia', 'ciencia ficcion', 'cocina', 'comedia', 'competencia',
    'crimen', 'cultura', 'deporte', 'dibujos animados', 'documental',
    'drama', 'entretenimiento', 'entrevistas', 'espectaculo', 'familia',
    'fantasia', 'historia', 'humor', 'infantil', 'interes general',
    'investigacion', 'magazine', 'moda', 'musica', 'naturaleza',
    'periodistico', 'policial', 'politico', 'reality', 'religion',
    'restauracion', 'romance', 'suspenso', 'teatro', 'terror', 'viajes',
    'western',
]

In [6]:
# Load the viewing data through the shared helper. Only the evaluation run
# keeps the second returned frame (df_test); the submission run discards it.
if not is_test:
    df, _ = create_dfs(sample_data=sample_data, ret_test=False, clean=True)  # to submit
else:
    df, df_test = create_dfs(sample_data=sample_data, clean=True)  # to test
df.tail(3)

Unnamed: 0,customer_id,account_id,device_type,asset_id,tunein,resume,min_watching,tunein_hour,content_id,released_year,description,cast_first_name,credits_first_name,audience,made_for_tv,pay_per_view,pack_premium_1,pack_premium_2,end_vod_date,run_time_min,show_type,country_of_origin,accion,animacion,animales,aventura,belico,biografia,ciencia,ciencia ficcion,cocina,comedia,competencia,crimen,cultura,deporte,dibujos animados,documental,drama,entretenimiento,entrevistas,espectaculo,familia,fantasia,historia,humor,infantil,interes general,investigacion,magazine,moda,musica,naturaleza,periodistico,policial,politico,reality,religion,restauracion,romance,suspenso,teatro,terror,viajes,western,title,keywords,ranking
3083503,90638,112255,STATIONARY,13829.0,2021-03-09 22:56:00,0,56.0,22,2043.0,2017.0,"Nelson conoce al 'Chelo' Esculapio, gallero y ...","Luis Brandoni, Peter Lanzani, Luis Luque, Juli...",Bruno Stagnaro,General,0,0,0,0,2027-08-01,74.0,Serie,AR,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,un gallo para esculapio,"pandillas,crimen,droga,mafia,venganza,robo",7
3083541,90644,112260,STATIONARY,16383.0,2021-03-09 22:11:00,0,46.0,22,3825.0,2016.0,Cuando Clayton regresa a casa después de celeb...,"John Cusack, Samuel L. Jackson, Isabelle Fuhrm...",Tod Williams,General,0,0,0,0,2022-09-30,97.0,Película,US,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,el pulso,"apocalipsis,familia,supervivencia,de libros,zo...",4
3084127,50246,112348,STB,29798.0,2021-03-09 21:41:00,1,12.0,21,2163.0,2016.0,"La Gallina Pintadita Mini trae historias, acti...",unknow-actors,Juliano Prado,Preescolar,0,0,0,0,2022-04-30,12.0,TV,BR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,gallina pintadita mini,educativo,10


### Create view matrix

In [7]:
# Content ids whose VOD window ends after the cut-off date, ordered from the
# most to the least frequent in df (value_counts sorts by descending count).
content_ids_available = (
    df.loc[df['end_vod_date'].dt.date > filter_date, 'content_id']
    .value_counts()
    .index
    .values
)
content_ids_available

array([2040., 2160., 1139., ..., 2911., 2719., 2505.])

In [8]:
# One row per (account, content) pair, keeping the highest ranking recorded for that pair.
df_views = (
    df
    .groupby(['account_id', 'content_id'], as_index=False)['ranking']
    .max()
)

df_views.tail(5)

Unnamed: 0,account_id,content_id,ranking
751253,112278,1316.0,1
751254,112298,3941.0,1
751255,112348,1983.0,8
751256,112348,2163.0,10
751257,112356,1018.0,1


In [9]:
# Turn the numeric ids into prefixed string labels ('U<account>', 'C<content>')
# and make sure the ranking is stored as an integer.
#
# Plain column assignment is used instead of `df_views.loc[:, col] = ...`:
# since pandas 2.0 the `.loc[:, col]` form writes into the existing column and
# tries to keep its dtype, so storing strings in a numeric column that way is
# deprecated. Column assignment replaces the column in every pandas version.
#
# NOTE: this cell is not idempotent. Running it a second time fails because the
# ids are already strings and can no longer be cast to int; re-run the cell
# that builds df_views first.
df_views['account_id'] = 'U' + df_views['account_id'].astype(int).astype(str)
df_views['content_id'] = 'C' + df_views['content_id'].astype(int).astype(str)
df_views['ranking'] = df_views['ranking'].astype(int)

In [10]:
# Drop the reference to the full frame; the cells visible below only use df_views
# (and df_test in evaluation mode).
del df

In [11]:
def get_similarity_content2content(user_ids, content_ids):
    """
    Compute the content-to-content cosine similarity.

    ``user_ids`` and ``content_ids`` are parallel sequences of integer
    coordinates: position ``i`` says that user ``user_ids[i]`` interacted with
    content ``content_ids[i]``. Each interaction contributes a weight of 1.

    Returns a tuple ``(similarity, content_user_matrix)`` where
    ``content_user_matrix`` is the sparse content x user interaction matrix and
    ``similarity`` is the cosine similarity between its rows.
    """
    interaction_weights = np.ones(len(user_ids), dtype=int)
    coordinates = (content_ids, user_ids)  # rows are content, columns are users
    content_user_matrix = csr_matrix((interaction_weights, coordinates))

    return cosine_similarity(content_user_matrix), content_user_matrix


def get_recommendations_from_similarity(similarity_matrix, content_user_matrix, top_n=20):
    """
    Rank content for every user from a content-to-content similarity matrix.

    A user's score for a content item is the sum of that item's similarities to
    every item the user interacted with. Items the user already interacted with
    get a score of -1 so that they rank below any item with a non-negative score.

    Parameters
    ----------
    similarity_matrix : numpy.ndarray, shape (n_content, n_content)
        Dense content-to-content similarity.
    content_user_matrix : scipy sparse matrix, shape (n_content, n_users)
        Interaction matrix with content as rows and users as columns.
    top_n : int, default 20
        Maximum number of content indices to return per user.

    Returns
    -------
    pandas.Series
        Indexed by the integer user position (0 .. n_users - 1). Each value is a
        numpy array holding up to ``top_n`` content indices, best score first.
        The Series is empty when there are no users.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    user_content_matrix = csr_matrix(content_user_matrix.T)
    # Sum of similarities to all content each user interacted with.
    user_content_score = user_content_matrix.dot(similarity_matrix)

    n_users = user_content_score.shape[0]
    # Filling one object array and building a single Series avoids creating one
    # Series per user and concatenating them, which is slow and raises when
    # there are no users at all.
    ranked_per_user = np.empty(n_users, dtype=object)

    for user_id in range(n_users):
        scores = user_content_score[user_id, :]

        # Column indices stored for this user's CSR row: the content already seen.
        row_start = user_content_matrix.indptr[user_id]
        row_end = user_content_matrix.indptr[user_id + 1]
        content_seen = user_content_matrix.indices[row_start:row_end]
        scores[content_seen] = -1  # do not recommend already seen content

        # Reverse first, then truncate: `[-top_n:]` would return every item for
        # top_n == 0. The copy detaches the short result from the full-length
        # argsort array, which a plain slice (a view) would keep alive.
        ranked_per_user[user_id] = np.argsort(scores)[::-1][:top_n].copy()

    return pd.Series(ranked_per_user, index=np.arange(n_users, dtype=np.int64), dtype=object)


def get_recommendations(data_views, top_n=20):
    """
    Build item-based collaborative-filtering recommendations for every account.

    Parameters
    ----------
    data_views : pandas.DataFrame
        One row per (account, content) interaction. ``account_id`` and
        ``content_id`` must be strings made of a one-character prefix followed
        by the integer id (e.g. 'U123', 'C456'); the prefix is stripped again
        before returning.
    top_n : int, default 20
        Number of content ids to recommend per account.

    Returns
    -------
    pandas.Series
        Indexed by integer account id, sorted ascending. Each value is a list of
        integer content ids, best first. Content ids are returned as integers,
        like the account ids in the index, so they can be compared with numeric
        ids elsewhere.
    """
    # Encode the labels as contiguous integers so they can be used as matrix
    # coordinates; the encoders are kept to map the results back afterwards.
    user_label_encoder = LabelEncoder()
    user_ids = user_label_encoder.fit_transform(data_views.account_id)

    content_label_encoder = LabelEncoder()
    content_ids = content_label_encoder.fit_transform(data_views.content_id)

    # Compute recommendations in the encoded id space.
    similarity_matrix, content_user_matrix = get_similarity_content2content(user_ids, content_ids)
    recommendations = get_recommendations_from_similarity(similarity_matrix, content_user_matrix, top_n=top_n)

    # Roll back the account ids: encoded position -> 'U<id>' label -> integer id.
    recommendations.index = user_label_encoder.inverse_transform(recommendations.index)
    recommendations.index = recommendations.index.map(lambda label: label[1:]).astype(int)

    # Roll back the content ids. `classes_[i]` is the label encoded as `i`, so
    # the prefix is stripped and the int cast done once per distinct content
    # instead of once per user.
    original_content_ids = np.array(
        [int(label[1:]) for label in content_label_encoder.classes_]
    )
    recommendations = recommendations.map(
        lambda encoded: original_content_ids[encoded].tolist()
    )

    return recommendations.sort_index()

In [12]:
# Sanity check of the relabelled frame: both id columns should now hold strings
# (object dtype) and ranking should be an integer column.
df_views.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 751258 entries, 0 to 751257
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   account_id  751258 non-null  object
 1   content_id  751258 non-null  object
 2   ranking     751258 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 22.9+ MB


In [17]:
%%time

# NOTE(review): the previous comment here said "We use 40 to get more available
# items", but the call requests top_n=20 — confirm which value is intended.
recommendations = get_recommendations(df_views, top_n=20)

recommendations

CPU times: user 1min 19s, sys: 1.92 s, total: 1min 21s
Wall time: 1min 21s


0         [1488, 2277, 2677, 1363, 3119, 4033, 3364, 100...
1         [1760, 1759, 2040, 1758, 1971, 2012, 3716, 173...
2         [1576, 1698, 1495, 3716, 2757, 1971, 2811, 36,...
3         [3711, 3210, 1008, 1345, 580, 3213, 3744, 173,...
4         [3710, 3707, 3900, 3709, 3210, 173, 3204, 2677...
                                ...                        
112271    [1847, 2815, 2323, 2817, 2827, 2120, 2809, 225...
112278    [304, 1803, 2160, 1139, 607, 496, 2163, 20, 49...
112298    [3938, 3972, 3897, 1945, 3959, 3939, 3546, 327...
112348    [2172, 607, 304, 496, 1316, 562, 493, 491, 635...
112356    [1836, 3891, 176, 1362, 1797, 1800, 1798, 1443...
Length: 103622, dtype: object

In [19]:
# Submission runs write the recommendations out; evaluation runs print the
# mean average precision against the test frame instead.
if not is_test:
    write_submit(recommendations)
else:
    print('map', mean_avg_precision(df_test, recommendations))

map 0.0


### Keep only the recommendations that will remain available

- Vamos a remover los ítems que no estarán disponibles y completamos con el top