In [None]:
!pip install lightfm

### V3 Recommender: LightFM


Doc: https://making.lyst.com/lightfm/docs/lightfm.html

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime
from lightfm import LightFM
from scipy.sparse import csr_matrix

### Datos

In [None]:
# Columns of metadata.csv that are parsed as datetimes on load.
metadata_date_columns = ['create_date', 'modify_date', 'start_vod_date', 'end_vod_date']

# metadata.csv is semicolon-delimited.
metadata = pd.read_csv(
    "../../data/metadata.csv",
    sep=";",
    parse_dates=metadata_date_columns,
)

# train.csv uses the default comma delimiter; tunein/tuneout are parsed as datetimes.
train = pd.read_csv(
    "../../data/train.csv",
    parse_dates=['tunein', 'tuneout'],
)

In [None]:
# Preview the first rows of the loaded train data.
train.head()

In [None]:
# Preview the first rows of the loaded metadata.
metadata.head()

In [None]:
# Left join: keep every train row and attach the metadata columns that match on asset_id.
train_metadata = pd.merge(train, metadata, how='left', on='asset_id')

In [None]:
# List the columns available after the merge.
train_metadata.columns

In [None]:
# Keep only the columns used in the rest of the notebook.
kept_columns = ['account_id', 'tunein', 'tuneout', 'content_id', 'title', 'end_vod_date']
train_metadata = train_metadata.loc[:, kept_columns]

In [None]:
# Preview the reduced frame.
train_metadata.head()

### Train test split

In [None]:
# Cut-off for the split: rows with tunein after 2021-03-01 00:00 go to test, the rest to train.
train_max_date = datetime(2021, 3, 1)

In [None]:
# Temporal split on tunein: strictly after the cut-off goes to test, the rest stays in train.
# Rows whose tunein is missing satisfy neither comparison and end up in neither set.
# .copy() makes both results independent frames, so later column assignments on
# train_metadata do not raise SettingWithCopyWarning.
after_cutoff = train_metadata.tunein > train_max_date
up_to_cutoff = train_metadata.tunein <= train_max_date

test = train_metadata[after_cutoff].copy()
train_metadata = train_metadata[up_to_cutoff].copy()

### Filtros

Se identifican los contenidos que no van a estar disponibles (ninguno de sus asset_ids) después de la fecha de corte de train, para excluirlos más adelante al generar las recomendaciones.

In [None]:
# Drop the timezone from end_vod_date so it can be compared with naive datetimes.
# assign() returns a new frame instead of writing into what may be a slice of the
# merged data, which avoids SettingWithCopyWarning.
train_metadata = train_metadata.assign(
    end_vod_date=train_metadata['end_vod_date'].dt.tz_localize(None)
)


In [None]:
# Latest end_vod_date per content_id, as a two-column frame (content_id, end_vod_date).
# NOTE(review): this only sees the rows present in train_metadata; assets of a content
# that do not appear there are not taken into account - confirm this is intended.
max_end_vod_date_by_content = (
    train_metadata[['content_id', 'end_vod_date']]
    .groupby('content_id', as_index=False)
    .agg(end_vod_date=('end_vod_date', 'max'))
)


In [None]:
# Contents whose latest end_vod_date is before the train/test cut-off.
# The cut-off is taken from train_max_date instead of repeating the date literal, so the
# two can never drift apart.
# NOTE: despite its name, this variable holds contents ending before train_max_date
# (2021-03-01); the name is kept because later cells reference it.
expired_mask = max_end_vod_date_by_content.end_vod_date < train_max_date
end_before_april = max_end_vod_date_by_content.loc[expired_mask, 'content_id'].unique()
end_before_april[:10]

### Interacciones

Armo un dataset con las interacciones account - content.

In [None]:
# Independent copy of the (account, content) pairs from the training window.
interaction_columns = ['account_id', 'content_id']
interactions = train_metadata.loc[:, interaction_columns].copy()
interactions.head()

- Nulos:

In [None]:
# Number of missing values per column.
interactions.isna().sum()

In [None]:
# (rows, columns) before dropping nulls.
interactions.shape

In [None]:
# Distinct accounts before dropping nulls.
interactions.account_id.nunique()

In [None]:
# Distinct contents before dropping nulls.
interactions.content_id.nunique()

- Elimino los nulos

In [None]:
# Discard every row that has a missing value in any column.
interactions = interactions.dropna(axis='index', how='any')

In [None]:
# (rows, columns) after dropping nulls.
interactions.shape

In [None]:
# Preview after dropping nulls.
interactions.head()

In [None]:
# Keep a single row per distinct (account_id, content_id) pair.
interactions = interactions.drop_duplicates(keep='first')

In [None]:
# (rows, columns) after removing duplicate pairs.
interactions.shape

In [None]:
# Distinct accounts left after cleaning.
interactions.account_id.nunique()

In [None]:
# Distinct contents left after cleaning.
interactions.content_id.nunique()

- Agrego columna **"watched"** con el valor 1 que indica que la cuenta vio el contenido

In [None]:
# Constant flag: every remaining (account, content) row is marked as watched.
interactions = interactions.assign(watched=1)

In [None]:
# Preview with the new watched column.
interactions.head()

### Matrix

Armo una matriz con las interacciones. Los valores van a ser 1 si el usuario vio el contenido y 0 de lo contrario

In [None]:
# Dense account x content table; cells without an interaction are NaN at this point.
# aggfunc='mean' is the pivot_table default, spelled out here.
interactions_matrix = interactions.pivot_table(
    values='watched',
    index='account_id',
    columns='content_id',
    aggfunc='mean',
)


In [None]:
# (accounts, contents) of the pivoted table.
interactions_matrix.shape

In [None]:
# Preview of the pivoted table before filling missing cells.
interactions_matrix.head()

In [None]:
# Same shape check as the earlier cell; nothing has modified the table in between.
interactions_matrix.shape

In [None]:
# Missing cells become 0 (no interaction between that account and content).
interactions_matrix = interactions_matrix.fillna(value=0)

In [None]:
# Preview after filling missing cells with 0.
interactions_matrix.head()

- Armo diccionario de accounts que voy a usar más adelante al momento de armar las predicciones

In [None]:
# Map each account_id to its row position in interactions_matrix.
acc_ids = list(interactions_matrix.index)
account_dict = {account: row for row, account in enumerate(acc_ids)}

- Convierto a **"csr"** matrix

In [None]:
# Sparse (CSR) version of the dense table; rows and columns keep the order of interactions_matrix.
dense_interactions = interactions_matrix.to_numpy()
account_content_interactions = csr_matrix(dense_interactions)

### Modelo

In [None]:
# LightFM model: WARP loss, 16 latent components, learning rate 0.03.
# random_state is fixed at 100 so the run is seeded.
model = LightFM(
    no_components=16,
    learning_rate=0.03,
    loss='warp',
    random_state=100,
)

- Entrenamos el modelo **FIT**

In [None]:
%%time
model = model.fit(account_content_interactions, epochs=10)

### Popularidad para recomendaciones cold start

Para usuarios que no vieron nada les vamos a recomendar los contenidos ordenados por popularidad

In [None]:
# Popularity = number of distinct accounts per content in the training window, most popular first.
popularity_df = (
    train_metadata[['account_id', 'content_id']]
    .groupby('content_id', as_index=False)
    .agg(accounts=('account_id', 'nunique'))
    .sort_values(by='accounts', ascending=False)
)
popularity_df.head()

### Generación de recomendaciones

In [None]:
%%time
account_watched_contents = {}

watched_contents = interactions.groupby('account_id').agg({'content_id': 'unique'})

for account in watched_contents.index:
    watched = watched_contents.loc[account, 'content_id']
    account_watched_contents[account] = watched

In [None]:
%%time
recomms = {
    'account_id': [],
    'recomms': []
}

n_users, n_items = interactions_matrix.shape
item_ids = np.arange(n_items)

for account in tqdm(train_metadata.account_id.unique()):
    if account in list(interactions_matrix.index):
        acc_x = account_dict[account]

        preds = model.predict(user_ids=acc_x, item_ids = item_ids)

        scores = pd.Series(preds)
        scores.index = interactions_matrix.columns
        scores = list(pd.Series(scores.sort_values(ascending=False).index))[:200] # Tomo las primeras 200 para que sea más rapido

        watched_contents = account_watched_contents[account]
        scores = [x for x in scores if x not in watched_contents]
        scores = [x for x in scores if x not in end_before_april]
        scores = scores[:20]

        recomms['account_id'].append(account)
        recomms['recomms'].append(scores)
    else:
        recomms['account_id'].append(account)
        recomms['recomms'].append(popularity_df.sort_values(by='accounts', ascending=False).content_id.unique()[:20])

In [None]:
# One row per account, ordered by account_id ascending.
recomms = pd.DataFrame(recomms).sort_values(by='account_id', ascending=True)

In [None]:
# Preview the recommendations per account.
recomms.head()

In [None]:
# Sanity check: every account is expected to have exactly 20 recommendations.
recomms['n_recomms'] = recomms.recomms.map(len)

# Number of accounts whose list length differs from 20.
less_than_20 = int((recomms.n_recomms != 20).sum())

if less_than_20 > 0:
    print("FALTAN RECOMMS")

### Evaluación

In [None]:
# Ground truth: the distinct contents each account has in the held-out `test` split.
# (`actual_views` was previously referenced without being defined anywhere in the notebook.)
actual_views = (
    test.dropna(subset=['content_id'])
    .groupby('account_id', as_index=False)
    .agg({'content_id': 'unique'})
)

# Pair predictions and labels by account_id rather than by row position.
# The inner join keeps only accounts that have both recommendations and test views;
# accounts that appear only in test are therefore not scored.
evaluation = recomms.merge(actual_views, on='account_id', how='inner')

preds = evaluation['recomms'].values
labels = evaluation['content_id'].values

In [None]:
# Average precision (AP) of each recommendation list against its labels.
aps = []
for pred, label in zip(preds, labels):
    n = len(pred)
    # 1-based rank of every recommended item.
    ranks = np.arange(n, dtype=np.int32) + 1.
    # True where the recommended item is among the labels (np.isin replaces the deprecated np.in1d).
    rel_k = np.isin(pred, label)
    # Running count of hits: 1, 2, ..., number of relevant items found.
    tp = np.arange(1, rel_k.sum() + 1)
    # Ranks at which the relevant items were found.
    denom = ranks[rel_k]
    # Precision at each hit, summed and normalised by the number of labels.
    ap = (tp / denom).sum() / len(label)
    aps.append(ap)

In [None]:
# Mean of the per-account average precisions (MAP).
np.mean(aps)