# Hybrid Recommendation System using LightFM
- [Medium Reference](https://medium.com/@dikosaktiprabowo/hybrid-recommendation-system-using-lightfm-e10dd6b42923)

In [73]:
from pathlib import Path

import pandas as pd
import numpy as np

from lightfm.data import Dataset
from lightfm.cross_validation import random_train_test_split

from lightfm import LightFM # model
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score # evaluation

In [11]:
base_path = Path("__file__").resolve().parents[2]
data_path = base_path / "data"

In [12]:
df_list = []

## Load data (users, movies, rating)

In [38]:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url', 'action', 'adventure', 'animation', 'children','comedy', 'crime','documentary', 'drama', 'fantasy', 'film_noir','horror', 'musical', 'mystery', 'romance',' scifi', 'thriller', 'war', 'western', 'no_genre']
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
df_list = []
for i,c,s in zip(['user', 'item', 'data'], [u_cols,m_cols,r_cols], ['|','|','\t']):
    filename = 'u.'+i
    file_path = data_path / "movielens" / "ml-100k" / filename
    temp = pd.read_csv(file_path, sep=s, names=c,
                    encoding='latin-1')
    df_list.append(temp)
user, item, rating = df_list[0].copy(),df_list[1].copy(),df_list[2].copy()


## Data Preprocessing

### User

#### Create binning for Age
- Check quantiles to create four groups proportionally

In [39]:
pd.qcut(user['age'],4).head()

0    (6.999, 25.0]
1     (43.0, 73.0]
2    (6.999, 25.0]
3    (6.999, 25.0]
4     (31.0, 43.0]
Name: age, dtype: category
Categories (4, interval[float64, right]): [(6.999, 25.0] < (25.0, 31.0] < (31.0, 43.0] < (43.0, 73.0]]

- Create adjusted binning

In [40]:
user['age_bin'] = pd.cut(user['age'], bins=[0,25,30,45,np.inf], labels= ['<= 25', '26 - 30', '31 - 45', '>= 45'])

In [41]:
user.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code,age_bin
0,1,24,M,technician,85711,<= 25
1,2,53,F,other,94043,>= 45
2,3,23,M,writer,32067,<= 25
3,4,24,M,technician,43537,<= 25
4,5,33,F,other,15213,31 - 45


In [51]:
user_features = pd.get_dummies(user.drop(columns = ['age','zip_code']), dtype=np.int32)

In [52]:
user_features.head()

Unnamed: 0,user_id,sex_F,sex_M,occupation_administrator,occupation_artist,occupation_doctor,occupation_educator,occupation_engineer,occupation_entertainment,occupation_executive,...,occupation_retired,occupation_salesman,occupation_scientist,occupation_student,occupation_technician,occupation_writer,age_bin_<= 25,age_bin_26 - 30,age_bin_31 - 45,age_bin_>= 45
0,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
1,2,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,3,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
3,4,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
4,5,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [53]:
user_features_col = user_features.drop(columns=['user_id']).columns.values
user_feat = user_features.drop(columns=['user_id']).to_dict(orient='records')

In [57]:
print(user_features_col)

['sex_F' 'sex_M' 'occupation_administrator' 'occupation_artist'
 'occupation_doctor' 'occupation_educator' 'occupation_engineer'
 'occupation_entertainment' 'occupation_executive' 'occupation_healthcare'
 'occupation_homemaker' 'occupation_lawyer' 'occupation_librarian'
 'occupation_marketing' 'occupation_none' 'occupation_other'
 'occupation_programmer' 'occupation_retired' 'occupation_salesman'
 'occupation_scientist' 'occupation_student' 'occupation_technician'
 'occupation_writer' 'age_bin_<= 25' 'age_bin_26 - 30' 'age_bin_31 - 45'
 'age_bin_>= 45']


In [63]:
user.shape

(943, 6)

### Movie
- For movie, only  "genre" feature needed and it is already one hot encoded.

In [25]:
item.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url,action,adventure,animation,children,comedy,...,film_noir,horror,musical,mystery,romance,scifi,thriller,war,western,no_genre
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [54]:
item_features = item.drop(columns=['title', 'release_date', 'video_release_date', 'imdb_url'])
item_features_col = item_features.drop(columns=['movie_id']).columns.values
item_feat = item_features.drop(columns =['movie_id']).to_dict(orient='records')

In [58]:
print(item_features_col)

['action' 'adventure' 'animation' 'children' 'comedy' 'crime'
 'documentary' 'drama' 'fantasy' 'film_noir' 'horror' 'musical' 'mystery'
 'romance' ' scifi' 'thriller' 'war' 'western' 'no_genre']


In [64]:
item.shape

(1682, 24)

### user-item interaction

In [27]:
rating.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


- We created a new feature with the assumption that rating of 3 and higher classify as liked by user

In [28]:
rating['liked'] = np.where(rating['rating'] >=3,1,0)

In [62]:
rating.shape

(100000, 4)

## LightFM model

### Dataset
- Fit `users`, `items`, `user features`, `item features` into lightFM's `Dataset()` object to create mappings

In [59]:
dataset = Dataset()
dataset.fit(
    users=[x for x in user['user_id']], 
    items=[x for x in item['movie_id']], 
    item_features=item_features_col, 
    user_features=user_features_col
)

In [60]:
num_users, num_items = dataset.interactions_shape()

In [61]:
print(f"Number of users: {num_users}")
print(f"Number of items: {num_items}")

Number of users: 943
Number of items: 1682


#### Build `item_features` to be fitted into the model

In [65]:
item_features = dataset.build_item_features((x,y) for x,y in zip(item_features['movie_id'],item_feat))

#### Build `user_features` to be fitted into the model

In [66]:
user_features = dataset.build_user_features((x,y) for x,y in zip(user['user_id'],user_feat))

#### Build interactions (user — item) and its respective weights (in this case each user’s movie rating score)

In [67]:
(interactions, weights) = dataset.build_interactions((x, y) for x,y in zip(rating['user_id'], rating['movie_id']))

### Model Training

#### Split train test

In [69]:
# split interactions
train, test = random_train_test_split(interactions,test_percentage=0.2, random_state=779)
# split weights
train_w, test_w = random_train_test_split(weights, test_percentage=0.2, random_state=779)

### Create Model

In [71]:
n_components = 30
loss = 'warp'
epoch = 30
num_thread = 4
model = LightFM(no_components= n_components, loss=loss, random_state = 1616)
model.fit(train,  user_features= user_features, item_features= item_features, epochs=epoch,num_threads = num_thread, sample_weight = train_w)

<lightfm.lightfm.LightFM at 0x3200a6710>

### Model Evaluation
- Precision and Recall will be calculated by k number of top recommendations. 

In [76]:
k = 10
train_precision = precision_at_k(model, train, k=k,item_features=item_features, user_features=user_features).mean()
test_precision = precision_at_k(model, test, train_interactions=train, k=k,item_features=item_features, user_features=user_features).mean()

train_recall = recall_at_k(model, train, k=k,item_features=item_features, user_features=user_features).mean()
test_recall = recall_at_k(model, test,train_interactions=train, k=k,item_features=item_features, user_features=user_features).mean()

train_auc = auc_score(model, train,item_features=item_features, user_features=user_features).mean()
test_auc = auc_score(model, test, train_interactions=train,item_features=item_features, user_features=user_features).mean()

In [79]:
print(f"Precision@{k} - Train: {train_precision:.2f}")
print(f"Precision@{k} - Test: {test_precision:.2f}")

print(f"Recall@{k} - Train: {train_recall:.2f}")
print(f"Recall@{k} - Test: {test_recall:.2f}")

print(f"AUC - Train: {train_auc:.2f}")
print(f"AUC - Test: {train_auc:.2f}")

Precision@10 - Train: 0.49
Precision@10 - Test: 0.25
Recall@10 - Train: 0.09
Recall@10 - Test: 0.13
AUC - Train: 0.90
AUC - Test: 0.90
