In [1]:
import numpy as np
import pandas as pd

# all lightfm imports 
from lightfm.data import Dataset
from lightfm import LightFM
from lightfm import cross_validation
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

# imports re for text cleaning 
import re
from datetime import datetime, timedelta

# we will ignore pandas warning 
import warnings
warnings.filterwarnings('ignore')



In [2]:
moviedf = pd.read_csv('movies_metadata.csv')
# ratingdf = pd.read_csv('ratings.csv')
ratingdf = pd.read_csv('ratings_small.csv')
keyworddf = pd.read_csv('keywords.csv')
creditdf = pd.read_csv('credits.csv')
linkdf = pd.read_csv('links_small.csv')

In [3]:
moviedf = moviedf.drop([19730, 29503, 35587])

In [4]:
moviedf['id'] = moviedf['id'].astype('int')
moviedf['id']

0           862
1          8844
2         15602
3         31357
4         11862
          ...  
45461    439050
45462    111109
45463     67758
45464    227506
45465    461257
Name: id, Length: 45463, dtype: int32

In [5]:
linkdf = linkdf[linkdf['tmdbId'].notnull()]['tmdbId'].astype('int')
linkdf

0          862
1         8844
2        15602
3        31357
4        11862
         ...  
9120    402672
9121    315011
9122    391698
9123    137608
9124    410803
Name: tmdbId, Length: 9112, dtype: int32

In [6]:
moviedf = moviedf[moviedf['id'].isin(linkdf)]

In [7]:
print(moviedf.shape)
print(moviedf.columns.tolist())
moviedf.head(3)

(9099, 24)
['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id', 'imdb_id', 'original_language', 'original_title', 'overview', 'popularity', 'poster_path', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'video', 'vote_average', 'vote_count']


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [8]:
ratingdf = ratingdf[ratingdf['rating'] >= 4]

In [9]:
print(ratingdf.shape)
ratingdf.head(10)

(51568, 4)


Unnamed: 0,userId,movieId,rating,timestamp
4,1,1172,4.0,1260759205
12,1,1953,4.0,1260759191
13,1,2105,4.0,1260759139
20,2,10,4.0,835355493
21,2,17,5.0,835355681
22,2,39,5.0,835355604
23,2,47,4.0,835355552
24,2,50,4.0,835355586
27,2,110,4.0,835355532
29,2,150,5.0,835355395


In [10]:
keyworddf['id'] = keyworddf['id'].astype('int')
creditdf['id'] = creditdf['id'].astype('int')
moviedf['id'] = moviedf['id'].astype('int')

In [11]:
moviedf = moviedf.merge(creditdf, on='id')
moviedf = moviedf.merge(keyworddf, on='id')

In [12]:
moviedf.iloc[1]

adult                                                                False
belongs_to_collection                                                  NaN
budget                                                            65000000
genres                   [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...
homepage                                                               NaN
id                                                                    8844
imdb_id                                                          tt0113497
original_language                                                       en
original_title                                                     Jumanji
overview                 When siblings Judy and Peter discover an encha...
popularity                                                       17.015539
poster_path                               /vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg
production_companies     [{'name': 'TriStar Pictures', 'id': 559}, {'na...
production_countries     

In [13]:
#genres, id, imdb_id, original_title, overview, vote_average, vote_count, cast, crew, keywords

In [14]:
movies = moviedf[['genres', 'id', 'imdb_id', 'original_title', 'overview', 'vote_average', 'vote_count', 'cast', 'crew', 'keywords']]
print(movies.shape)
movies.head()

(9219, 10)


Unnamed: 0,genres,id,imdb_id,original_title,overview,vote_average,vote_count,cast,crew,keywords
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,tt0114709,Toy Story,"Led by Woody, Andy's toys live happily in his ...",7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,tt0113497,Jumanji,When siblings Judy and Peter discover an encha...,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,tt0113228,Grumpier Old Men,A family wedding reignites the ancient feud be...,6.5,92.0,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,tt0114885,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",6.1,34.0,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,"[{'id': 35, 'name': 'Comedy'}]",11862,tt0113041,Father of the Bride Part II,Just when George Banks has recovered from his ...,5.7,173.0,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


# Cleaning Cast, Crew, Keyword & Genre Column

In [15]:
from ast import literal_eval

In [16]:
movies['cast'] = movies['cast'].apply(literal_eval)
movies['crew'] = movies['crew'].apply(literal_eval)
movies['keywords'] = movies['keywords'].apply(literal_eval)
movies['genres'] = movies['genres'].apply(literal_eval)

# We are fetching only director from crew and after fetching the crew column is of no use.

In [17]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [18]:
movies['director'] = movies['crew'].apply(get_director)

In [19]:
movies['director'].head()

0      John Lasseter
1       Joe Johnston
2      Howard Deutch
3    Forest Whitaker
4      Charles Shyer
Name: director, dtype: object

In [20]:
movies['director'] = movies['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))

In [21]:
movies['director'] = movies['director'].apply(lambda x: [x])

In [22]:
movies['director'].head()

0      [johnlasseter]
1       [joejohnston]
2      [howarddeutch]
3    [forestwhitaker]
4      [charlesshyer]
Name: director, dtype: object

# We are fetching the first three cast from the movie because they are famous

In [23]:
movies['cast'] = movies['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
movies['cast'] = movies['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

In [24]:
movies['cast'].head()

0                  [Tom Hanks, Tim Allen, Don Rickles]
1       [Robin Williams, Jonathan Hyde, Kirsten Dunst]
2           [Walter Matthau, Jack Lemmon, Ann-Margret]
3    [Whitney Houston, Angela Bassett, Loretta Devine]
4           [Steve Martin, Diane Keaton, Martin Short]
Name: cast, dtype: object

In [25]:
movies['cast'] = movies['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [26]:
movies.cast.head()

0                  [tomhanks, timallen, donrickles]
1       [robinwilliams, jonathanhyde, kirstendunst]
2          [waltermatthau, jacklemmon, ann-margret]
3    [whitneyhouston, angelabassett, lorettadevine]
4           [stevemartin, dianekeaton, martinshort]
Name: cast, dtype: object

# We had the keywords with id so we are fetching only keywords from it.¶

In [27]:
movies['keywords'] = movies['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [28]:
movies['keywords'].head()

0    [jealousy, toy, boy, friendship, friends, riva...
1    [board game, disappearance, based on children'...
2    [fishing, best friend, duringcreditsstinger, o...
3    [based on novel, interracial relationship, sin...
4    [baby, midlife crisis, confidence, aging, daug...
Name: keywords, dtype: object

In [29]:
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

In [30]:
stemmer = SnowballStemmer('english')
movies['keywords'] = movies['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
movies['keywords'][0]

['jealousi',
 'toy',
 'boy',
 'friendship',
 'friend',
 'rivalri',
 'boy next door',
 'new toy',
 'toy comes to lif']

In [31]:
movies['keywords'] = movies['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
movies['keywords'][0]

['jealousi',
 'toy',
 'boy',
 'friendship',
 'friend',
 'rivalri',
 'boynextdoor',
 'newtoy',
 'toycomestolif']

# We had a genres with id so we are fetching only genres from there.

In [32]:
movies['genres'][0]

[{'id': 16, 'name': 'Animation'},
 {'id': 35, 'name': 'Comedy'},
 {'id': 10751, 'name': 'Family'}]

In [33]:
movies.columns

Index(['genres', 'id', 'imdb_id', 'original_title', 'overview', 'vote_average',
       'vote_count', 'cast', 'crew', 'keywords', 'director'],
      dtype='object')

In [34]:
movies['genres'] = movies['genres'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [35]:
movies['genres'].head()

0     [Animation, Comedy, Family]
1    [Adventure, Fantasy, Family]
2               [Romance, Comedy]
3        [Comedy, Drama, Romance]
4                        [Comedy]
Name: genres, dtype: object

# Cleaning Overview

In [36]:
result = movies.dtypes
print(result)

genres             object
id                  int32
imdb_id            object
original_title     object
overview           object
vote_average      float64
vote_count        float64
cast               object
crew               object
keywords           object
director           object
dtype: object


In [37]:
movies['overview']

0       Led by Woody, Andy's toys live happily in his ...
1       When siblings Judy and Peter discover an encha...
2       A family wedding reignites the ancient feud be...
3       Cheated on, mistreated and stepped on, the wom...
4       Just when George Banks has recovered from his ...
                              ...                        
9214    A man must cope with the loss of his wife and ...
9215    Rustom Pavri, an honourable officer of the Ind...
9216    Village lad Sarman is drawn to big, bad Mohenj...
9217    From the mind behind Evangelion comes a hit la...
9218    The band stormed Europe in 1963, and, in 1964,...
Name: overview, Length: 9219, dtype: object

In [38]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

import warnings; warnings.simplefilter('ignore')

In [39]:
movies['overview'] = movies['overview'].fillna('')

In [40]:
import re

def clean_description(text):
    text = re.sub("\'", "", text)
    text = re.sub("[^a-zA-Z]"," ",text)
    text = ' '.join(text.split())
    text = text.lower()
    return text

movies['clean_overview'] = movies['overview'].apply(lambda x: clean_description(x))
movies['clean_overview'][0]

'led by woody andys toys live happily in his room until andys birthday brings buzz lightyear onto the scene afraid of losing his place in andys heart woody plots against buzz but when circumstances separate buzz and woody from their owner the duo eventually learns to put aside their differences'

In [41]:
movies['clean_overview'] = movies['clean_overview'].apply(lambda x: [x])

In [42]:
movies['clean_overview']

0       [led by woody andys toys live happily in his r...
1       [when siblings judy and peter discover an ench...
2       [a family wedding reignites the ancient feud b...
3       [cheated on mistreated and stepped on the wome...
4       [just when george banks has recovered from his...
                              ...                        
9214    [a man must cope with the loss of his wife and...
9215    [rustom pavri an honourable officer of the ind...
9216    [village lad sarman is drawn to big bad mohenj...
9217    [from the mind behind evangelion comes a hit l...
9218    [the band stormed europe in and in they conque...
Name: clean_overview, Length: 9219, dtype: object

# Combining

In [43]:
movies['description'] = movies['genres']+movies['clean_overview']+movies['cast']+movies['crew']+movies['keywords']
movies['description'] = movies['description'].fillna('')

In [44]:
movies['description']

0       [Animation, Comedy, Family, led by woody andys...
1       [Adventure, Fantasy, Family, when siblings jud...
2       [Romance, Comedy, a family wedding reignites t...
3       [Comedy, Drama, Romance, cheated on mistreated...
4       [Comedy, just when george banks has recovered ...
                              ...                        
9214    [Drama, a man must cope with the loss of his w...
9215    [Thriller, Romance, rustom pavri an honourable...
9216    [Adventure, Drama, History, Romance, village l...
9217    [Action, Adventure, Drama, Horror, Science Fic...
9218    [Documentary, Music, the band stormed europe i...
Name: description, Length: 9219, dtype: object

In [45]:
movies['description'] = [','.join(map(str, l)) for l in movies['description']]
movies['description']

0       Animation,Comedy,Family,led by woody andys toy...
1       Adventure,Fantasy,Family,when siblings judy an...
2       Romance,Comedy,a family wedding reignites the ...
3       Comedy,Drama,Romance,cheated on mistreated and...
4       Comedy,just when george banks has recovered fr...
                              ...                        
9214    Drama,a man must cope with the loss of his wif...
9215    Thriller,Romance,rustom pavri an honourable of...
9216    Adventure,Drama,History,Romance,village lad sa...
9217    Action,Adventure,Drama,Horror,Science Fiction,...
9218    Documentary,Music,the band stormed europe in a...
Name: description, Length: 9219, dtype: object

# Generate Function

In [46]:
def generate_int_id(dataframe, id_col_name):
    new_dataframe=dataframe.assign(
        int_id_col_name=np.arange(len(dataframe))
        ).reset_index(drop=True)
    return new_dataframe.rename(columns={'int_id_col_name': id_col_name})



def create_features(dataframe, features_name, id_col_name):
    features = dataframe[features_name].apply(
        lambda x: ','.join(x.map(str)), axis=1)
    features = features.str.split(',')
    features = list(zip(dataframe[id_col_name], features))
    return features



def generate_feature_list(dataframe, features_name):
    features = dataframe[features_name].apply(
        lambda x: ','.join(x.map(str)), axis=1)
    features = features.str.split(',')
    features = features.apply(pd.Series).stack().reset_index(drop=True)
    return features


def calculate_auc_score(lightfm_model, interactions_matrix, question_features, professional_features): 
    score = auc_score( 
        lightfm_model, interactions_matrix, 
        item_features=question_features, 
        user_features=professional_features, 
        num_threads=4).mean()
    return score

In [47]:
movies.head()

Unnamed: 0,genres,id,imdb_id,original_title,overview,vote_average,vote_count,cast,crew,keywords,director,clean_overview,description
0,"[Animation, Comedy, Family]",862,tt0114709,Toy Story,"Led by Woody, Andy's toys live happily in his ...",7.7,5415.0,"[tomhanks, timallen, donrickles]","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[jealousi, toy, boy, friendship, friend, rival...",[johnlasseter],[led by woody andys toys live happily in his r...,"Animation,Comedy,Family,led by woody andys toy..."
1,"[Adventure, Fantasy, Family]",8844,tt0113497,Jumanji,When siblings Judy and Peter discover an encha...,6.9,2413.0,"[robinwilliams, jonathanhyde, kirstendunst]","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[boardgam, disappear, basedonchildren'sbook, n...",[joejohnston],[when siblings judy and peter discover an ench...,"Adventure,Fantasy,Family,when siblings judy an..."
2,"[Romance, Comedy]",15602,tt0113228,Grumpier Old Men,A family wedding reignites the ancient feud be...,6.5,92.0,"[waltermatthau, jacklemmon, ann-margret]","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[fish, bestfriend, duringcreditssting, oldmen]",[howarddeutch],[a family wedding reignites the ancient feud b...,"Romance,Comedy,a family wedding reignites the ..."
3,"[Comedy, Drama, Romance]",31357,tt0114885,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",6.1,34.0,"[whitneyhouston, angelabassett, lorettadevine]","[{'credit_id': '52fe44779251416c91011acb', 'de...","[basedonnovel, interracialrelationship, single...",[forestwhitaker],[cheated on mistreated and stepped on the wome...,"Comedy,Drama,Romance,cheated on mistreated and..."
4,[Comedy],11862,tt0113041,Father of the Bride Part II,Just when George Banks has recovered from his ...,5.7,173.0,"[stevemartin, dianekeaton, martinshort]","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[babi, midlifecrisi, confid, age, daughter, mo...",[charlesshyer],[just when george banks has recovered from his...,"Comedy,just when george banks has recovered fr..."


In [48]:
movies = movies[['id', 'original_title', 'vote_average', 'vote_count', 'description']]
movies.head()

Unnamed: 0,id,original_title,vote_average,vote_count,description
0,862,Toy Story,7.7,5415.0,"Animation,Comedy,Family,led by woody andys toy..."
1,8844,Jumanji,6.9,2413.0,"Adventure,Fantasy,Family,when siblings judy an..."
2,15602,Grumpier Old Men,6.5,92.0,"Romance,Comedy,a family wedding reignites the ..."
3,31357,Waiting to Exhale,6.1,34.0,"Comedy,Drama,Romance,cheated on mistreated and..."
4,11862,Father of the Bride Part II,5.7,173.0,"Comedy,just when george banks has recovered fr..."


In [49]:
final_movies = generate_int_id(movies, 'movies_id_num')
final_movies.head()

Unnamed: 0,id,original_title,vote_average,vote_count,description,movies_id_num
0,862,Toy Story,7.7,5415.0,"Animation,Comedy,Family,led by woody andys toy...",0
1,8844,Jumanji,6.9,2413.0,"Adventure,Fantasy,Family,when siblings judy an...",1
2,15602,Grumpier Old Men,6.5,92.0,"Romance,Comedy,a family wedding reignites the ...",2
3,31357,Waiting to Exhale,6.1,34.0,"Comedy,Drama,Romance,cheated on mistreated and...",3
4,11862,Father of the Bride Part II,5.7,173.0,"Comedy,just when george banks has recovered fr...",4


In [50]:
final_movies = final_movies.rename(columns={'description': 'item_tag_name'})
final_movies.head()

Unnamed: 0,id,original_title,vote_average,vote_count,item_tag_name,movies_id_num
0,862,Toy Story,7.7,5415.0,"Animation,Comedy,Family,led by woody andys toy...",0
1,8844,Jumanji,6.9,2413.0,"Adventure,Fantasy,Family,when siblings judy an...",1
2,15602,Grumpier Old Men,6.5,92.0,"Romance,Comedy,a family wedding reignites the ...",2
3,31357,Waiting to Exhale,6.1,34.0,"Comedy,Drama,Romance,cheated on mistreated and...",3
4,11862,Father of the Bride Part II,5.7,173.0,"Comedy,just when george banks has recovered fr...",4


In [51]:
ratingdf.head()

Unnamed: 0,userId,movieId,rating,timestamp
4,1,1172,4.0,1260759205
12,1,1953,4.0,1260759191
13,1,2105,4.0,1260759139
20,2,10,4.0,835355493
21,2,17,5.0,835355681


In [52]:
df_merge = ratingdf.merge(final_movies, how='inner', left_on='movieId', right_on='id')
print(df_merge.shape)
df_merge

(17154, 10)


Unnamed: 0,userId,movieId,rating,timestamp,id,original_title,vote_average,vote_count,item_tag_name,movies_id_num
0,1,2105,4.0,1260759139,2105,American Pie,6.4,2358.0,"Comedy,Romance,at a high school party four fri...",2181
1,4,2105,4.0,949896114,2105,American Pie,6.4,2358.0,"Comedy,Romance,at a high school party four fri...",2181
2,15,2105,4.0,1052896867,2105,American Pie,6.4,2358.0,"Comedy,Romance,at a high school party four fri...",2181
3,34,2105,4.0,973747765,2105,American Pie,6.4,2358.0,"Comedy,Romance,at a high school party four fri...",2181
4,41,2105,5.0,1093886731,2105,American Pie,6.4,2358.0,"Comedy,Romance,at a high school party four fri...",2181
...,...,...,...,...,...,...,...,...,...,...
17149,652,6341,4.0,1439586128,6341,Blue Thunder,5.9,117.0,"Science Fiction,Action,Thriller,Crime,Drama,of...",5862
17150,652,8699,5.0,1439587538,8699,Anchorman: The Legend of Ron Burgundy,6.7,1523.0,"Comedy,its the s and san diego super sexist an...",5541
17151,652,103731,5.0,1439586954,103731,Mud,7.0,1080.0,"Drama,two teenage boys encounter a fugitive an...",8394
17152,659,167,4.0,836137550,167,K-PAX,7.1,710.0,"Drama,Science Fiction,prot is a patient at a m...",3826


In [53]:
df_merge = df_merge.iloc[0:6000]

In [54]:
user_tag = df_merge.groupby(['userId'])['item_tag_name'].apply(','.join).reset_index()
user_tag = user_tag.rename(columns={'item_tag_name': 'user_tag_name'})
user_tag

Unnamed: 0,userId,user_tag_name
0,1,"Comedy,Romance,at a high school party four fri..."
1,2,"Drama,Mystery,Romance,red this is the third fi..."
2,3,"Drama,Mystery,Romance,red this is the third fi..."
3,4,"Comedy,Romance,at a high school party four fri..."
4,5,"Thriller,Action,Comedy,Crime,Drama,a hard nose..."
...,...,...
627,667,"Drama,Mystery,Romance,red this is the third fi..."
628,668,"Action,Thriller,Science Fiction,its been years..."
629,669,"Comedy,newly engaged ben and sadie cant wait t..."
630,670,"Drama,a drama about a maori family lving in au..."


In [55]:
user_tag = generate_int_id(user_tag, 'order_id_num')
user_tag

Unnamed: 0,userId,user_tag_name,order_id_num
0,1,"Comedy,Romance,at a high school party four fri...",0
1,2,"Drama,Mystery,Romance,red this is the third fi...",1
2,3,"Drama,Mystery,Romance,red this is the third fi...",2
3,4,"Comedy,Romance,at a high school party four fri...",3
4,5,"Thriller,Action,Comedy,Crime,Drama,a hard nose...",4
...,...,...,...
627,667,"Drama,Mystery,Romance,red this is the third fi...",627
628,668,"Action,Thriller,Science Fiction,its been years...",628
629,669,"Comedy,newly engaged ben and sadie cant wait t...",629
630,670,"Drama,a drama about a maori family lving in au...",630


In [56]:
final_merge = df_merge.merge(user_tag, how='inner', left_on='userId', right_on='userId')
print(final_merge.shape)
final_merge.head()

(6000, 12)


Unnamed: 0,userId,movieId,rating,timestamp,id,original_title,vote_average,vote_count,item_tag_name,movies_id_num,user_tag_name,order_id_num
0,1,2105,4.0,1260759139,2105,American Pie,6.4,2358.0,"Comedy,Romance,at a high school party four fri...",2181,"Comedy,Romance,at a high school party four fri...",0
1,4,2105,4.0,949896114,2105,American Pie,6.4,2358.0,"Comedy,Romance,at a high school party four fri...",2181,"Comedy,Romance,at a high school party four fri...",3
2,4,153,4.0,949811346,153,Lost in Translation,7.3,1943.0,"Drama,two lost souls visiting tokyo the young ...",4815,"Comedy,Romance,at a high school party four fri...",3
3,4,296,5.0,949895708,296,Terminator 3: Rise of the Machines,5.9,2177.0,"Action,Thriller,Science Fiction,its been years...",4739,"Comedy,Romance,at a high school party four fri...",3
4,4,480,5.0,949810582,480,Monsoon Wedding,6.8,59.0,"Comedy,Drama,Romance,from an exciting indian w...",3994,"Comedy,Romance,at a high school party four fri...",3


In [57]:
user_feature_list = generate_feature_list(user_tag,['user_tag_name'])

item_feature_list = generate_feature_list(final_movies,['item_tag_name'])

In [58]:
# creating features for feeding into lightfm 
user_tag['user_features'] = create_features(user_tag, ['user_tag_name'], 'order_id_num')
user_tag['user_features']

0      (0, [Comedy, Romance, at a high school party f...
1      (1, [Drama, Mystery, Romance, red this is the ...
2      (2, [Drama, Mystery, Romance, red this is the ...
3      (3, [Comedy, Romance, at a high school party f...
4      (4, [Thriller, Action, Comedy, Crime, Drama, a...
                             ...                        
627    (627, [Drama, Mystery, Romance, red this is th...
628    (628, [Action, Thriller, Science Fiction, its ...
629    (629, [Comedy, newly engaged ben and sadie can...
630    (630, [Drama, a drama about a maori family lvi...
631    (631, [Action, Thriller, Science Fiction, its ...
Name: user_features, Length: 632, dtype: object

In [59]:
final_movies['item_features'] = create_features(final_movies,['item_tag_name'],'movies_id_num')
final_movies['item_features']

0       (0, [Animation, Comedy, Family, led by woody a...
1       (1, [Adventure, Fantasy, Family, when siblings...
2       (2, [Romance, Comedy, a family wedding reignit...
3       (3, [Comedy, Drama, Romance, cheated on mistre...
4       (4, [Comedy, just when george banks has recove...
                              ...                        
9214    (9214, [Drama, a man must cope with the loss o...
9215    (9215, [Thriller, Romance, rustom pavri an hon...
9216    (9216, [Adventure, Drama, History, Romance, vi...
9217    (9217, [Action, Adventure, Drama, Horror, Scie...
9218    (9218, [Documentary, Music, the band stormed e...
Name: item_features, Length: 9219, dtype: object

In [60]:
dataset = Dataset()
dataset.fit(
    set(user_tag['order_id_num']),
    set(final_movies['movies_id_num']), 
    item_features=item_feature_list, 
    user_features=user_feature_list)

In [61]:
final_merge['user_item_id_tuple'] = list(zip(final_merge.order_id_num, final_merge.movies_id_num))
final_merge['user_item_id_tuple']

0         (0, 2181)
1         (3, 2181)
2         (3, 4815)
3         (3, 4739)
4         (3, 3994)
           ...     
5995    (209, 3171)
5996    (474, 2634)
5997    (485, 1347)
5998    (485, 5868)
5999    (496, 1347)
Name: user_item_id_tuple, Length: 6000, dtype: object

In [62]:
interactions, weights = dataset.build_interactions(final_merge['user_item_id_tuple'])

In [63]:
item_features = dataset.build_item_features(final_movies['item_features'])
print(item_features)

  (0, 0)	0.0013175231
  (0, 9219)	0.0013175231
  (0, 9220)	0.0013175231
  (0, 9221)	0.0013175231
  (0, 9222)	0.0013175231
  (0, 9223)	0.0013175231
  (0, 9224)	0.0013175231
  (0, 9225)	0.0013175231
  (0, 9226)	0.0013175231
  (0, 9227)	0.0039525693
  (0, 9228)	0.038208168
  (0, 9229)	0.0026350461
  (0, 9230)	0.0013175231
  (0, 9231)	0.0026350461
  (0, 9232)	0.0026350461
  (0, 9233)	0.0013175231
  (0, 9234)	0.010540185
  (0, 9235)	0.0013175231
  (0, 9236)	0.0052700923
  (0, 9237)	0.0013175231
  (0, 9238)	0.0013175231
  (0, 9239)	0.0013175231
  (0, 9240)	0.0039525693
  (0, 9241)	0.0039525693
  (0, 9242)	0.0039525693
  :	:
  (9218, 17492)	0.01724138
  (9218, 18309)	0.03448276
  (9218, 18310)	0.03448276
  (9218, 18311)	0.03448276
  (9218, 92296)	0.01724138
  (9218, 96126)	0.01724138
  (9218, 96127)	0.01724138
  (9218, 107884)	0.01724138
  (9218, 107885)	0.01724138
  (9218, 170954)	0.01724138
  (9218, 293058)	0.01724138
  (9218, 293059)	0.01724138
  (9218, 293060)	0.01724138
  (9218, 373067)	

In [64]:
user_features = dataset.build_user_features(user_tag['user_features'])
print(user_features)

  (0, 0)	0.0016051364
  (0, 632)	0.0016051364
  (0, 633)	0.0016051364
  (0, 634)	0.0016051364
  (0, 635)	0.0016051364
  (0, 636)	0.0016051364
  (0, 637)	0.0016051364
  (0, 638)	0.0016051364
  (0, 639)	0.024077047
  (0, 640)	0.033707865
  (0, 641)	0.0016051364
  (0, 642)	0.0048154094
  (0, 643)	0.0016051364
  (0, 644)	0.12841092
  (0, 645)	0.0016051364
  (0, 646)	0.024077047
  (0, 647)	0.09309791
  (0, 648)	0.0016051364
  (0, 649)	0.0016051364
  (0, 650)	0.0016051364
  (0, 651)	0.0016051364
  (0, 652)	0.0064205457
  (0, 653)	0.011235955
  (0, 654)	0.0016051364
  (0, 655)	0.0016051364
  :	:
  (631, 11885)	0.0002737476
  (631, 11886)	0.0002737476
  (631, 11887)	0.0002737476
  (631, 11888)	0.0002737476
  (631, 11889)	0.0002737476
  (631, 11890)	0.0002737476
  (631, 11891)	0.0002737476
  (631, 11892)	0.0002737476
  (631, 11893)	0.0002737476
  (631, 11894)	0.0002737476
  (631, 11895)	0.0002737476
  (631, 11896)	0.0002737476
  (631, 11897)	0.0002737476
  (631, 11898)	0.0002737476
  (631, 1189

In [65]:
model = LightFM(
    no_components=150,
    learning_rate=0.05,
    loss='warp',
    random_state=2019)

model.fit(
    interactions,
    item_features=item_features,
    user_features=user_features, sample_weight=weights,
    epochs=5, num_threads=4, verbose=True)

Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 5/5 [04:02<00:00, 48.41s/it]


<lightfm.lightfm.LightFM at 0x2936c44df70>

In [66]:
calculate_auc_score(model, interactions, item_features, user_features)

0.9973863

In [96]:
from IPython.display import display_html
def display_side_by_side(*args):
    html_str=''
    for df in args:
        html_str+=df.to_html()
    display_html(html_str.replace('table','table style="display:inline"'),raw=True)

def recommend_item(user_ids):
     
    for user in user_ids:
        # print their previous answered question title
        previous_item_num = final_merge.loc[final_merge['order_id_num'] == user][:3]['movies_id_num']
        df_previous_items = final_movies.loc[final_movies['movies_id_num'].isin(previous_item_num)]
        print('User Id (' + str(user) + "): Previous Item Purchased")
        display(df_previous_items[['movies_id_num', 'original_title']])
        
        # predict
        discard_qu_id = df_previous_items['movies_id_num'].values.tolist()
        df_use_for_prediction = final_movies.loc[~final_movies['movies_id_num'].isin(discard_qu_id)]
        items_id_for_predict = df_use_for_prediction['movies_id_num'].values.tolist()
        
        scores = model.predict(
            user,
            items_id_for_predict,
            item_features=item_features,
            user_features=user_features)
        
        df_use_for_prediction['scores'] = scores
        df_use_for_prediction = df_use_for_prediction.sort_values(by='scores', ascending=False)[:8]
        print()
        print('User Id (' + str(user) + "): Recommended Item: ")
        display(df_use_for_prediction[['movies_id_num', 'original_title']])

In [97]:
recommend_item([1])

User Id (1): Previous Item Purchased


Unnamed: 0,movies_id_num,original_title
274,274,Trois couleurs : Rouge
3938,3938,48 Hrs.
4815,4815,Lost in Translation



User Id (1): Recommended Item: 


Unnamed: 0,movies_id_num,original_title
260,260,Once Were Warriors
3323,3323,The Million Dollar Hotel
3468,3468,Point Break
794,794,The 39 Steps
3994,3994,Monsoon Wedding
4460,4460,The Hours
6797,6797,License to Wed
4278,4278,Mosura tai Gojira
