In [31]:
import numpy as np
import pandas as pd
import re
from pandas.testing import assert_frame_equal

In [32]:
#movies = pd.read_csv("ml-1m/movies.dat", sep = '::', header = None, engine = 'python', usecols = range(1,3))
#ratings = pd.read_csv("ml-1m/ratings.dat", sep = '::', header = None, engine = 'python')
#users = pd.read_csv("ml-1m/users.dat", sep = '::', header = None, engine = 'python')

In [33]:
movies = pd.read_csv("ml-latest/movies.csv", index_col = "movieId")
all_tags = pd.read_csv("ml-latest/all-tags.csv", usecols = range(1,3))
ratings = pd.read_csv("ml-latest/ratings.csv")
links = pd.read_csv("ml-latest/links.csv")

# Part I

### data preprocessing

In [34]:
#In the following part, we want to find tags for each movie, the TF_IDF model should be used when rating tags
#Next we need to find the top N tags
#First we need to preprocess the data

In [35]:
#First, we need to process the data of genres as it is in string form
movies["genres"] = movies["genres"].apply(lambda x: x.split("|"))

In [36]:
movies

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
2,Jumanji (1995),"[Adventure, Children, Fantasy]"
3,Grumpier Old Men (1995),"[Comedy, Romance]"
4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
5,Father of the Bride Part II (1995),[Comedy]
...,...,...
193581,Black Butler: Book of the Atlantic (2017),"[Action, Animation, Comedy, Fantasy]"
193583,No Game No Life: Zero (2017),"[Animation, Comedy, Fantasy]"
193585,Flint (2017),[Drama]
193587,Bungo Stray Dogs: Dead Apple (2018),"[Action, Animation]"


In [37]:
#to find all of the tags of the data
tags = all_tags.groupby("movieId").agg(list)

In [38]:
#Next we need to merge two dataframe
ret = movies.join(tags, on = "movieId")

In [39]:
ret

Unnamed: 0_level_0,title,genres,tag
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]","[animated, buddy movie, Cartoon, cgi, comedy, ..."
2,Jumanji (1995),"[Adventure, Children, Fantasy]","[fantasy, adapted from:book, animals, bad cgi,..."
3,Grumpier Old Men (1995),"[Comedy, Romance]","[moldy, old, Ann Margaret, Burgess Meredith, D..."
4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]","[characters, girl movie, characters, chick fli..."
5,Father of the Bride Part II (1995),[Comedy],"[steve martin, steve martin, pregnancy, remake..."
...,...,...,...
193581,Black Butler: Book of the Atlantic (2017),"[Action, Animation, Comedy, Fantasy]",
193583,No Game No Life: Zero (2017),"[Animation, Comedy, Fantasy]",
193585,Flint (2017),[Drama],
193587,Bungo Stray Dogs: Dead Apple (2018),"[Action, Animation]",


In [40]:
df = map(lambda x: (x[0], x[1], x[2], x[2] + x[3]) if x[3] is not np.nan else (x[0], x[1], x[2], x[2]), ret.itertuples())

In [41]:
dataframe = pd.DataFrame(df, columns = ['movieId', 'title', 'genres', 'tags'])

In [56]:
dataframe.set_index('movieId')

Unnamed: 0_level_0,title,genres,tags
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]","[Adventure, Animation, Children, Comedy, Fanta..."
2,Jumanji (1995),"[Adventure, Children, Fantasy]","[Adventure, Children, Fantasy, fantasy, adapte..."
3,Grumpier Old Men (1995),"[Comedy, Romance]","[Comedy, Romance, moldy, old, Ann Margaret, Bu..."
4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]","[Comedy, Drama, Romance, characters, girl movi..."
5,Father of the Bride Part II (1995),[Comedy],"[Comedy, steve martin, steve martin, pregnancy..."
...,...,...,...
193581,Black Butler: Book of the Atlantic (2017),"[Action, Animation, Comedy, Fantasy]","[Action, Animation, Comedy, Fantasy]"
193583,No Game No Life: Zero (2017),"[Animation, Comedy, Fantasy]","[Animation, Comedy, Fantasy]"
193585,Flint (2017),[Drama],[Drama]
193587,Bungo Stray Dogs: Dead Apple (2018),"[Action, Animation]","[Action, Animation]"


### TF-IDF model to find the top N tags

In [64]:
#from sklearn.feature_extraction.text import TfidfTransformer
#from sklearn.feature_extraction.text import CountVectorizer
#transformer = TfidfTransformer() #This is to find the term frequency matrix
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
import pprint

In [66]:
dataset = dataframe['tags'].values

In [67]:
#This part is to get rid of the nan from the list
cleaned_dataset = []
for i in dataset:
    tmp = []
    for j in i:
        if isinstance(j, float):
            continue
        tmp.append(j)
    cleaned_dataset.append(tmp)

In [68]:
dct = Dictionary(cleaned_dataset)

In [69]:
corpus = [dct.doc2bow(line) for line in cleaned_dataset]

In [70]:
model = TfidfModel(corpus)

In [74]:
model[corpus][2]

[(12, 0.051747063955290044),
 (15, 0.021957016542248128),
 (63, 0.054649787813602545),
 (84, 0.056864078583617375),
 (178, 0.21288710924763993),
 (179, 0.17135464886250154),
 (180, 0.1619560862600595),
 (181, 0.13564758341236932),
 (182, 0.17135464886250154),
 (183, 0.29723862736449447),
 (184, 0.08839729073297667),
 (185, 0.04138540761891935),
 (186, 0.18742159775384973),
 (187, 0.3105753999423067),
 (188, 0.11131301365230653),
 (189, 0.21288710924763993),
 (190, 0.21288710924763993),
 (191, 0.07853618600270869),
 (192, 0.29723862736449447),
 (193, 0.12085690405046519),
 (194, 0.19682016035629177),
 (195, 0.42577421849527985),
 (196, 0.2571378522072838),
 (197, 0.1573045982378806),
 (198, 0.21288710924763993),
 (199, 0.14143347340990145),
 (200, 0.15528769997115335)]

In [71]:
#TF-IDF
len(model[corpus])

9742

In [72]:
#In the next part, we need to calculate the top-N words
movie_profile = {}
for i, element in enumerate(dataframe.index):
    tfidf_vec = model[corpus[i]]
    #sort and select top 30 value
    movie_tag = sorted(tfidf_vec, key = lambda x: x[1], reverse = True)[:30]
    movie_profile[element] = dict(map(lambda x:(dct[x[0]], x[1]), movie_tag))

In [75]:
movie_profile[2]

{'moldy': 0.42577421849527985,
 'Walter Matthau': 0.3105753999423067,
 'Jack Lemmon': 0.29723862736449447,
 'fishing': 0.29723862736449447,
 'old': 0.2571378522072838,
 'Ann Margaret': 0.21288710924763993,
 'comedinha de velhinhos engraÃ§ada': 0.21288710924763993,
 'comedinha de velhinhos engraÃƒÂ§ada': 0.21288710924763993,
 'old people that is actually funny': 0.21288710924763993,
 'grun running': 0.19682016035629177,
 'Sophia Loren': 0.18742159775384973,
 'Burgess Meredith': 0.17135464886250154,
 'Howard Deutch': 0.17135464886250154,
 'Daryl Hannah': 0.1619560862600595,
 'old man': 0.1573045982378806,
 'sequel fever': 0.15528769997115335,
 'sequel': 0.14143347340990145,
 'Funniest Movies': 0.13564758341236932,
 'good soundtrack': 0.12085690405046519,
 'best friend': 0.11131301365230653,
 'NO_FA_GANES': 0.08839729073297667,
 'duringcreditsstinger': 0.07853618600270869,
 'funny': 0.056864078583617375,
 'comedy': 0.054649787813602545,
 'CLV': 0.051747063955290044,
 'Romance': 0.04138540