In [2]:
from IPython.display import display
import matplotlib.pyplot as plt
import pandas as pd

data_folder = "./ml-20m"

ratings_df = pd.read_csv(data_folder + "/ratings.csv")
movies_df = pd.read_csv(data_folder + "/movies.csv")

ratings_df = ratings_df.rename(columns={'userId': 'user', 'movieId': 'item'})
movies_df = movies_df.rename(columns={'movieId': 'item', 'title': 'title_original'})

In [3]:
def compute_title_year(row):
    # read the title of the specified row, and cast it to the str type,
    # then use the strip() method to eliminate the spaces from the beginning and the end of the string
    title_original = str(row['title_original']).strip()

    # select the substring from the fifth-to-last (included) character to the last (excluded)
    year = title_original[-5:-1]

    # use the isdigit method to determine if the substring selected is a number
    if year.isdigit():
        # remove the year from the title
        # select the substring from the beninning to the sixth-to-last (excluded)
        # then use the strip() method to remove spaces at the beginning and at the end of the string
        # finally, use the lower() method to obtain the title in lower case
        title = title_original[:-6].strip().lower()

        # return the year and the title
        return int(year), title
    else:
        # nothing to do, return 0 as year and the title
        return 0, title_original.lower()


movies_df[['year', 'title']] = movies_df.apply(compute_title_year, axis=1, result_type='expand')
movies_df = movies_df[['item', 'title', 'year', 'genres']]
display(movies_df)

Unnamed: 0,item,title,year,genres
0,1,toy story,1995,Adventure|Animation|Children|Comedy|Fantasy
1,2,jumanji,1995,Adventure|Children|Fantasy
2,3,grumpier old men,1995,Comedy|Romance
3,4,waiting to exhale,1995,Comedy|Drama|Romance
4,5,father of the bride part ii,1995,Comedy
...,...,...,...,...
27273,131254,kein bund für's leben,2007,Comedy
27274,131256,"feuer, eis & dosenbier",2002,Comedy
27275,131258,the pirates,2014,Adventure
27276,131260,rentun ruusu,2001,(no genres listed)


In [4]:
movies_df = movies_df.drop_duplicates(subset='title', keep='first')

# Join the ratings to the genres
ratings_genres_df = pd.merge(movies_df, ratings_df,  how='inner', left_on=['item'], right_on = ['item'])
ratings_genres_df = ratings_genres_df[['item', 'user', 'rating', 'title', 'genres']]

# keep only the user and genres
user_genres = ratings_genres_df.loc[:, ['user', 'genres']]

# Create a user only dataframe
user_df = user_genres.drop_duplicates(subset=['user'], keep='first')
user_df = user_df['user']
user_df = pd.DataFrame(user_df)
user_df = user_df.set_index('user')
user_df = user_df.sort_index(ascending=True)

user_genres = user_genres.set_index('user')
user_genres = user_genres.sort_values(by="user", ascending=True)

display(user_genres)
display(user_df)

Unnamed: 0_level_0,genres
user,Unnamed: 1_level_1
1,Adventure|Animation|Fantasy|Sci-Fi
1,Action|Adventure
1,Action|Western
1,Drama|Western
1,Adventure|Comedy|Fantasy
...,...
138493,Comedy
138493,Adventure|Animation|Children|Comedy
138493,Adventure|Animation|Children|Comedy
138493,Animation|Children|Drama|Musical|Romance


1
2
3
4
5
...
138489
138490
138491
138492
138493


In [5]:
import numpy as np

def count_rated_genres(user_id, df):

    user_genre = df.loc[user_id,:]
    rated_genres = set()

    for genre in user_genre.genres:
        rated_genres.update(genre.split('|'))

    return len(rated_genres)

user_genres_rated = []
for user in user_df.index:
     user_genres_rated.append(count_rated_genres(user, user_genres))


In [6]:
user_genres_rated_df = pd.DataFrame({
    'user' : user_df.index.tolist(),
    'genres_rated' : user_genres_rated
})

display(user_genres_rated_df)

mean_genres_rated = np.mean(user_genres_rated)
print(mean_genres_rated)
user_genres_rated_df = user_genres_rated_df[user_genres_rated_df['genres_rated'] > mean_genres_rated]

display(user_genres_rated_df)

Unnamed: 0,user,genres_rated
0,1,17
1,2,18
2,3,18
3,4,15
4,5,17
...,...,...
138488,138489,17
138489,138490,18
138490,138491,14
138491,138492,17


16.360133725170225


Unnamed: 0,user,genres_rated
0,1,17
1,2,18
2,3,18
4,5,17
6,7,18
...,...,...
138486,138487,17
138488,138489,17
138489,138490,18
138491,138492,17


In [7]:
count_ratings = ratings_df.groupby(['user']).count()
selected = count_ratings['rating'] > count_ratings['rating'].mean()

expert_users = count_ratings.loc[selected]
expert_users = pd.DataFrame(expert_users)
expert_users = expert_users.reset_index()
expert_users = expert_users.merge(user_genres_rated_df, how='inner', left_on='user', right_on='user')
expert_users = expert_users.drop(['timestamp', 'item'], axis=1)
expert_users.rename(columns={'rating' : 'num_ratings'}, inplace=True)

display(expert_users)

Unnamed: 0,user,num_ratings,genres_rated
0,1,175,17
1,3,187,18
2,7,276,18
3,11,504,19
4,14,243,17
...,...,...,...
35870,138483,276,19
35871,138484,148,18
35872,138486,193,18
35873,138490,151,18


In [9]:

genre_set = set()
for genres in ratings_genres_df.genres:
    genre_set.update(genres.split('|'))


In [10]:
genre_df = ratings_genres_df[['item', 'genres']]
genre_df = pd.DataFrame(genre_df)

i = 2
for genre in genre_set:
    genre_df.insert(i, genre, False)
    i += 1

genre_df = genre_df.drop_duplicates(subset='item', keep='first')
genre_df = genre_df.set_index('item')

display(genre_df)

Unnamed: 0_level_0,genres,Fantasy,Romance,Sci-Fi,Adventure,Drama,Crime,Horror,Comedy,War,...,Western,Children,Musical,Action,Thriller,Documentary,IMAX,Mystery,Film-Noir,Animation
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Adventure|Animation|Children|Comedy|Fantasy,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Adventure|Children|Fantasy,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Comedy|Romance,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,Comedy|Drama|Romance,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,Comedy,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131252,Comedy|Horror,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
131254,Comedy,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
131256,Comedy,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
131258,Adventure,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [11]:

for item in genre_df.index:
    genres = genre_df.loc[item, 'genres']
    genre_list = genres.split('|')
    for genre in genre_list:
        genre_df.at[item, genre] = True

genre_df = genre_df.drop(['genres'], axis=1)

display(genre_df)

Unnamed: 0_level_0,Fantasy,Romance,Sci-Fi,Adventure,Drama,Crime,Horror,Comedy,War,(no genres listed),Western,Children,Musical,Action,Thriller,Documentary,IMAX,Mystery,Film-Noir,Animation
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,True,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,True
2,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False
3,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False
4,False,True,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131252,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False
131254,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False
131256,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False
131258,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [13]:
tags_df = pd.DataFrame(pd.read_csv(data_folder + "/tags.csv"))
tags_df = tags_df.rename(columns={'movieId': 'item'})
tags_df = tags_df.drop(['timestamp', 'userId'], axis=1)
tags_df = tags_df.sort_values(by="item", ascending=True)
tags_df = tags_df.set_index('item')
# tags_df.index = np.arrange(len(tags_df.index))
display(tags_df)

Unnamed: 0_level_0,tag
item,Unnamed: 1_level_1
1,friendship
1,animation
1,animated
1,adventure
1,animation
...,...
131258,Korea
131258,bandits
131258,whale
131258,pirates


In [14]:
tags_set = set()
for tag in tags_df.tag:
    tag = repr(tag).strip().lower().replace('\'', '')
    tag = tag.replace('"', '')
    tags_set.add(tag)

print(tags_set)



In [32]:
random_user = expert_users.sample()
random_user = random_user['user']
random_user_ratings = pd.merge(random_user, ratings_genres_df, how='left', left_on='user', right_on='user')

favourite_movies = random_user_ratings.loc[random_user_ratings['rating'] == np.max(random_user_ratings['rating'])]

display(favourite_movies)

Unnamed: 0,user,item,rating,title,genres
3,44763,32,5.0,twelve monkeys (a.k.a. 12 monkeys),Mystery|Sci-Fi|Thriller
5,44763,50,5.0,"usual suspects, the",Crime|Mystery|Thriller
11,44763,206,5.0,unzipped,Documentary
14,44763,260,5.0,star wars: episode iv - a new hope,Action|Adventure|Sci-Fi
15,44763,265,5.0,like water for chocolate (como agua para choco...,Drama|Fantasy|Romance
18,44763,296,5.0,pulp fiction,Comedy|Crime|Drama|Thriller
20,44763,306,5.0,three colors: red (trois couleurs: rouge),Drama
22,44763,318,5.0,"shawshank redemption, the",Crime|Drama
30,44763,446,5.0,farewell my concubine (ba wang bie ji),Drama|Romance
32,44763,508,5.0,philadelphia,Drama


In [42]:
genre_set_list = list(genre_set)
genre_count = np.zeros(len(genre_set_list))

for genre in favourite_movies.genres:
    genre_list = genre.split('|')
    for i in range(len(genre_list)):
        for j in range(len(genre_set_list)):
            if genre_list[i] == genre_set_list[j]:
                genre_count[j] += 1


def find_favourite_genres(num_genres=3):
    favourite_genres = []
    while len(favourite_genres) < num_genres:
        if np.max(genre_count) == 0:
            break
        fav_genre = genre_set_list[np.argmax(genre_count)]
        genre_count[np.argmax(genre_count)] = 0
        if fav_genre == "no genres listed)":
            continue
        favourite_genres.append(fav_genre)
        

    return favourite_genres

print()

['Drama', 'Comedy', 'Adventure']


Unnamed: 0,user,item,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580
...,...,...,...,...
20000258,138493,68954,4.5,1258126920
20000259,138493,69526,4.5,1259865108
20000260,138493,69644,3.0,1260209457
20000261,138493,70286,5.0,1258126944
