In [8]:
from IPython.display import display
import matplotlib.pyplot as plt
import pandas as pd

# Folder that holds the ratings.csv and movies.csv files read below.
data_folder = "./ml-20m"

# Load both tables and immediately switch to the short column names used by
# the rest of the notebook ('user' / 'item'); the raw movie title is kept under
# 'title_original' so a cleaned 'title' column can be derived from it later.
ratings_df = pd.read_csv(data_folder + "/ratings.csv").rename(
    columns={'userId': 'user', 'movieId': 'item'}
)
movies_df = pd.read_csv(data_folder + "/movies.csv").rename(
    columns={'movieId': 'item', 'title': 'title_original'}
)

In [9]:
def compute_title_year(row):
    """Split a raw movie title into its release year and a normalised title.

    The raw title is expected to end with the year in parentheses, e.g.
    ``"Toy Story (1995)"``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the key ``'title_original'``. The value is converted with
        ``str()`` and stripped of surrounding whitespace before parsing.

    Returns
    -------
    tuple of (int, str)
        ``(year, title)`` where ``title`` is lower-cased and has the trailing
        ``"(YYYY)"`` removed. When the title does not end with a four-digit
        year in parentheses, ``year`` is ``0`` and ``title`` is the whole
        stripped title in lower case.
    """
    title_original = str(row['title_original']).strip()

    # Candidate year: the four characters just before the last character.
    year = title_original[-5:-1]

    # Only accept the candidate when it is really a "(YYYY)" suffix.
    # Checking the digits alone would mis-parse titles that merely end in
    # digits (e.g. "Apollo 13000" -> year 1300). Digits are restricted to
    # ASCII because str.isdigit() also accepts characters such as superscript
    # digits, which int() cannot convert.
    has_year_suffix = (
        len(title_original) >= 6
        and title_original[-6] == '('
        and title_original[-1] == ')'
        and all('0' <= ch <= '9' for ch in year)
    )

    if has_year_suffix:
        # Drop the six-character "(YYYY)" suffix, then trim the space that
        # separated it from the title and normalise to lower case.
        title = title_original[:-6].strip().lower()
        return int(year), title

    # No recognisable year: 0 is the "unknown year" marker.
    return 0, title_original.lower()


# Run the title parser on every row; result_type='expand' turns each returned
# (year, title) tuple into two columns, which are stored as 'year' and 'title'.
year_and_title = movies_df.apply(compute_title_year, axis=1, result_type='expand')
movies_df[['year', 'title']] = year_and_title

# Keep only the columns used from here on, in a fixed order.
movies_df = movies_df.loc[:, ['item', 'title', 'year', 'genres']]
display(movies_df)

Unnamed: 0,item,title,year,genres
0,1,toy story,1995,Adventure|Animation|Children|Comedy|Fantasy
1,2,jumanji,1995,Adventure|Children|Fantasy
2,3,grumpier old men,1995,Comedy|Romance
3,4,waiting to exhale,1995,Comedy|Drama|Romance
4,5,father of the bride part ii,1995,Comedy
...,...,...,...,...
27273,131254,kein bund für's leben,2007,Comedy
27274,131256,"feuer, eis & dosenbier",2002,Comedy
27275,131258,the pirates,2014,Adventure
27276,131260,rentun ruusu,2001,(no genres listed)


In [10]:
# Collapse movies that share the same normalised title, keeping the first row.
movies_df = movies_df.drop_duplicates(subset='title', keep='first')

# Attach the movie metadata (title, genres) to every rating of a kept movie.
ratings_genres_df = movies_df.merge(ratings_df, how='inner', on='item')
ratings_genres_df = ratings_genres_df[['item', 'user', 'rating', 'title', 'genres']]

# One (user, genres) row per rating.
user_genres = ratings_genres_df[['user', 'genres']]

# Column-less frame whose sorted index lists every distinct user exactly once.
user_df = (
    user_genres[['user']]
    .drop_duplicates(keep='first')
    .set_index('user')
    .sort_index(ascending=True)
)

# Index the per-rating table by user and order it by user id.
user_genres = user_genres.set_index('user').sort_values(by="user", ascending=True)

display(user_genres)
display(user_df)

Unnamed: 0_level_0,genres
user,Unnamed: 1_level_1
1,Adventure|Animation|Fantasy|Sci-Fi
1,Action|Adventure
1,Action|Western
1,Drama|Western
1,Adventure|Comedy|Fantasy
...,...
138493,Comedy
138493,Adventure|Animation|Children|Comedy
138493,Adventure|Animation|Children|Comedy
138493,Animation|Children|Drama|Musical|Romance


1
2
3
4
5
...
138489
138490
138491
138492
138493


In [11]:
import numpy as np

def count_rated_genres(user_id, df):
    """Count the distinct genres that appear in one user's rows.

    Parameters
    ----------
    user_id : hashable
        Index label of the user to look up in ``df``.
    df : pandas.DataFrame
        Frame indexed by user, with a ``'genres'`` column holding
        ``'|'``-separated genre strings. The index may contain the same user
        many times (one row per rating).

    Returns
    -------
    int
        Number of distinct ``'|'``-separated tokens found across all of the
        user's ``'genres'`` values. Every token counts, so a placeholder such
        as ``"(no genres listed)"`` is counted like any other genre.

    Raises
    ------
    KeyError
        If ``user_id`` is not present in the index of ``df``.
    """
    user_genres_values = df.loc[user_id, 'genres']

    # A label that matches exactly one row yields a bare string rather than a
    # Series; iterating it would walk over characters instead of genre lists.
    if isinstance(user_genres_values, str):
        user_genres_values = [user_genres_values]

    rated_genres = set()
    for genre_list in user_genres_values:
        rated_genres.update(genre_list.split('|'))

    return len(rated_genres)

# Distinct-genre count for every user, in the same order as user_df.index.
user_genres_rated = [
    count_rated_genres(user, user_genres) for user in user_df.index
]


In [12]:
# Pair every user id with the number of distinct genres that user has rated.
user_genres_rated_df = pd.DataFrame(
    dict(user=user_df.index.tolist(), genres_rated=user_genres_rated)
)
display(user_genres_rated_df)

# Average number of distinct genres per user; used as the cut-off below.
mean_genres_rated = np.mean(user_genres_rated)
print(mean_genres_rated)

# Keep only the users whose genre count is strictly above that average.
user_genres_rated_df = user_genres_rated_df.loc[
    user_genres_rated_df['genres_rated'] > mean_genres_rated
]
display(user_genres_rated_df)

Unnamed: 0,user,genres_rated
0,1,17
1,2,18
2,3,18
3,4,15
4,5,17
...,...,...
138488,138489,17
138489,138490,18
138490,138491,14
138491,138492,17


16.360133725170225


Unnamed: 0,user,genres_rated
0,1,17
1,2,18
2,3,18
4,5,17
6,7,18
...,...,...
138486,138487,17
138488,138489,17
138489,138490,18
138491,138492,17


In [13]:
# Per-user count of non-null values in every remaining column of ratings_df.
count_ratings = ratings_df.groupby(['user']).count()

# Users whose number of ratings is strictly above the per-user average.
selected = count_ratings['rating'] > count_ratings['rating'].mean()

# Intersect those users with the ones kept in user_genres_rated_df, then leave
# only the user id, the rating count and the genre count.
expert_users = (
    count_ratings[selected]
    .reset_index()
    .merge(user_genres_rated_df, how='inner', on='user')
    .drop(columns=['timestamp', 'item'])
    .rename(columns={'rating': 'num_ratings'})
)

display(expert_users)

Unnamed: 0,user,num_ratings,genres_rated
0,1,175,17
1,3,187,18
2,7,276,18
3,11,504,19
4,14,243,17
...,...,...,...
35870,138483,276,19
35871,138484,148,18
35872,138486,193,18
35873,138490,151,18
