In [None]:
from IPython.display import display
import pandas as pd

data_folder = "./ml-20m"

ratings_df = pd.read_csv(data_folder + "/ratings.csv")
movies_df = pd.read_csv(data_folder + "/movies.csv")

ratings_df = ratings_df.rename(columns={'userId': 'user', 'movieId': 'item'})
movies_df = movies_df.rename(columns={'movieId': 'item', 'title': 'title_original'})

In [None]:
def compute_title_year(row):
    # read the title of the specified row, and cast it to the str type,
    # then use the strip() method to eliminate the spaces from the beginning and the end of the string
    title_original = str(row['title_original']).strip()

    # select the substring from the fifth-to-last (included) character to the last (excluded)
    year = title_original[-5:-1]

    # use the isdigit method to determine if the substring selected is a number
    if year.isdigit():
        # remove the year from the title
        # select the substring from the beninning to the sixth-to-last (excluded)
        # then use the strip() method to remove spaces at the beginning and at the end of the string
        # finally, use the lower() method to obtain the title in lower case
        title = title_original[:-6].strip().lower()

        # return the year and the title
        return int(year), title
    else:
        # nothing to do, return 0 as year and the title
        return 0, title_original.lower()


movies_df[['year', 'title']] = movies_df.apply(compute_title_year, axis=1, result_type='expand')
movies_df = movies_df[['item', 'title', 'year', 'genres']]
display(movies_df)

In [None]:
movies_df = movies_df.drop_duplicates(subset='title', keep='first')

# Join the ratings to the genres
ratings_genres_df = pd.merge(movies_df, ratings_df,  how='inner', left_on=['item'], right_on = ['item'])
ratings_genres_df = ratings_genres_df[['item', 'user', 'rating', 'title', 'genres']]

# keep only the user and genres
user_genres = ratings_genres_df.loc[:, ['user', 'genres']]

# Create a user only dataframe
user_df = user_genres.drop_duplicates(subset=['user'], keep='first')
user_df = user_df['user']
user_df = pd.DataFrame(user_df)
user_df = user_df.set_index('user')
user_df = user_df.sort_index(ascending=True)

user_genres = user_genres.set_index('user')
user_genres = user_genres.sort_values(by="user", ascending=True)

display(user_genres)
display(user_df)
display(ratings_df)

In [4]:
import numpy as np

def count_rated_genres(user_id, df):

    user_genre = df.loc[user_id,:]
    rated_genres = set()

    for genre in user_genre.genres:
        rated_genres.update(genre.split('|'))

    return len(rated_genres)

user_genres_rated = []
for user in user_df.index:
     user_genres_rated.append(count_rated_genres(user, user_genres))


In [5]:
user_genres_rated_df = pd.DataFrame({
    'user' : user_df.index.tolist(),
    'genres_rated' : user_genres_rated
})

display(user_genres_rated_df)

mean_genres_rated = np.mean(user_genres_rated)
print(mean_genres_rated)
user_genres_rated_df = user_genres_rated_df[user_genres_rated_df['genres_rated'] > mean_genres_rated]

display(user_genres_rated_df)

Unnamed: 0,user,genres_rated
0,1,17
1,2,18
2,3,18
3,4,15
4,5,17
...,...,...
138488,138489,17
138489,138490,18
138490,138491,14
138491,138492,17


16.360133725170225


Unnamed: 0,user,genres_rated
0,1,17
1,2,18
2,3,18
4,5,17
6,7,18
...,...,...
138486,138487,17
138488,138489,17
138489,138490,18
138491,138492,17


In [6]:
count_ratings = ratings_df.groupby(['user']).count()
selected = count_ratings['rating'] > count_ratings['rating'].mean()

expert_users = count_ratings.loc[selected]
expert_users = pd.DataFrame(expert_users)
expert_users = expert_users.reset_index()
expert_users = expert_users.merge(user_genres_rated_df, how='inner', left_on='user', right_on='user')
expert_users = expert_users.drop(['timestamp', 'item'], axis=1)
expert_users.rename(columns={'rating' : 'num_ratings'}, inplace=True)

display(expert_users)

Unnamed: 0,user,num_ratings,genres_rated
0,1,175,17
1,3,187,18
2,7,276,18
3,11,504,19
4,14,243,17
...,...,...,...
35870,138483,276,19
35871,138484,148,18
35872,138486,193,18
35873,138490,151,18


In [8]:

genre_set = set()
for genres in ratings_genres_df.genres:
    genre_set.update(genres.split('|'))


In [9]:
genre_df = ratings_genres_df[['item', 'genres']]
genre_df = pd.DataFrame(genre_df)

i = 2
for genre in genre_set:
    genre_df.insert(i, genre, False)
    i += 1

genre_df = genre_df.drop_duplicates(subset='item', keep='first')
genre_df = genre_df.set_index('item')

display(genre_df)

Unnamed: 0_level_0,genres,Animation,Fantasy,War,Romance,Action,Sci-Fi,Musical,Thriller,Mystery,...,(no genres listed),Crime,Film-Noir,Drama,Adventure,Horror,Western,Documentary,IMAX,Children
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Adventure|Animation|Children|Comedy|Fantasy,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Adventure|Children|Fantasy,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Comedy|Romance,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,Comedy|Drama|Romance,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,Comedy,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131252,Comedy|Horror,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
131254,Comedy,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
131256,Comedy,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
131258,Adventure,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [10]:

for item in genre_df.index:
    genres = genre_df.loc[item, 'genres']
    genre_list = genres.split('|')
    for genre in genre_list:
        genre_df.at[item, genre] = True

genre_df = genre_df.drop(['genres'], axis=1)
genre_df = genre_df.reset_index()

display(genre_df)

Unnamed: 0,item,Animation,Fantasy,War,Romance,Action,Sci-Fi,Musical,Thriller,Mystery,...,(no genres listed),Crime,Film-Noir,Drama,Adventure,Horror,Western,Documentary,IMAX,Children
0,1,True,True,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,True
1,2,False,True,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,True
2,3,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,4,False,False,False,True,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
4,5,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25700,131252,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
25701,131254,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
25702,131256,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
25703,131258,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False


In [11]:
tags_df = pd.DataFrame(pd.read_csv(data_folder + "/tags.csv"))
tags_df = tags_df.rename(columns={'movieId': 'item'})
tags_df = tags_df.drop(['timestamp', 'userId'], axis=1)
tags_df = tags_df.sort_values(by="item", ascending=True)
tags_df = tags_df.set_index('item')
# tags_df.index = np.arrange(len(tags_df.index))
display(tags_df)

Unnamed: 0_level_0,tag
item,Unnamed: 1_level_1
1,friendship
1,animation
1,animated
1,adventure
1,animation
...,...
131258,Korea
131258,bandits
131258,whale
131258,pirates


Method to find power set of favourite movies

In [22]:
def clean_tags(row):
    return str(row['tag']).strip().lower()

tags_df['tag'] = tags_df.apply(clean_tags, axis=1, result_type='expand')
display(tags_df)

Unnamed: 0_level_0,tag
item,Unnamed: 1_level_1
1,friendship
1,animation
1,animated
1,adventure
1,animation
...,...
131258,korea
131258,bandits
131258,whale
131258,pirates


In [41]:
average_rating = {}
for item in list(movies_df.item):
    item_rating = ratings_df.loc[ratings_df['item'] == item]
    average_rating[item] = round(np.mean(item_rating.rating), 1)


display(average_rating)

{1: 3.9,
 2: 3.2,
 3: 3.2,
 4: 2.9,
 5: 3.1,
 6: 3.8,
 7: 3.4,
 8: 3.1,
 9: 3.0,
 10: 3.4,
 11: 3.7,
 12: 2.6,
 13: 3.3,
 14: 3.4,
 15: 2.7,
 16: 3.8,
 17: 4.0,
 18: 3.4,
 19: 2.6,
 20: 2.9,
 21: 3.6,
 22: 3.3,
 23: 3.1,
 24: 3.2,
 25: 3.7,
 26: 3.6,
 27: 3.4,
 28: 4.1,
 29: 4.0,
 30: 3.6,
 31: 3.3,
 32: 3.9,
 33: 3.0,
 34: 3.6,
 35: 3.5,
 36: 3.9,
 37: 3.0,
 38: 2.9,
 39: 3.4,
 40: 3.6,
 41: 3.9,
 42: 3.1,
 43: 3.5,
 44: 2.7,
 45: 3.4,
 46: 3.3,
 47: 4.1,
 48: 2.9,
 49: 3.6,
 50: 4.3,
 51: 2.6,
 52: 3.5,
 53: 3.7,
 54: 2.9,
 55: 3.2,
 56: 1.9,
 57: 3.3,
 58: 4.0,
 59: 3.6,
 60: 3.1,
 61: 3.3,
 62: 3.7,
 63: 3.1,
 64: 2.8,
 65: 2.4,
 66: 2.1,
 67: 3.4,
 68: 3.4,
 69: 3.6,
 70: 3.3,
 71: 2.4,
 72: 3.5,
 73: 3.8,
 74: 3.3,
 75: 2.5,
 76: 3.0,
 77: 3.4,
 78: 3.2,
 79: 3.1,
 80: 3.7,
 81: 3.3,
 82: 4.0,
 83: 3.7,
 84: 3.4,
 85: 3.5,
 86: 3.5,
 87: 2.4,
 88: 3.1,
 89: 3.1,
 90: 3.4,
 92: 2.9,
 93: 2.5,
 94: 3.5,
 95: 3.1,
 96: 3.7,
 97: 4.0,
 98: 2.5,
 99: 3.1,
 100: 3.2,
 101: 3.8,
 102: 2

In [None]:
average_ratings_df = pd.DataFrame.from_dict(average_rating, orient ='index')
average_ratings_df.index.names = ['item']
average_ratings_df.columns = ['average_rating']

display(average_ratings_df)

In [59]:
preprocessed_dataset_folder = "./processed_data"

tags_df = tags_df.reset_index()

ratings_df.to_csv(preprocessed_dataset_folder + "/ratings.csv", index=False)
movies_df.to_csv(preprocessed_dataset_folder + "/movies.csv", index=False)

expert_users.to_csv(preprocessed_dataset_folder + "/expert_users.csv", index=False)
ratings_genres_df.to_csv(preprocessed_dataset_folder + "/ratings_genres.csv", index=False)

genre_df.to_csv(preprocessed_dataset_folder + "/genre.csv", index=False)
tags_df.to_csv(preprocessed_dataset_folder + "/tags.csv", index=False)

average_ratings_df.to_csv(preprocessed_dataset_folder + "/average_ratings.csv", index=True)