In [1]:
import pandas as pd
import numpy as np
import datetime
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
genome_scores_data = pd.read_csv('movielens-latest-full/genome-scores.csv') 
movies_data = pd.read_csv('movielens-latest-full/movies.csv') 
ratings_data = pd.read_csv('movielens-latest-full/ratings.csv')

In [5]:
genome_scores_data.head()

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.029
1,1,2,0.02375
2,1,3,0.05425
3,1,4,0.06875
4,1,5,0.16


In [6]:
movies_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
ratings_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


In [8]:
scores_pivot = genome_scores_data.pivot_table(index = ["movieId"],columns = ["tagId"],values = "relevance").reset_index()
scores_pivot.head()

tagId,movieId,1,2,3,4,5,6,7,8,9,...,1119,1120,1121,1122,1123,1124,1125,1126,1127,1128
0,1,0.029,0.02375,0.05425,0.06875,0.16,0.19525,0.076,0.252,0.2275,...,0.03775,0.0225,0.04075,0.03175,0.1295,0.0455,0.02,0.0385,0.09125,0.02225
1,2,0.03625,0.03625,0.08275,0.08175,0.102,0.069,0.05775,0.101,0.08225,...,0.04775,0.0205,0.0165,0.0245,0.1305,0.027,0.01825,0.01225,0.09925,0.0185
2,3,0.0415,0.0495,0.03,0.09525,0.04525,0.05925,0.04,0.1415,0.04075,...,0.058,0.02375,0.0355,0.02125,0.12775,0.0325,0.01625,0.02125,0.09525,0.0175
3,4,0.0335,0.03675,0.04275,0.02625,0.0525,0.03025,0.02425,0.07475,0.0375,...,0.049,0.03275,0.02125,0.03675,0.15925,0.05225,0.015,0.016,0.09175,0.015
4,5,0.0405,0.05175,0.036,0.04625,0.055,0.08,0.0215,0.07375,0.02825,...,0.05375,0.02625,0.0205,0.02125,0.17725,0.0205,0.015,0.0155,0.08875,0.01575


In [10]:
#join
mov_tag_df = movies_data.merge(scores_pivot, left_on='movieId', right_on='movieId', how='left')
mov_tag_df = mov_tag_df.fillna(0) 
mov_tag_df = mov_tag_df.drop(['title','genres'], axis = 1)
mov_tag_df.head()

Unnamed: 0,movieId,1,2,3,4,5,6,7,8,9,...,1119,1120,1121,1122,1123,1124,1125,1126,1127,1128
0,1,0.029,0.02375,0.05425,0.06875,0.16,0.19525,0.076,0.252,0.2275,...,0.03775,0.0225,0.04075,0.03175,0.1295,0.0455,0.02,0.0385,0.09125,0.02225
1,2,0.03625,0.03625,0.08275,0.08175,0.102,0.069,0.05775,0.101,0.08225,...,0.04775,0.0205,0.0165,0.0245,0.1305,0.027,0.01825,0.01225,0.09925,0.0185
2,3,0.0415,0.0495,0.03,0.09525,0.04525,0.05925,0.04,0.1415,0.04075,...,0.058,0.02375,0.0355,0.02125,0.12775,0.0325,0.01625,0.02125,0.09525,0.0175
3,4,0.0335,0.03675,0.04275,0.02625,0.0525,0.03025,0.02425,0.07475,0.0375,...,0.049,0.03275,0.02125,0.03675,0.15925,0.05225,0.015,0.016,0.09175,0.015
4,5,0.0405,0.05175,0.036,0.04625,0.055,0.08,0.0215,0.07375,0.02825,...,0.05375,0.02625,0.0205,0.02125,0.17725,0.0205,0.015,0.0155,0.08875,0.01575


In [11]:
def set_genres(genres,col):
    if genres in col.split('|'): return 1
    else: return 0

In [12]:
mov_genres_df["Action"] = mov_genres_df.apply(lambda x: set_genres("Action",x['genres']), axis=1)
mov_genres_df["Adventure"] = mov_genres_df.apply(lambda x: set_genres("Adventure",x['genres']), axis=1)
mov_genres_df["Animation"] = mov_genres_df.apply(lambda x: set_genres("Animation",x['genres']), axis=1)
mov_genres_df["Children"] = mov_genres_df.apply(lambda x: set_genres("Children",x['genres']), axis=1)
mov_genres_df["Comedy"] = mov_genres_df.apply(lambda x: set_genres("Comedy",x['genres']), axis=1)
mov_genres_df["Crime"] = mov_genres_df.apply(lambda x: set_genres("Crime",x['genres']), axis=1)
mov_genres_df["Documentary"] = mov_genres_df.apply(lambda x: set_genres("Documentary",x['genres']), axis=1)
mov_genres_df["Drama"] = mov_genres_df.apply(lambda x: set_genres("Drama",x['genres']), axis=1)
mov_genres_df["Fantasy"] = mov_genres_df.apply(lambda x: set_genres("Fantasy",x['genres']), axis=1)
mov_genres_df["Film-Noir"] = mov_genres_df.apply(lambda x: set_genres("Film-Noir",x['genres']), axis=1)
mov_genres_df["Horror"] = mov_genres_df.apply(lambda x: set_genres("Horror",x['genres']), axis=1)
mov_genres_df["Musical"] = mov_genres_df.apply(lambda x: set_genres("Musical",x['genres']), axis=1)
mov_genres_df["Mystery"] = mov_genres_df.apply(lambda x: set_genres("Mystery",x['genres']), axis=1)
mov_genres_df["Romance"] = mov_genres_df.apply(lambda x: set_genres("Romance",x['genres']), axis=1)
mov_genres_df["Sci-Fi"] = mov_genres_df.apply(lambda x: set_genres("Sci-Fi",x['genres']), axis=1)
mov_genres_df["Thriller"] = mov_genres_df.apply(lambda x: set_genres("Thriller",x['genres']), axis=1)
mov_genres_df["War"] = mov_genres_df.apply(lambda x: set_genres("War",x['genres']), axis=1)
mov_genres_df["Western"] = mov_genres_df.apply(lambda x: set_genres("Western",x['genres']), axis=1)
mov_genres_df["(no genres listed)"] = mov_genres_df.apply(lambda x: set_genres("(no genres listed)",x['genres']), axis=1)

NameError: name 'mov_genres_df' is not defined