In [1]:
import pandas as pd
import numpy as np
movie_lens_path = "/Users/david/Documents/Research2324/Sanner/ml-25m/"

In [2]:
movie_df = pd.read_csv(movie_lens_path + "movies.csv")
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
tags_df = pd.read_csv(movie_lens_path + "tags.csv")
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,3,260,classic,1439472355
1,3,260,sci-fi,1439472256
2,4,1732,dark comedy,1573943598
3,4,1732,great dialogue,1573943604
4,4,7569,so bad it's good,1573943455


In [4]:
# Trim to movies with exactly 6 tags
counts = tags_df['movieId'].value_counts()
trimmed_tags = tags_df[tags_df['movieId'].isin(counts.index[counts == 6])]
trimmed_tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
654,195,184685,impressive visuals,1545130234
655,195,184685,magic,1545130222
656,195,184685,mystery,1545130257
657,195,184685,tang dynasty,1545130187
1200,402,116803,china,1431427396


In [5]:
# USE THIS OPTION IF YOU WANT A RANDOM SAMPLE OF MOVIES WITH A SPECIFIC NUMBER OF TAGS

# Select 20 random movies from the trimmed set
np.random.seed(42)
random_movies = np.random.choice(trimmed_tags['movieId'].unique(), size=20)
movie_subset = random_movies


In [6]:
# USE THIS OPTION FOR HANDSELECTED MOVIES
selected_titles = [
    "Terminator, The (1984)", # 4 sci-fi movies
    "Matrix, The (1999)",
    "Interstellar (2014)",
    "Alien (1979)",
    "Tropic Thunder (2008)", # 4 comedy movies
    "Night at the Museum (2006)",
    "Shaun of the Dead (2004)",
    "Hot Fuzz (2007)",
    "Godfather, The (1972)", # 4 crime movies
    "Heat (1995)",
    "Goodfellas (1990)",
    "Reservoir Dogs (1992)",
    "Fantasia (1940)", # 4 animated movies
    "WALL-E (2008)",
    "Aladdin (1992)",
    "Shrek (2001)",
]
movie_rows = movie_df[movie_df['title'].isin(selected_titles)]
movie_subset = movie_rows['movieId']
movie_rows

Unnamed: 0,movieId,title,genres
5,6,Heat (1995),Action|Crime|Thriller
580,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical
840,858,"Godfather, The (1972)",Crime|Drama
1062,1089,Reservoir Dogs (1992),Crime|Mystery|Thriller
1182,1213,Goodfellas (1990),Crime|Drama
1183,1214,Alien (1979),Horror|Sci-Fi
1207,1240,"Terminator, The (1984)",Action|Sci-Fi|Thriller
1249,1282,Fantasia (1940),Animation|Children|Fantasy|Musical
2480,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller
4201,4306,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Ro...


In [7]:
final_tags = tags_df[tags_df['movieId'].isin(movie_subset)]
final_tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
51,87,109487,good science,1522676693
52,87,109487,Hans Zimmer,1522676679
53,87,109487,philosophical issues,1522676687
54,87,109487,sci-fi,1522676660
55,87,109487,science fiction,1522676703


In [8]:
test_row = final_tags[final_tags['movieId'] == 1666]
for tag in test_row['tag']:
    print(tag)

In [13]:
item_descs = {}
for item_count, item_id in enumerate(movie_subset):
    item_desc = ""
    # Append genres to the description
    movie_df_row = movie_df[movie_df['movieId'] == item_id]
    title = movie_df_row['title'].item()
    genres = movie_df_row['genres'].item()
    genres = genres.split("|")
    for genre in genres:
        item_desc += (genre + ", ")

    # Append tags to the description
    movie_tags = final_tags[final_tags['movieId'] == item_id].drop_duplicates("tag")
    for i, aspect in enumerate(movie_tags['tag']):
        item_desc += aspect
        if (i > 15): # Cap the number of aspects at 15
            break
        if not (i == (len(movie_tags) - 1)):
            item_desc += ", "
    item_descs[str(item_count)] = {"description": item_desc, "name": title} # Using item_count instead of item_id 
print(item_descs)


{'0': {'description': 'Action, Crime, Thriller, imdb top 250, great acting, realistic action, suspense, Al Pacino, atmospheric, bank robbery, crime, Robert De Niro, tense, Val Kilmer, bank job, dialogue, heist, heist movie, long, Los Angeles', 'name': 'Heat (1995)'}, '1': {'description': 'Adventure, Animation, Children, Comedy, Musical, Katottava, aftercreditsstinger, animal sidekick, animation, arab, arabian nights, cartoon, comedy, flying carpet, genie, love, magic, magic lamp, monkey, musical, parrot, princess', 'name': 'Aladdin (1992)'}, '2': {'description': 'Crime, Drama, italian mafia, italy, Mafia, Marlon Brando, masterpiece, imdb top 250, Oscar (Best Picture), based on a book, catchy theme, organized crime, visceral, 100 Greatest Movies, classic, great acting, mafia, Al Pacino, epic', 'name': 'Godfather, The (1972)'}, '3': {'description': 'Crime, Mystery, Thriller, gangster, organized crime, violence, stylish, imdb top 250, violent, cult, diamonds, guns, nonlinear, original, ro

In [12]:
import json

with open("./data/movielens_16_trimmed.json", "w") as output_file:
    json.dump(item_descs, output_file)