In [1]:
import pandas as pd
import numpy as np

In [2]:


# Enter the path to the MovieLens data folder on your device.
# The data can be downloaded from https://grouplens.org/datasets/movielens/ as "ml-25m.zip" (MovieLens 25M Dataset).
# ~62K movies in total
# Keep the trailing slash: later cells build file names with `movie_lens_path + "movies.csv"`.
movie_lens_path = "C:/Users/anton/source/data/ml_25m/"

In [3]:
# Read the movie table and rewrite the 'genres' column so that it is
# comma separated instead of '|' separated.
movie_df = pd.read_csv(movie_lens_path + "movies.csv").assign(
    genres=lambda frame: frame['genres'].str.replace('|', ', ', regex=False)
)

movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"Adventure, Animation, Children, Comedy, Fantasy"
1,2,Jumanji (1995),"Adventure, Children, Fantasy"
2,3,Grumpier Old Men (1995),"Comedy, Romance"
3,4,Waiting to Exhale (1995),"Comedy, Drama, Romance"
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Load the user-applied tags (one row per userId / movieId / tag / timestamp) and preview the first rows.
tags_df = pd.read_csv(movie_lens_path + "tags.csv")
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,3,260,classic,1439472355
1,3,260,sci-fi,1439472256
2,4,1732,dark comedy,1573943598
3,4,1732,great dialogue,1573943604
4,4,7569,so bad it's good,1573943455


### Filter movies to have 10+ tags

In [5]:
# Load tags
tags_df = pd.read_csv(movie_lens_path + "tags.csv")

# Number of distinct tag strings attached to each movie
unique_tag_counts = tags_df.groupby("movieId")["tag"].nunique()

# Ids of the movies that carry at least 10 distinct tags
movies_10_plus_tags = unique_tag_counts.loc[lambda counts: counts >= 10].index.tolist()


In [6]:
# Number of movies with at least 10 unique tags.
# NOTE(review): this comment used to say 4425 (using MovieLens25M), but the recorded
# output of this cell is 13296 — confirm which figure is current.
len(movies_10_plus_tags)

13296

### Randomly select 100 movies and get their top 20 most frequent tags

In [None]:
import random

In [None]:
# Seed Python's RNG so the same movie ids are drawn on every run.
# NOTE(review): the variable name and the section header say 100, but 1000 ids are
# sampled here — confirm which is intended.
random.seed(42)
random_100_movie_ids = random.sample(movies_10_plus_tags, 1000)

In [None]:
# Keep only the tag rows that belong to the sampled movies
filtered_100_tags_df = tags_df[tags_df['movieId'].isin(random_100_movie_ids)]

# Number of times each tag was applied to each movie
grouped_tags = filtered_100_tags_df.groupby(['movieId', 'tag']).size().reset_index(name='count')

# For each movie, keep its 20 most frequent tags.
# A stable sort by count (descending) followed by groupby().head(20) selects the same
# rows as a per-group nlargest(20, 'count'), with ties kept in their original order.
# It avoids groupby(...).apply(...) operating on the grouping column, which is
# deprecated since pandas 2.2 and would otherwise drop 'movieId' from the result.
# The final stable sort restores the movieId-grouped row order.
top_20_tags_per_movie = (
    grouped_tags
    .sort_values('count', ascending=False, kind='mergesort')
    .groupby('movieId')
    .head(20)
    .sort_values('movieId', kind='mergesort')
    .reset_index(drop=True)
)

In [None]:
# Display the per-movie top-tag table (columns: movieId, tag, count).
top_20_tags_per_movie

In [None]:
# One row per movie: its top tags joined into a single comma-separated string
tags_concatenated = (
    top_20_tags_per_movie
    .groupby('movieId')['tag']
    .agg(', '.join)
    .reset_index(name='top_20_tags')
)

# Attach the tag string to the title/genre rows of the sampled movies
filtered_movie_df = movie_df.loc[movie_df['movieId'].isin(random_100_movie_ids)]
filtered_100_df = filtered_movie_df.merge(tags_concatenated, on='movieId', how='left')

# Shuffle the rows; the fixed random_state keeps the order reproducible
filtered_100_df = filtered_100_df.sample(frac=1, random_state=10).reset_index(drop=True)

In [None]:
# Display the shuffled movie table (title, genres and concatenated top tags).
filtered_100_df

### Make JSON File

In [None]:
import json

# If from scratch:
# map each row position (as a string) to a text description holding the id,
# title, genres and top tags of that movie.
movies_dict = {
    str(idx): {
        'description': f" id: {idx} \n Movie Title: {row['title']} \n Genres: {row['genres']} \n Tags: {row['top_20_tags']}"
    }
    for idx, row in filtered_100_df.iterrows()
}

# Write the mapping as pretty-printed JSON
with open('data/ml25M_1000_movie_sample_seed_10.json', 'w') as json_file:
    json_file.write(json.dumps(movies_dict, indent=4))

In [None]:
import json

# Load the original 100-movie sample; the order of its entries is the initial movie order
with open('data/ml25M_100_movie_sample.json', 'r') as json_file:
    initial_sample_100 = json.loads(json_file.read())

In [None]:
def _extract_title(description):
    """Return the movie title embedded in a description string.

    The title is the text following 'Title: ' on the first line that contains
    that marker, with surrounding whitespace stripped. Scanning every line
    (rather than only the first) also handles descriptions that start with an
    ' id: ...' line before the title line.

    Raises:
        ValueError: if no line of the description contains 'Title: '.
    """
    for line in description.split('\n'):
        if 'Title: ' in line:
            return line.split('Title: ')[1].strip()
    raise ValueError(f"no 'Title: ' field found in description: {description!r}")


# Movie titles, in the order the movies appear in the initial sample
titles_ordered_initial_100 = [_extract_title(movie['description']) for movie in initial_sample_100.values()]

In [None]:
# Rebuild the descriptions in the order of the initial sample, numbering ids from 0.
movies_dict = {}

for idx, title in enumerate(titles_ordered_initial_100):
    # First row whose title matches exactly (IndexError if the title is absent)
    title_matches = filtered_100_df['title'] == title
    row = filtered_100_df.loc[title_matches].iloc[0]
    description = f" id: {idx} \n Movie Title: {title} \n Genres: {row['genres']} \n Tags: {row['top_20_tags']}"
    movies_dict[str(idx)] = {'description': description}
    

In [None]:
# Save the re-ordered descriptions as pretty-printed JSON
with open('data/ml25M_100_movie_sample_seed_10.json', 'w') as json_file:
    json_file.write(json.dumps(movies_dict, indent=4))

### Create movie name to ID mapping

In [None]:
# Create title to ID mapping: each title maps to its position (as a string)
# in the initial ordering; a repeated title keeps its last position.
title_map = {title: str(i) for i, title in enumerate(titles_ordered_initial_100)}

In [None]:
# Save the title -> id mapping as pretty-printed JSON
with open('data/name_maps/ml25M_100_map.json', 'w') as json_file:
    json_file.write(json.dumps(title_map, indent=4))

### Trim 100 movies to 50

In [None]:
# Read back the saved 100-movie descriptions
with open('data/ml25M_100_movie_sample_seed_10.json', 'r') as json_file:
    jdata = json.loads(json_file.read())

In [None]:
# Keys to keep: the string ids "0" through "49"
keep_ids = list(map(str, range(50)))

In [None]:
# Keep only the entries of `jdata` whose keys are listed in `keep_ids`, then display the result.
final_dict = {key: jdata[key] for key in keep_ids}
final_dict

In [None]:
import json

# Save the trimmed 50-movie dictionary as pretty-printed JSON
with open('data/ml25M_50_movie_sample_seed_10.json', 'w') as json_file:
    json_file.write(json.dumps(final_dict, indent=4))

### Randomly select 50 movies and get their top 20 most frequent tags

In [None]:
import random

In [None]:
# Seed Python's RNG so the same 50 movie ids are drawn on every run
random.seed(42)
random_50_movie_ids = random.sample(movies_10_plus_tags, 50)

In [None]:
# Keep only the tag rows that belong to these 50 movies
filtered_50_tags_df = tags_df[tags_df['movieId'].isin(random_50_movie_ids)]

# Number of times each tag was applied to each movie
grouped_tags = filtered_50_tags_df.groupby(['movieId', 'tag']).size().reset_index(name='count')

# For each movie, keep its 20 most frequent tags.
# A stable sort by count (descending) followed by groupby().head(20) selects the same
# rows as a per-group nlargest(20, 'count'), with ties kept in their original order.
# It avoids groupby(...).apply(...) operating on the grouping column, which is
# deprecated since pandas 2.2 and would otherwise drop 'movieId' from the result.
# The final stable sort restores the movieId-grouped row order.
top_20_tags_per_movie = (
    grouped_tags
    .sort_values('count', ascending=False, kind='mergesort')
    .groupby('movieId')
    .head(20)
    .sort_values('movieId', kind='mergesort')
    .reset_index(drop=True)
)

In [None]:
# Display the per-movie top-tag table for the 50-movie sample.
top_20_tags_per_movie

In [None]:
# One row per movie: 'movieId' plus its top tags joined into a comma-separated string
tags_concatenated = top_20_tags_per_movie.groupby('movieId')['tag'].apply(lambda x: ', '.join(x)).reset_index(name='top_20_tags')

# Merge the concatenated tags with the title/genre rows of the sampled movies
filtered_movie_df = movie_df[movie_df['movieId'].isin(random_50_movie_ids)]
filtered_50_df = pd.merge(filtered_movie_df, tags_concatenated, on='movieId', how='left')

# Randomize rows. The fixed random_state makes the shuffle (and therefore the ids
# written to the JSON file) reproducible across runs.
filtered_50_df = filtered_50_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Display the shuffled 50-movie table (title, genres and concatenated top tags).
filtered_50_df

### Make JSON File

In [None]:
# Map each row position (as a string) to a text description of that movie
movies_dict = {
    str(idx): {
        'description': f" -Movie Title: {row['title']} \n -Genres: {row['genres']} \n -Tags: {row['top_20_tags']}"
    }
    for idx, row in filtered_50_df.iterrows()
}

In [None]:
import json

# Save the 50-movie descriptions as pretty-printed JSON
with open('data/ml25M_50_movie_sample.json', 'w') as json_file:
    json_file.write(json.dumps(movies_dict, indent=4))

### Randomly select 16 movies and get their top 20 most frequent tags
These are a subset of the above 100 at seed 42

In [None]:
import random

In [None]:
# Seed Python's RNG so the same 16 movie ids are drawn on every run
random.seed(42)
random_16_movie_ids = random.sample(movies_10_plus_tags, 16)

In [None]:
# Keep only the tag rows that belong to these 16 movies
filtered_16_tags_df = tags_df[tags_df['movieId'].isin(random_16_movie_ids)]

# Number of times each tag was applied to each movie
grouped_tags = filtered_16_tags_df.groupby(['movieId', 'tag']).size().reset_index(name='count')

# For each movie, keep its 20 most frequent tags.
# A stable sort by count (descending) followed by groupby().head(20) selects the same
# rows as a per-group nlargest(20, 'count'), with ties kept in their original order.
# It avoids groupby(...).apply(...) operating on the grouping column, which is
# deprecated since pandas 2.2 and would otherwise drop 'movieId' from the result.
# The final stable sort restores the movieId-grouped row order.
top_20_tags_per_movie = (
    grouped_tags
    .sort_values('count', ascending=False, kind='mergesort')
    .groupby('movieId')
    .head(20)
    .sort_values('movieId', kind='mergesort')
    .reset_index(drop=True)
)

In [None]:
# Display the per-movie top-tag table for the 16-movie sample.
top_20_tags_per_movie

In [None]:
# One row per movie: 'movieId' plus its top tags joined into a comma-separated string
tags_concatenated = top_20_tags_per_movie.groupby('movieId')['tag'].apply(lambda x: ', '.join(x)).reset_index(name='top_20_tags')

# Merge the concatenated tags with the title/genre rows of the sampled movies
filtered_movie_df = movie_df[movie_df['movieId'].isin(random_16_movie_ids)]
filtered_16_df = pd.merge(filtered_movie_df, tags_concatenated, on='movieId', how='left')

# Randomize rows. The fixed random_state makes the shuffle (and therefore the ids
# written to the JSON file) reproducible across runs.
filtered_16_df = filtered_16_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Display the shuffled 16-movie table (title, genres and concatenated top tags).
filtered_16_df

### Make JSON File

In [None]:
# Map each row position (as a string) to a text description of that movie
movies_dict = {
    str(idx): {
        'description': f" -Movie Title: {row['title']} \n -Genres: {row['genres']} \n -Tags: {row['top_20_tags']}"
    }
    for idx, row in filtered_16_df.iterrows()
}

In [None]:
import json

# Save the 16-movie descriptions as pretty-printed JSON
with open('data/ml25M_16_movie_sample.json', 'w') as json_file:
    json_file.write(json.dumps(movies_dict, indent=4))

### Few shot samples
A sample of non-test movies (with their tags) to use as few-shot examples.

In [None]:
# Use a different seed from the test samples (which use 42) and draw 50 candidate ids.
# This reuses the name `random_50_movie_ids`, replacing the ids drawn for the 50-movie test sample.
random.seed(100000)
random_50_movie_ids = random.sample(movies_10_plus_tags, 50)

In [None]:
# Remove ids that also appear in the test data.
# A set gives constant-time membership checks, and the loop variable is named
# `movie_id` so that it does not shadow the built-in `id`.
test_movie_ids = set(random_100_movie_ids)
random_50_movie_ids = [movie_id for movie_id in random_50_movie_ids if movie_id not in test_movie_ids]

In [None]:
# Keep only the tag rows that belong to the few-shot movies
filtered_50_tags_df = tags_df[tags_df['movieId'].isin(random_50_movie_ids)]

# Number of times each tag was applied to each movie
grouped_tags = filtered_50_tags_df.groupby(['movieId', 'tag']).size().reset_index(name='count')

# For each movie, keep its 20 most frequent tags.
# A stable sort by count (descending) followed by groupby().head(20) selects the same
# rows as a per-group nlargest(20, 'count'), with ties kept in their original order.
# It avoids groupby(...).apply(...) operating on the grouping column, which is
# deprecated since pandas 2.2 and would otherwise drop 'movieId' from the result.
# The final stable sort restores the movieId-grouped row order.
top_20_tags_per_movie = (
    grouped_tags
    .sort_values('count', ascending=False, kind='mergesort')
    .groupby('movieId')
    .head(20)
    .sort_values('movieId', kind='mergesort')
    .reset_index(drop=True)
)

In [None]:
# Display the per-movie top-tag table for the few-shot movies.
top_20_tags_per_movie

In [None]:
# One row per movie: 'movieId' plus its top tags joined into a comma-separated string
tags_concatenated = top_20_tags_per_movie.groupby('movieId')['tag'].apply(lambda x: ', '.join(x)).reset_index(name='top_20_tags')

# Merge the concatenated tags with the title/genre rows of the few-shot movies
filtered_movie_df = movie_df[movie_df['movieId'].isin(random_50_movie_ids)]
filtered_50_df = pd.merge(filtered_movie_df, tags_concatenated, on='movieId', how='left')

# Randomize rows. The fixed random_state makes the shuffle (and therefore the ids
# written to the JSON file) reproducible across runs.
filtered_50_df = filtered_50_df.sample(frac=1, random_state=100000).reset_index(drop=True)

In [None]:
# Display the shuffled few-shot movie table (title, genres and concatenated top tags).
filtered_50_df

### Make JSON File

In [None]:
# Map each row position (as a string) to a text description of that movie
movies_dict = {
    str(idx): {
        'description': f" -Movie Title: {row['title']} \n -Genres: {row['genres']} \n -Tags: {row['top_20_tags']}"
    }
    for idx, row in filtered_50_df.iterrows()
}

In [None]:
import json

# Save the few-shot descriptions as pretty-printed JSON
with open('data/ml25M_FS_movie_sample.json', 'w') as json_file:
    json_file.write(json.dumps(movies_dict, indent=4))

### Old code

In [None]:
# USE THIS OPTION IF YOU WANT A RANDOM SAMPLE OF MOVIES WITH A SPECIFIC NUMBER OF TAGS
# NOTE(review): `trimmed_tags` is not defined anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel — confirm where it should come from.

# Select 20 random movies from the trimmed set
# NOTE(review): np.random.choice samples with replacement unless replace=False is
# passed, so the 20 ids may contain duplicates — confirm this is intended.
np.random.seed(42)
random_movies = np.random.choice(trimmed_tags['movieId'].unique(), size=20)
movie_subset = random_movies


In [None]:
# USE THIS OPTION FOR HANDSELECTED MOVIES
selected_titles = [
    "Terminator, The (1984)", # 4 sci-fi movies
    "Matrix, The (1999)",
    "Interstellar (2014)",
    "Alien (1979)",
    "Tropic Thunder (2008)", # 4 comedy movies
    "Night at the Museum (2006)",
    "Shaun of the Dead (2004)",
    "Hot Fuzz (2007)",
    "Godfather, The (1972)", # 4 crime movies
    "Heat (1995)",
    "Goodfellas (1990)",
    "Reservoir Dogs (1992)",
    "Fantasia (1940)", # 4 animated movies
    "Toy Story 2 (1999)",
    "Aladdin (1992)",
    "Shrek (2001)",
]
# Rows for the hand-picked titles, keeping only the first row of any repeated title
movie_rows = (
    movie_df
    .loc[movie_df['title'].isin(selected_titles)]
    .drop_duplicates(subset=['title'])
)
# The ids of those rows drive the cells that follow
movie_subset = movie_rows['movieId']
movie_rows

In [None]:
# Tag rows for the movies currently in `movie_subset`; preview the first few.
final_tags = tags_df[tags_df['movieId'].isin(movie_subset)]
final_tags.head()

In [None]:
# Spot check: print every tag recorded in `final_tags` for movieId 1666.
# NOTE(review): 1666 is a hard-coded id — confirm it is one of the selected movies,
# otherwise nothing is printed.
test_row = final_tags[final_tags['movieId'] == 1666]
for tag in test_row['tag']:
    print(tag)

In [None]:
# Build one text description per selected movie: "<title>, <genres>, <tags>".
MAX_TAGS_PER_MOVIE = 15  # maximum number of distinct tags appended to a description

item_descs = {}
for item_count, item_id in enumerate(movie_subset):
    movie_df_row = movie_df[movie_df['movieId'] == item_id]
    # .item() raises unless exactly one row matches this id
    title = movie_df_row['title'].item()
    genres = movie_df_row['genres'].item()

    # Title first, then every genre followed by ", ".
    # split("|") handles raw '|'-separated genres; a string without '|' passes through whole.
    item_desc = title + ", " + "".join(genre + ", " for genre in genres.split("|"))

    # Distinct tags in order of first appearance, skipping missing values and
    # keeping at most MAX_TAGS_PER_MOVIE of them.
    movie_tags = final_tags[final_tags['movieId'] == item_id].drop_duplicates("tag")
    kept_tags = movie_tags['tag'].dropna().tolist()[:MAX_TAGS_PER_MOVIE]
    item_desc += ", ".join(kept_tags)

    item_descs[str(item_count)] = {"description": item_desc, "name": title} # Using item_count instead of item_id
print(item_descs)


In [None]:
import json

# Save the hand-selected movie descriptions as compact JSON
with open("./data/movielens_16_trimmed.json", "w") as output_file:
    output_file.write(json.dumps(item_descs))