In [5]:
import sys
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer


# Data import

In [3]:
# Load the movie catalogue; the path is relative to this notebook's directory.
movies_df = pd.read_csv(filepath_or_buffer='../../data/movies.csv')

# Preview the first five rows to check the columns that were read.
movies_df.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


# Data preprocessing

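The `genres` column stores every genre of a movie as a single pipe-separated string (e.g. `Comedy|Romance`). We split that string and binary-encode it into one 0/1 indicator column per genre, so that movies can be compared by genre and similar movies can be recommended.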

In [8]:
# Split the pipe-separated genres into a list of genres
movies_df['genres_split'] = movies_df['genres'].str.split('|')

# Binary encode the data: one 0/1 indicator column per distinct genre label
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(movies_df['genres_split'])

# Name the indicator columns after the fitted genre labels (`mlb.classes_` is in
# the same order as the columns of the encoded matrix) and reuse the index of
# `movies_df` so the column-wise concat below lines up row-for-row.
genres_encoded_df = pd.DataFrame(
    genres_encoded,
    columns=mlb.classes_,
    index=movies_df.index,
)

# Attach the indicators, then drop the raw and intermediate genre columns,
# which are now redundant with the indicator columns.
movies_with_genres = pd.concat([movies_df, genres_encoded_df], axis=1)
movies_with_genres = movies_with_genres.drop(columns=['genres_split', 'genres'])

movies_with_genres.head()

Unnamed: 0,movieId,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# One-hot encode the genres directly from the pipe-separated string column.
# NOTE: this rebinds `genres_encoded`, which the MultiLabelBinarizer cell above
# set to a NumPy array; from here on it is a DataFrame.
genres_encoded = movies_df['genres'].str.get_dummies(sep='|')

# Put `movieId` in front of the indicator columns so each row stays identifiable.
genres_encoded = pd.concat([movies_df[['movieId']], genres_encoded], axis=1)

genres_encoded