In [1]:
import pandas as pd

# Load the MovieLens movie catalogue. The file has no header row, so the
# column names are supplied explicitly. A multi-character separator ("::")
# is treated by pandas as a regular expression, which requires the slower
# pure-Python parsing engine.
movie_columns = ["movie_id", "title", "genres"]

base_movies = pd.read_csv(
    "../ml-1m/movies.dat",
    names=movie_columns,
    sep="::",
    engine="python",
    encoding="latin-1",
)

# Preview the first rows to confirm the columns were parsed as expected.
base_movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [2]:
# Split the trailing "(YYYY)" release year out of the title.
#
# A single pattern, anchored at the end of the string, is used both to read
# the year and to strip it, so the two steps can never disagree about which
# parenthetical is the year (an unanchored search would pick up the first
# four-digit parenthetical anywhere in the title). Trailing whitespace after
# the year is tolerated.
YEAR_SUFFIX = r"\s*\((\d{4})\)\s*$"

# expand=False returns a Series (not a one-column DataFrame); titles without
# a year suffix yield NaN, which becomes <NA> in the nullable Int64 dtype.
extracted_year = pd.to_numeric(
    base_movies["title"].str.extract(YEAR_SUFFIX, expand=False)
).astype("Int64")

# Re-running this cell finds no suffix (titles were already stripped), so keep
# any year extracted by a previous run instead of overwriting it with <NA>.
if "year" in base_movies.columns:
    extracted_year = extracted_year.fillna(base_movies["year"])

base_movies["year"] = extracted_year
base_movies["title"] = base_movies["title"].str.replace(YEAR_SUFFIX, "", regex=True)

base_movies.head()

Unnamed: 0,movie_id,title,genres,year
0,1,Toy Story,Animation|Children's|Comedy,1995
1,2,Jumanji,Adventure|Children's|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama,1995
4,5,Father of the Bride Part II,Comedy,1995


In [3]:
# Genre vocabulary. The order of this list is significant: the genres table
# built later in the notebook assigns genre_id from each name's 1-based
# position here.
all_genres = [
    "Action", "Adventure", "Animation", "Children's", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir",
    "Horror", "Musical", "Mystery", "Romance", "Sci-Fi",
    "Thriller", "War", "Western"
]

# Turn the pipe-delimited genre string into a list of genre names.
# Only str values are split: `.str.split` returns NaN for non-string
# elements, so re-running the cell on the already-split column would
# silently wipe every list. Non-string values (lists from a previous run,
# missing values) are passed through unchanged.
base_movies["genres"] = base_movies["genres"].map(
    lambda value: value.split("|") if isinstance(value, str) else value
)
base_movies.head()

Unnamed: 0,movie_id,title,genres,year
0,1,Toy Story,"[Animation, Children's, Comedy]",1995
1,2,Jumanji,"[Adventure, Children's, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [4]:
# Genre lookup table: pair every genre name with its 1-based position in
# `all_genres`, which serves as the genre's id.
genre_records = list(enumerate(all_genres, start=1))
genres = pd.DataFrame.from_records(genre_records, columns=["genre_id", "name"])

# Persist the lookup table without the positional index column.
genres.to_csv("../genres.csv", index=False)

In [5]:
# Show up to the first 20 rows of the genre lookup table.
genres.iloc[:20]

Unnamed: 0,genre_id,name
0,1,Action
1,2,Adventure
2,3,Animation
3,4,Children's
4,5,Comedy
5,6,Crime
6,7,Documentary
7,8,Drama
8,9,Fantasy
9,10,Film-Noir


In [6]:
# Build the movie <-> genre link table: one row per (movie_id, genre_id) pair.
#
# explode() emits one row per list element while keeping the movies in their
# original order, and a name -> id dictionary replaces the boolean scan of
# `genres` that was previously repeated for every single pair.
genre_id_by_name = dict(zip(genres["name"], genres["genre_id"]))

movie_genre_names = (
    base_movies[["movie_id", "genres"]]
    .explode("genres")
    .reset_index(drop=True)
)

# Fail loudly, naming the offending labels, rather than with a bare
# IndexError from an empty lookup. Missing or empty genre lists surface here
# too, because explode() turns them into NaN.
unmapped = movie_genre_names.loc[
    ~movie_genre_names["genres"].isin(genres["name"]), "genres"
]
if not unmapped.empty:
    raise ValueError(
        f"Genre labels missing from the genres table: {sorted(set(map(str, unmapped)))}"
    )

movie_genres = (
    movie_genre_names
    .assign(genre_id=movie_genre_names["genres"].map(genre_id_by_name))
    .drop(columns=["genres"])
)
movie_genres.to_csv("../movie_genres.csv", index=False)

In [7]:
# Show the first five (movie_id, genre_id) pairs.
movie_genres.iloc[:5]

Unnamed: 0,movie_id,genre_id
0,1,3
1,1,4
2,1,5
3,2,2
4,2,4


In [8]:
# The genre lists have been written out to the movie_genres link table
# (../movie_genres.csv), so the column is removed from the movie table itself.
# errors="ignore" keeps the cell re-runnable: without it a second execution
# raises KeyError because the column is already gone.
base_movies = base_movies.drop(columns=["genres"], errors="ignore")
base_movies.head()

Unnamed: 0,movie_id,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995
