# Cleaning and Preparing data

**https://pandas.pydata.org/docs/user_guide/index.html#user-guide**

In [59]:
import pandas as pd
import os

In [63]:
name_basics = pd.read_csv("name.basics.tsv", sep='\t', low_memory=False)
title_basics = pd.read_csv("title.basics.tsv", sep='\t', low_memory=False)
title_ratings = pd.read_csv("title.ratings.tsv", sep='\t', low_memory=False)

In [65]:
print(f"name_basics: {name_basics.shape}")
print(f"title_basics: {title_basics.shape}")
print(f"title_ratings: {title_ratings.shape}")

name_basics: (14270591, 6)
title_basics: (11530944, 9)
title_ratings: (1547472, 3)


In [68]:
movies_df = title_basics[
    ["tconst", "titleType", "primaryTitle", "startYear", "genres"]
]

In [70]:
ratings_df = title_ratings[["tconst", "averageRating", "numVotes"]]

In [72]:
merged_movies = movies_df.merge(ratings_df, on="tconst", how="inner")

In [74]:
merged_movies = merged_movies[merged_movies["titleType"] == "movie"]
print(merged_movies.shape)

(327156, 7)


In [76]:
merged_movies = merged_movies.drop_duplicates(subset="primaryTitle")
merged_movies = merged_movies.dropna(subset=["averageRating", "numVotes", "startYear"])
merged_movies = merged_movies[merged_movies["startYear"].astype(str).str.isnumeric()]
merged_movies = merged_movies[merged_movies["startYear"].astype(int) >= 1980]

In [78]:
merged_movies = merged_movies.sort_values(by=["numVotes", "averageRating"], ascending=False).head(50000)

In [80]:
print(merged_movies["primaryTitle"].head(20))

84687                              The Shawshank Redemption
259863                                      The Dark Knight
682984                                            Inception
101600                                           Fight Club
83581                                          Forrest Gump
84468                                          Pulp Fiction
412656                                         Interstellar
99331                                            The Matrix
92529     The Lord of the Rings: The Fellowship of the Ring
117775        The Lord of the Rings: The Return of the King
666963                                The Dark Knight Rises
87261                                                 Se7en
117776                The Lord of the Rings: The Two Towers
863093                                     Django Unchained
212839                                 Inglourious Basterds
218147                                        Batman Begins
77786                              The S

In [82]:
output_file = "cleaned_imdb_movies.csv"
merged_movies.to_csv(output_file, index=False)

In [84]:
os.makedirs("models", exist_ok=True)