In [2]:
# Imports
import pandas as pd

# Load Movie Dataset
movies = pd.read_csv("./data/movies_metadata.csv", usecols=[5, 9, 20, 22, 23])

# Drop null values
movies = movies.dropna()

# Drop rows where overview is "No overview found." or "No overview"
movies = movies[(movies["overview"] != "No overview found.") & (movies["overview"] != "No overview")]

# Keep only movies with more than 10 votes (50% interval) and vote_average above or equal to 6
movies = movies[(movies["vote_count"] > 10) & (movies["vote_average"] >= 6)]

# Remove duplicate titles with lowest vote average
mask = movies.duplicated(subset=["title"], keep=False)
movies = movies[~(mask & (movies["vote_average"] == movies[mask]["vote_average"].min()))]

# Remove strictly identical remaining duplicated titles
movies = movies.drop_duplicates(subset="title")

# Sort the dataframe in alphabetical order
movies.sort_values("title", inplace=True)

# Assign indexes
movies["index"] = [i for i in range(0, len(movies))]

# Save file
movies.to_csv("./data/movies_metadata_preprocessed.csv")

movies

Unnamed: 0,id,overview,title,vote_average,vote_count,index
15775,4204,"After being released from jail, the son of a c...",$5 a Day,6.0,24.0,0
36186,248268,When Ross is diagnosed with terminal brain can...,$50K and a Call Girl: A Love Story,6.3,11.0,1
14832,19311,"Have you ever wondered ""What is the meaning of...",$9.99,6.0,28.0,2
25192,252178,A young British soldier must find his way back...,'71,6.7,414.0,3
35374,362886,The holiday drama follows an out-of-work actre...,'Tis the Season for Love,6.0,11.0,4
...,...,...,...,...,...,...
10026,43344,"Francisco is rich, rather strict on principles...",Él,7.8,18.0,13084
31125,4645,Paul Winkelmann (Loriot) is the CEO of a succe...,Ödipussi,6.7,36.0,13085
28180,22974,This film tells the story of a few uneventful ...,Üvegtigris,8.5,12.0,13086
9739,19765,A nurse and her surgeon-lover are part of a re...,Želary,7.3,12.0,13087
