# Data Correction

In [2]:
import pandas as pd
from scrapers.drivers import Requester

Base MovieLens dataset

In [3]:
# Load the MovieLens id-mapping table and the movie catalogue, renaming the
# lower-camel-case CSV headers to the PascalCase names used in the rest of
# this notebook.
links = pd.read_csv("./database/ml-32m/links.csv")
links = links.rename(
    columns={"movieId": "MovieID", "imdbId": "ImdbID", "tmdbId": "TmdbID"}
)

movies = pd.read_csv("./database/ml-32m/movies.csv")
movies = movies.rename(
    columns={"movieId": "MovieID", "title": "Title", "genres": "Genres"}
)

#### Problem 1: Missing imdb entries

Some IMDb entries are missing because their links are outdated or the titles have been removed.

In [4]:
# Scraped IMDb metadata table. The cells below rely on its "ImdbID" and
# "Directors" columns; a NaN in "Directors" is used as the marker for an
# entry that could not be scraped.
imdb = pd.read_csv("./database/imdb/movie_entries.csv")

In [23]:
# IMDb ids whose scraped row has no director information.
nan_imdb = imdb.loc[imdb["Directors"].isna(), "ImdbID"].values

# MovieLens ids that point at one of those IMDb ids.
nan_id = links.loc[links["ImdbID"].isin(nan_imdb), "MovieID"]

# Attach title/genres to every link row (left join keeps all links), then
# keep only the affected movies so they can be looked up by hand.
links_and_title = links.merge(movies, on="MovieID", how="left")
missing_imdb = links_and_title.loc[links_and_title["MovieID"].isin(nan_id)]
missing_imdb

Unnamed: 0,MovieID,ImdbID,TmdbID,Title,Genres
705,720,118114,503475.0,Wallace & Gromit: The Best of Aardman Animatio...,Adventure|Animation|Comedy
16391,86668,1347439,241620.0,Louis Theroux: Law & Disorder (2008),Documentary
30274,135539,3416042,335676.0,Aurora (2015),(no genres listed)
73230,237946,10141686,632378.0,American Bullet (2019),Action|Adventure
75263,247468,12446902,811303.0,Audition (2021),(no genres listed)
86271,288099,22811298,1061605.0,Don't Look Deeper (2022),Drama|Sci-Fi


Solution: Manual search

In [46]:
# Drop the unusable rows (the ones listed in `missing_imdb`) from the scraped
# table and write the result back over the same CSV it was loaded from.
# NOTE(review): this overwrites the input file in place, so the removed rows
# cannot be recovered by re-running the notebook -- confirm this is intended.
filter_missing = imdb.loc[~imdb["ImdbID"].isin(missing_imdb["ImdbID"])]
filter_missing.to_csv(path_or_buf="./database/imdb/movie_entries.csv", index=False)

In [None]:
# Hand-researched replacements for IMDb ids that no longer resolve.
# Keys are MovieLens MovieIDs, values are the replacement ImdbIDs
# (numeric part only, without the "tt" prefix).
imdb_correction_map = { # MovieID -> new ImdbID
    720: 31416047,      # Wallace & Gromit: The Best of Aardman Animatio... (title truncated in the table output)
    86668: 2585192,     # Louis Theroux: Law & Disorder (2008)
    135539: 3411580,    # Aurora (2015)
    288099: 10488234,   # Don't Look Deeper (2022)
}
# MovieIDs for which no replacement id was found (American Bullet (2019) and
# Audition (2021) in the table above); their entries are written by hand in a
# later cell. This list is not referenced by the cells visible here.
imdb_removed_entries = [237946, 247468] # Also removed on tmdb

Replace wrong ImdbID

In [52]:
def correct(row):
    """Return one `links` row with its IMDb id replaced when a fix is known.

    Intended for ``DataFrame.apply(..., axis=1)``.

    Parameters
    ----------
    row : sequence of exactly three values
        Unpacked positionally as (MovieLens id, IMDb id, TMDB id).

    Returns
    -------
    pd.Series
        Indexed by the original MovieLens header names
        ``movieId``, ``imdbId``, ``tmdbId``. The IMDb id is taken from the
        module-level ``imdb_correction_map`` when the movie id is a key
        there; otherwise it is passed through unchanged.
    """
    ml_id, imdb_id, tmdb_id = row
    # Row-wise apply may hand the id over as a float; dict keys are ints.
    ml_id = int(ml_id)
    imdb_id = imdb_correction_map.get(ml_id, imdb_id)
    return pd.Series(
        [ml_id, imdb_id, tmdb_id],
        index=["movieId", "imdbId", "tmdbId"],
    )

# Apply the manual ImdbID fixes row by row, restore integer dtypes for the two
# id columns (tmdbId is left as-is because it may still contain NaN), and save
# using the original MovieLens header names produced by `correct`.
# NOTE(review): this writes to "../database/..." while `links` was loaded from
# "./database/..." -- confirm both resolve to the same file.
# NOTE(review): `links` itself is not updated here; only `new_links` and the
# CSV carry the corrected ids.
new_links = links.apply(correct, axis=1)
new_links = new_links.astype({"movieId": "int64", "imdbId": "int64"})
new_links.to_csv("../database/ml-32m/links.csv", index=False)

Add removed entries

In [None]:
# Hand-written replacement rows for titles that can no longer be scraped.
# Values are positional and must follow the column order of `imdb`:
# ['ImdbID', 'Runtime', 'ReleaseDate', 'Rating', 'VoteCount', 'Directors', 'Cast', 'OriginCountries', 'Languages', 'Genres', 'Plot']
# NOTE(review): list-valued fields are Python lists and Rating/VoteCount are
# given as strings here -- confirm this matches how the scraped rows are
# serialised in movie_entries.csv.
custom_imdb_entries = [
    # American Bullet (2019) -- no rating / vote count available (None)
    [
        10141686,
        "01:32:00",
        "2019",
        None,
        None,
        ["Brando Benetton", "Marielle Woods"],
        ["Dylan Baker", "Spencer Treat"],
        ["United States", "Italy"],
        ["English"],
        ["Action", "War"],
        "A collection of short war tales from young, future storytellers. Each story tackles subjects like war, honor, sacrifice and humanity.",
    ],
    # Audition (2021), source: www.amazon.co.uk
    [
        12446902,
        "01:15:00",
        "2021",
        "9",
        "27",
        ["Edward Whelan"],
        ["Romilly Carboni", "Rose Galbraith", "Jack Firoozan", "Barney Mercer"],
        ["United Kingdom"],
        ["English"],
        ["Drama", "Romance"],
        "Alex, a wide-eyed, ambitious student auditions for a role that could change his life, but to succeed, he must impress two draconian judges at any cost.",
    ],
]
# Column labels are taken from the already-loaded `imdb` frame, so that cell
# must have been run first.
new_imdb_entries = pd.DataFrame(custom_imdb_entries, columns=imdb.columns)

In [63]:
# Append the hand-written entries to the scraped IMDb table and save it back.
#
# keep="last": the custom rows are concatenated after the rows from disk, so
# on an ImdbID clash the hand-written row must win. The default keep="first"
# would silently keep a stale row still present in the CSV and discard the
# manual fix. Re-running the cell stays idempotent either way.
#
# ignore_index=True: without it the appended rows reuse index labels 0 and 1,
# leaving `new_imdb` with duplicate labels (the index is not written to disk).
#
# NOTE(review): this reads/writes "../database/..." whereas earlier cells use
# "./database/..." for the same file -- confirm both resolve to the same path.
imdb = pd.read_csv("../database/imdb/movie_entries.csv")
new_imdb = pd.concat([imdb, new_imdb_entries], ignore_index=True)
new_imdb = new_imdb.drop_duplicates(subset="ImdbID", keep="last")
new_imdb.to_csv("../database/imdb/movie_entries.csv", index=False)

#### Problem 2: Missing tmdb entries

Some TMDB links are missing or do not exist. Use the TMDB API to try to recover the rest.


In [4]:
# Back-fill missing TMDB ids by asking TMDB's /find endpoint for each IMDb id.
#
# `Requester` is a project helper; presumably `.get(url)` returns the parsed
# JSON response (it is indexed with "movie_results" below) -- verify against
# scrapers.drivers.
#
# NOTE(review): this cell saves `links`, not the `new_links` frame produced by
# the ImdbID-correction cell, to the same CSV path. If both cells run in one
# session without reloading `links`, the ImdbID corrections are overwritten.
converter = Requester()
missing = links[links["TmdbID"].isna()]
missing_count = len(missing)
for i, (idx, raw_imdb_id) in enumerate(missing["ImdbID"].items()):
    # `idx` is an index *label*. Read and write through label-based access
    # only; the previous `links.iloc[idx]` treated the label as a position,
    # which is only correct while `links` has its default RangeIndex.
    imdb_id = int(raw_imdb_id)
    # IMDb ids are zero-padded to at least 7 digits after the "tt" prefix.
    url = f"https://api.themoviedb.org/3/find/tt{imdb_id:07d}?external_source=imdb_id"
    results = converter.get(url)["movie_results"]
    # No match leaves the cell empty (None is stored as NaN in a float column).
    tmdb_id = results[0]["id"] if results else None
    print(f"{i + 1}/{missing_count}: {imdb_id} --> {tmdb_id}")
    links.loc[idx, "TmdbID"] = tmdb_id

# Save with the original MovieLens header names so the file keeps the same
# schema as the one written by the ImdbID-correction cell, instead of
# switching to the notebook-internal MovieID/ImdbID/TmdbID names.
links.rename(
    columns={"MovieID": "movieId", "ImdbID": "imdbId", "TmdbID": "tmdbId"}
).to_csv("../database/ml-32m/links.csv", index=False)

1/52: 125877 --> None
2/52: 38426 --> None
3/52: 102336 --> None
4/52: 113212 --> 1397138
5/52: 123953 --> None
6/52: 120881 --> None
7/52: 133090 --> None
8/52: 133361 --> None
9/52: 87690 --> None
10/52: 341315 --> None
11/52: 81454 --> None
12/52: 56600 --> None
13/52: 223249 --> None
14/52: 348862 --> None
15/52: 142032 --> None
16/52: 377059 --> None
17/52: 1141486 --> None
18/52: 1517595 --> None
19/52: 1104746 --> None
20/52: 1433856 --> None
21/52: 1594971 --> None
22/52: 874957 --> None
23/52: 1103248 --> None
24/52: 466214 --> None
25/52: 495705 --> None
26/52: 1590050 --> None
27/52: 199208 --> None
28/52: 1753881 --> None
29/52: 1759871 --> None
30/52: 1226036 --> None
31/52: 901206 --> None
32/52: 159542 --> None
33/52: 368520 --> None
34/52: 2109106 --> None
35/52: 2327631 --> None
36/52: 1706542 --> None
37/52: 1337601 --> None
38/52: 2380408 --> None
39/52: 2060305 --> None
40/52: 2973064 --> None
41/52: 2510620 --> None
42/52: 1647292 --> None
43/52: 401019 --> None
44