# Data Correction

In [2]:
import os, sys
sys.path.append(os.path.abspath(".."))

In [56]:
import pandas as pd
import numpy as np
from scrapers.drivers import Requester

Base MovieLens dataset

In [57]:
# Load the MovieLens link and movie tables, renaming their columns to the
# capitalised names used in the rest of this notebook.
links_columns = {"movieId": "MovieID", "imdbId": "ImdbID", "tmdbId": "TmdbID"}
movies_columns = {"movieId": "MovieID", "title": "Title", "genres": "Genres"}

links = pd.read_csv("../database/ml-32m/links.csv").rename(columns=links_columns)
movies = pd.read_csv("../database/ml-32m/movies.csv").rename(columns=movies_columns)

#### Problem 1: Missing imdb entries

Some IMDb entries are missing because their links are outdated or the titles have been removed.

In [6]:
# IMDb movie entries table; rows with no Directors value are treated as
# missing in the next cell.
imdb_entries_csv = "../database/imdb/movie_entries.csv"
imdb = pd.read_csv(imdb_entries_csv)

In [23]:
# An IMDb entry counts as missing when its Directors field is empty.
directors_missing = imdb["Directors"].isna()
nan_imdb = imdb.loc[directors_missing, "ImdbID"].to_numpy()

# Map those ImdbIDs back to MovieLens MovieIDs.
nan_id = links.loc[links["ImdbID"].isin(nan_imdb), "MovieID"]

# Attach titles/genres so the affected movies can be identified by eye.
links_and_title = links.merge(movies, on="MovieID", how="left")
missing_imdb = links_and_title.loc[links_and_title["MovieID"].isin(nan_id)]
missing_imdb

Unnamed: 0,MovieID,ImdbID,TmdbID,Title,Genres
705,720,118114,503475.0,Wallace & Gromit: The Best of Aardman Animatio...,Adventure|Animation|Comedy
16391,86668,1347439,241620.0,Louis Theroux: Law & Disorder (2008),Documentary
30274,135539,3416042,335676.0,Aurora (2015),(no genres listed)
73230,237946,10141686,632378.0,American Bullet (2019),Action|Adventure
75263,247468,12446902,811303.0,Audition (2021),(no genres listed)
86271,288099,22811298,1061605.0,Don't Look Deeper (2022),Drama|Sci-Fi


Solution: Manual search

In [46]:
# Drop the entries flagged as missing and overwrite the entries file with
# the remaining rows.
stale_imdb_ids = missing_imdb["ImdbID"].to_numpy()
is_stale = imdb["ImdbID"].isin(stale_imdb_ids)
filter_missing = imdb.loc[~is_stale]
filter_missing.to_csv("../database/imdb/movie_entries.csv", index=False)

In [None]:
# MovieLens MovieID -> replacement ImdbID, found by manual search.
imdb_correction_map = {
    # Wallace & Gromit: The Best of Aardman Animatio...
    720: 31416047,
    # Louis Theroux: Law & Disorder (2008)
    86668: 2585192,
    # Aurora (2015)
    135539: 3411580,
    # Don't Look Deeper (2022)
    288099: 10488234,
}

# MovieIDs with no replacement ImdbID (also removed on tmdb).
imdb_removed_entries = [
    237946,
    247468,
]

Replace wrong ImdbID

In [52]:
def correct(row):
    """Return one links row with its ImdbID replaced by the manual correction, if any.

    ``row`` must unpack into exactly three values: movie id, imdb id, tmdb id.
    The result is a Series labelled with the ``movieId``/``imdbId``/``tmdbId``
    column names so the frame can be written straight back to ``links.csv``.
    """
    raw_movie_id, imdb_id, tmdb_id = row
    movie_key = int(raw_movie_id)
    # Keep the current id when no correction is registered for this movie.
    imdb_id = imdb_correction_map.get(movie_key, imdb_id)
    return pd.Series(
        [movie_key, imdb_id, tmdb_id],
        index=["movieId", "imdbId", "tmdbId"],
    )


corrected_rows = links.apply(correct, axis=1)
# Cast the two id columns back to int64 (row-wise apply may upcast them to
# float) before overwriting the links file.
new_links = corrected_rows.astype({"movieId": "int64", "imdbId": "int64"})
new_links.to_csv("../database/ml-32m/links.csv", index=False)

Add removed entries

In [None]:
# Hand-written replacements for entries that could not be re-linked.
# Each record follows the column order of ``imdb.columns``:
# ['ImdbID', 'Runtime', 'ReleaseDate', 'Rating', 'VoteCount', 'Directors', 'Cast', 'OriginCountries', 'Languages', 'Genres', 'Plot']

# American Bullet (2019)
american_bullet_2019 = [
    10141686,
    "01:32:00",
    "2019",
    None,
    None,
    ["Brando Benetton", "Marielle Woods"],
    ["Dylan Baker", "Spencer Treat"],
    ["United States", "Italy"],
    ["English"],
    ["Action", "War"],
    "A collection of short war tales from young, future storytellers. Each story tackles subjects like war, honor, sacrifice and humanity.",
]

# Audition (2021), source: www.amazon.co.uk
audition_2021 = [
    12446902,
    "01:15:00",
    "2021",
    "9",
    "27",
    ["Edward Whelan"],
    ["Romilly Carboni", "Rose Galbraith", "Jack Firoozan", "Barney Mercer"],
    ["United Kingdom"],
    ["English"],
    ["Drama", "Romance"],
    "Alex, a wide-eyed, ambitious student auditions for a role that could change his life, but to succeed, he must impress two draconian judges at any cost.",
]

custom_imdb_entries = [american_bullet_2019, audition_2021]
new_imdb_entries = pd.DataFrame(data=custom_imdb_entries, columns=imdb.columns)

In [63]:
# Re-read the entries file so the append works on what is currently on disk
# (an earlier cell in this notebook overwrites it).
imdb = pd.read_csv("../database/imdb/movie_entries.csv")

# The hand-written entries are appended after the on-disk rows.
new_imdb = pd.concat([imdb, new_imdb_entries])

# keep="last": on an ImdbID clash the appended custom entry replaces the
# on-disk row. The default (keep="first") would silently discard the custom
# entry whenever a stale row for the same id is still present in the file.
# Without a clash the result is unchanged, and re-running the cell leaves
# the file with the same rows.
new_imdb = new_imdb.drop_duplicates(subset="ImdbID", keep="last")
new_imdb.to_csv("../database/imdb/movie_entries.csv", index=False)

#### Problem 2: Missing tmdb entries

Some TMDB links are missing or do not exist. Use the TMDB API to try to fill in the rest.  
See implementation in [imdb_to_tmdb.py](../imdb_to_tmdb.py).


```python
    converter = Requester()
    links = pd.read_csv("./database/ml-32m/links.csv")
    # Only rows without a TMDB id need to be looked up.
    missing = links[links["tmdbId"].isna()]
    missing_count = len(missing)
    for i, idx in enumerate(missing.index):
        # `idx` is an index *label* taken from `missing.index`, so the row is
        # read by label (.loc), the same way it is written back below.
        # Positional access (.iloc) only matches while the index is the
        # default 0..n-1 range.
        imdb_id = int(links.loc[idx, "imdbId"])
        # The find endpoint is queried with the "tt"-prefixed id, zero-padded
        # to at least 7 digits.
        url = (
            f"https://api.themoviedb.org/3/find/tt{imdb_id:07d}?external_source=imdb_id"
        )
        results = converter.get(url)["movie_results"]
        # Use the first match; with no match the id stays missing.
        tmdb_id = results[0]["id"] if results else None
        print(f"{i + 1}/{missing_count}: {imdb_id} --> {tmdb_id}")
        links.loc[idx, "tmdbId"] = tmdb_id
    links.to_csv("./database/ml-32m/links.csv", index=False)
```
