In [None]:
# default_exp extract

# Data Extraction and Cleaning

> The TMDB dataset is only accessible through an API. Hence, we will write a few functions that extract the dataset and perform some simple preprocessing before saving it to CSV files.

In [2]:
#hide
from nbdev.showdoc import *
from fastcore.test import *

We first extract the dataset by accessing the TMDB API. Since not every request succeeds, request more movie IDs than the number of examples you actually want, so that the final dataset ends up at roughly the desired size.

In [3]:
#exporti

import concurrent
import concurrent.futures
import os

import pandas as pd
import requests
from pandas import DataFrame
from sklearn.utils import shuffle

def extract_dataset(api_key: str, n: int, timeout: float = 10.0):
    """Fetch the TMDB record for the movie with id ``n``.

    Args:
        api_key: TMDB API key, sent as the ``api_key`` query parameter.
        n: Movie id to request.
        timeout: Seconds to wait for the server before giving up on the request.

    Returns:
        The decoded JSON body when the response status is 200, otherwise
        ``None``. ``None`` is also returned when the request itself fails
        (connection error, timeout) or the body cannot be decoded, so a single
        bad id never aborts a bulk download.
    """
    req_url = f"https://api.themoviedb.org/3/movie/{n}"
    query = {"api_key": api_key, "language": "en-US"}
    try:
        # Without an explicit timeout a stalled connection would block forever.
        response = requests.get(req_url, params=query, timeout=timeout)
    except requests.RequestException:
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError:
        # A 200 response whose body is not valid JSON is treated as a failed request.
        return None

In [4]:
#export

def extract_dataset_threaded(api_key: str, 
                             max_ds_size: int, 
                             max_threads: int) -> DataFrame:
    """Download movie ids ``0 .. max_ds_size - 1`` concurrently into a DataFrame.

    Args:
        api_key: TMDB API key forwarded to ``extract_dataset``.
        max_ds_size: Number of consecutive movie ids to request. Ids whose
            request yields ``None`` are dropped, so the result may have fewer rows.
        max_threads: Upper bound on worker threads; capped at ``max_ds_size``.

    Returns:
        One row per successfully fetched movie, in ascending id order. An empty
        DataFrame is returned when ``max_ds_size`` is not positive.
    """
    if max_ds_size <= 0:
        # Nothing to fetch; ThreadPoolExecutor would reject max_workers=0.
        return pd.DataFrame()

    n_workers = min(max_threads, max_ds_size)
    with concurrent.futures.ThreadPoolExecutor(max_workers=n_workers) as executor:
        # Submit every request before waiting on any of them. Calling
        # .result() right after submit() would serialise the downloads.
        futures = [executor.submit(extract_dataset, api_key, n) for n in range(max_ds_size)]
        movies = [movie for future in futures if (movie := future.result()) is not None]
    return pd.DataFrame.from_records(movies)

Let's run the data extraction functions. 

In [9]:
#hide
# Read the TMDB key from the environment so the secret is never committed with the notebook.
api_key = os.environ.get("TMDB_API_KEY", "")
# Show every row but cap the number of columns so wide frames stay readable.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', 6)
pd.set_option('display.width', None)

In [13]:
# The key is read from the environment at run time instead of being stored in the notebook.
api_key = os.environ.get("TMDB_API_KEY", "")
if not api_key:
    raise RuntimeError("Set the TMDB_API_KEY environment variable before running the extraction.")
max_ds_size = 200 # 2000
max_threads = 10 # 1000
movies = extract_dataset_threaded(api_key=api_key,
                                  max_ds_size=max_ds_size,
                                  max_threads=max_threads)
print(f"Total number of examples downloaded is: {len(movies)}")
movies.head()

Total number of examples downloaded is: 162


Unnamed: 0,adult,backdrop_path,belongs_to_collection,...,video,vote_average,vote_count
0,False,/hQ4pYsIbP22TMXOUdSfC2mjWrO0.jpg,,...,False,6.8,128
1,False,/l94l89eMmFKh7na2a1u5q67VgNx.jpg,,...,False,7.2,126
2,False,/u0zMKKpEdDWpOKmFW2sLbKKICJH.jpg,,...,False,5.7,1910
3,False,/5aXp2s4l6g5PcMMesIj63mx8hmJ.jpg,,...,False,6.5,184
4,False,,,...,False,7.4,14


In [20]:
# Preview the label column of the first few rows.
movies.loc[:, "tagline"].head()

0                                                     
1                                                     
2    Twelve outrageous guests. Four scandalous requ...
3       Don't move. Don't whisper. Don't even breathe.
4                                  A Megacities remix.
Name: tagline, dtype: object

Looking at the first 5 rows of the scraped dataset, we can see that some taglines (the `tagline` column) are missing. Since these examples have no label, we move them into their own CSV file. At the same time, we create the train, validation and test datasets and save each one to a separate CSV file, which helps reproducibility later on. We also print the size of each dataset to check whether the extraction process above needs to be redone with a larger request.

In [14]:
#export

def create_splits(df: DataFrame,
                  label: str,
                  splits: list,
                  seed: int,
                  keep_missing: bool,
                  save_path: str = "."):
    """Shuffle the labelled rows of ``df`` and write train/valid/test CSV files.

    Rows whose ``label`` value is missing (``NaN``/``None``) or an empty string
    are excluded from the splits. When ``keep_missing`` is true they are written
    to ``tagless.csv`` instead of being discarded.

    Args:
        df: Full dataset.
        label: Name of the label column.
        splits: ``[train_fraction, valid_fraction]``; the remainder is the test set.
        seed: Random state passed to the shuffle, for reproducible splits.
        keep_missing: Whether to save the unlabelled rows to ``tagless.csv``.
        save_path: Directory that receives the CSV files; created if absent.

    Raises:
        AssertionError: If ``splits`` does not contain exactly two entries.
        ValueError: If a fraction is negative or the two fractions exceed 1.
    """
    assert len(splits) == 2, "Train, validation and test splits must be provided, please provide 2 of them as fractions."
    train_frac, valid_frac = splits
    # Small tolerance so that pairs such as 0.7 + 0.3 are not rejected by float rounding.
    if train_frac < 0 or valid_frac < 0 or train_frac + valid_frac > 1 + 1e-9:
        raise ValueError(f"Split fractions must be non-negative and sum to at most 1, got {splits}.")

    # An empty save_path means the current directory, which makedirs cannot create.
    if save_path:
        os.makedirs(save_path, exist_ok=True)

    # A label can be absent either as an empty string or as a null value.
    missing_mask = df[label].isna() | (df[label] == '')
    if keep_missing:
        unlabelled_df = df[missing_mask]
        unlabelled_df.to_csv(os.path.join(save_path, "tagless.csv"))
        print(f"Tagless set size: {len(unlabelled_df)}")
        print("Tagless dataset created!")

    labelled_df = df[~missing_mask]
    df_size = len(labelled_df)
    labelled_df = shuffle(labelled_df, random_state=seed).reset_index(drop=True)

    # Boundaries are truncated towards zero, so rounding leftovers go to the test set.
    valid_start, test_start = int(df_size*splits[0]), int(df_size*splits[0] + df_size*splits[1])
    train_df = labelled_df.iloc[:valid_start]
    valid_df = labelled_df.iloc[valid_start:test_start]
    test_df = labelled_df.iloc[test_start:]
    print(f"Train set size: {len(train_df)}\nValid set size: {len(valid_df)}\nTest set size: {len(test_df)}")
    train_df.to_csv(os.path.join(save_path, "train.csv"))
    valid_df.to_csv(os.path.join(save_path, "valid.csv"))
    test_df.to_csv(os.path.join(save_path, "test.csv"))
    print("Train, Validation and Test datasets created!")

In [15]:
# Split configuration: 70% train, 15% validation, the remainder is the test set.
label = "tagline"
seed = 42
splits = [0.7, 0.15]
create_splits(movies, label, splits, seed, keep_missing=True)

Tagless set size: 36
Tagless dataset created!
Train set size: 88
Valid set size: 19
Test set size: 19
Train, Validation and Test datasets created!
