In [None]:
#hide

# Reload imported modules automatically before each cell runs, so edits to the
# project's source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# default_exp clean

# Data Cleaning and Feature Engineering
> Cleaning and feature engineering based on the insights gained from the previous step on EDA

In [None]:
#hide
from nbdev.showdoc import *
from fastcore.test import *

In [None]:
#exporti
from pandas import DataFrame
from sklearn.utils import shuffle
from sklearn.preprocessing import MultiLabelBinarizer
from DSAI_proj.extract import *
from DSAI_proj.eda import *
import os
import pandas as pd
import json
import concurrent
import requests
import pandas as pd
import numpy as np

In [None]:
#hide 
# Notebook-only display tweaks: show every row, at most 8 columns, and let
# pandas detect the output width.
display_options = {
    'display.max_rows': None,
    'display.max_columns': 8,
    'display.width': None,
}
for _option, _value in display_options.items():
    pd.set_option(_option, _value)

In [None]:
# Load the raw movie table from the working directory.
movies = pd.read_csv('raw_data.csv')

The first step will be to process the categorical variables of the dataset. In this case, we have genres as a categorical variable. We will use the MultiLabelBinarizer from scikit-learn to one-hot-encode the movie genres. This will create additional columns in our DataFrame, each corresponding to a separate genre type. We will reuse the clean_genre function used in the EDA section. 

In [None]:
# One-hot encode the genres with the clean_genre helper reused from the EDA notebook,
# then preview the first rows of the result.
df = clean_genre(df=movies)
df.head()

Unnamed: 0.1,Unnamed: 0,adult,backdrop_path,belongs_to_collection,...,TV Movie,Thriller,War,Western
0,0,False,/hQ4pYsIbP22TMXOUdSfC2mjWrO0.jpg,,...,0,0,0,0
1,1,False,/l94l89eMmFKh7na2a1u5q67VgNx.jpg,,...,0,0,0,0
2,2,False,/u0zMKKpEdDWpOKmFW2sLbKKICJH.jpg,,...,0,0,0,0
3,3,False,/5aXp2s4l6g5PcMMesIj63mx8hmJ.jpg,,...,0,1,0,0
4,4,False,,,...,0,0,0,0


Next, we will need to extract the images into a separate directory. This will allow for easier access during the training stage. The following code downloads the backdrop and poster images from their URLs into separate directories. Because the work is dominated by network and disk I/O, we can also multithread it. Additionally, since some examples do not have images, we will drop those examples.

In [None]:
#exporti

def req_image(url: str, save_path: str, id_num: int, timeout: float = 30.0):
    """Download one TMDB image and store it as ``<save_path>/<id_num>.jpg``.

    Parameters
    ----------
    url : str
        Image path appended verbatim to the TMDB ``original`` size endpoint.
        A value that is not a non-empty string (e.g. ``None`` or ``NaN`` from
        an empty cell) is treated as a failed download.
    save_path : str
        Directory that receives the image file; it must already exist.
    id_num : int
        Identifier used as the file name and reported back on failure.
    timeout : float, optional
        Seconds to wait for the server before giving up.

    Returns
    -------
    int or None
        ``None`` when the image was written, otherwise ``id_num`` so the
        caller can collect the ids whose image could not be fetched.
    """
    # Skip the request entirely when there is no usable path, instead of
    # asking the server for something like ".../originalnan".
    if not isinstance(url, str) or not url:
        return id_num

    req_url = f"https://image.tmdb.org/t/p/original{url}"
    try:
        # Without a timeout a stalled connection would block its worker forever.
        response = requests.get(req_url, timeout=timeout)
    except requests.RequestException:
        # Report network failures like any other missing image so one bad
        # connection cannot abort a whole batch of downloads.
        return id_num

    if response.status_code != 200:
        return id_num

    fname = os.path.join(save_path, f"{id_num}.jpg")
    with open(fname, "wb") as f:
        f.write(response.content)
    return None

In [None]:
#export

def extract_images_threaded(df: DataFrame,
                            cur_path: str,
                            img_type: list,
                            max_threads: int) -> tuple:
    """Download the images of every row concurrently and drop rows without one.

    For each entry ``t`` of ``img_type`` the column ``f"{t}_path"`` is read,
    every image is fetched with ``req_image`` and stored under
    ``<cur_path>/<t>_img/<id>.jpg``.

    Parameters
    ----------
    df : DataFrame
        Must contain an ``id`` column and one ``<type>_path`` column per
        requested image type.
    cur_path : str
        Directory under which the ``<type>_img`` folders are created.
    img_type : list
        Image type prefixes, e.g. ``["poster", "backdrop"]``.
    max_threads : int
        Upper bound on the number of worker threads.

    Returns
    -------
    tuple
        ``(filtered_df, problem_ids)`` where ``problem_ids`` is the set of ids
        for which at least one image could not be fetched and ``filtered_df``
        is ``df`` without those rows.
    """
    # Imported locally: a bare ``import concurrent`` does not load the
    # ``futures`` submodule.
    from concurrent.futures import ThreadPoolExecutor

    # Never more workers than rows, but at least one because
    # ThreadPoolExecutor rejects ``max_workers=0`` (empty frame).
    n_workers = max(1, min(max_threads, len(df)))
    problem_ids = set()
    for itype in img_type:
        save_path = os.path.join(cur_path, f"{itype}_img")
        os.makedirs(save_path, exist_ok=True)

        with ThreadPoolExecutor(max_workers=n_workers) as executor:
            # Queue every download before reading any result: calling
            # ``.result()`` right after ``submit`` blocks on each request in
            # turn, which makes the downloads run one at a time.
            futures = [executor.submit(req_image, url, save_path, id_num)
                       for url, id_num in zip(df[f"{itype}_path"], df['id'])]
            problem_ids.update(failed for future in futures
                               if (failed := future.result()) is not None)
        print(f"{itype} images written successfully!")
    df = df[~df['id'].isin(problem_ids)]
    return df, problem_ids

In [None]:
# A modest pool is enough for I/O-bound downloads; 1000 threads mostly adds
# overhead and load on the remote host.
df, problem_ids = extract_images_threaded(df=df, cur_path=".", img_type=["poster", "backdrop"], max_threads=32)
print(f"Number of rows dropped due to missing images: {len(problem_ids)}")

Number of rows dropped due to missing images: 1118


Lastly, from the previous notebook on EDA, we have already identified the relevant and irrelevant features required for our tagline prediction task. We will now drop the columns or features that are irrelevant. We can also include the image url paths to be dropped as we have already extracted the necessary images into a separate folder. 

Additionally, we feature engineer the release date by splitting that column into separate year, month and day columns. This generally allows models to process such meta information better. We also bin the year and day as per the EDA; we leave the month as it is, since its values are already small and in the range of the other variables.

Speaking of dropped rows, let's also drop the rows which are missing information in the important columns identified during EDA.

In [None]:
#exporti

def drop_missing(df: DataFrame, cols: list) -> DataFrame:
    """Return ``df`` without the rows that have a missing value in any of ``cols``.

    A new frame is returned and the caller's frame is left untouched, so the
    function can be re-run safely on the same input.

    Parameters
    ----------
    df : DataFrame
        Frame to filter.
    cols : list
        Column labels that must all be non-missing for a row to be kept.

    Returns
    -------
    DataFrame
        The surviving rows, with their original index labels.
    """
    return df.dropna(subset=cols)

def split_datetime(df: DataFrame,
                   date_col: str,
                   day_bins: list = None,
                   year_bins: list = None) -> DataFrame:
    """Replace a date column with binned year, raw month and binned day columns.

    The caller's frame is not modified; a new frame is returned in which
    ``date_col`` is removed and ``<date_col>_year``, ``<date_col>_month`` and
    ``<date_col>_day`` are appended (in that order).

    Parameters
    ----------
    df : DataFrame
        Frame containing ``date_col``.
    date_col : str
        Name of the column holding dates formatted as ``YYYY-MM-DD``.
    day_bins : list, optional
        Bin edges for the day of month. Defaults to ``[0, 10, 20, 31]``.
    year_bins : list, optional
        Bin edges for the year. Defaults to
        ``[1900, 1940, 1960, 1980, 2000, 2020]``.

    Returns
    -------
    DataFrame
        Year and day are categorical bin numbers starting at 1; month is the
        plain calendar month.
    """
    day_bins = [0, 10, 20, 31] if day_bins is None else list(day_bins)
    year_bins = [1900, 1940, 1960, 1980, 2000, 2020] if year_bins is None else list(year_bins)

    dates = pd.to_datetime(df[date_col], format='%Y-%m-%d')

    # pd.cut uses right-closed intervals by default, so bin k covers
    # (edge[k-1], edge[k]]; values outside the outer edges become NaN.
    year_bucket = pd.cut(dates.dt.year, bins=year_bins, labels=list(range(1, len(year_bins))))
    day_bucket = pd.cut(dates.dt.day, bins=day_bins, labels=list(range(1, len(day_bins))))

    # drop() and assign() both return new frames, which keeps the input intact.
    return df.drop(columns=[date_col]).assign(**{
        f"{date_col}_year": year_bucket,
        f"{date_col}_month": dates.dt.month,
        f"{date_col}_day": day_bucket,
    })

In [None]:
#export

def drop_col(data: DataFrame,
             irrelevant_cols: list,
             relevant_cols: list) -> DataFrame:
    """Prune columns and incomplete rows, then expand the release date.

    The steps run in this order:

    1. remove every column listed in ``irrelevant_cols``;
    2. drop the rows with a missing value in any of ``relevant_cols``;
    3. split ``"release_date"`` into year / month / day columns;
    4. discard the first remaining column.

    Parameters
    ----------
    data : DataFrame
        Frame to clean; it must contain a ``release_date`` column.
    irrelevant_cols : list
        Column labels to remove.
    relevant_cols : list
        Column labels that must be non-missing for a row to be kept.

    Returns
    -------
    DataFrame
        The cleaned frame.
    """
    pruned = data.drop(columns=irrelevant_cols)
    complete = drop_missing(df=pruned, cols=relevant_cols)
    dated = split_datetime(df=complete, date_col="release_date")
    # NOTE(review): the first column is removed by position -- presumably a
    # stray index column left over from a CSV round-trip; confirm.
    leading_col = dated.columns[0]
    return dated.drop(columns=leading_col)

In [None]:
# Columns to discard, including the image path columns whose files have
# already been downloaded.
irrelevant_columns = [
    'adult',
    'belongs_to_collection',
    'homepage',
    'imdb_id',
    'production_companies',
    'popularity',
    'original_language',
    'original_title',
    'revenue',
    'runtime',
    'spoken_languages',
    'status',
    'video',
    'vote_average',
    'vote_count',
    'production_countries',
    'budget',
    'poster_path',
    'backdrop_path',
]
# Every other column must be populated for a row to survive.
relevant_columns = list(set(df.columns).difference(irrelevant_columns))
df = drop_col(data=df, irrelevant_cols=irrelevant_columns, relevant_cols=relevant_columns)
df.head()

Unnamed: 0,adult,id,overview,tagline,...,Western,release_date_year,release_date_month,release_date_day
2,False,5,It's Ted the Bellhop's first night on the job....,Twelve outrageous guests. Four scandalous requ...,...,0,4,12,1
3,False,6,"While racing to a boxing match, Frank, Mike, J...",Don't move. Don't whisper. Don't even breathe.,...,0,4,10,2
4,False,8,Timo Novotny labels his new project an experim...,A Megacities remix.,...,0,5,1,1
6,False,11,Princess Leia is captured and held hostage by ...,"A long time ago in a galaxy far, far away...",...,0,3,5,3
7,False,12,"Nemo, an adventurous young clownfish, is unexp...",There are 3.7 trillion fish in the ocean. They...,...,0,5,5,3


The text will not require as much preprocessing as we will be using transformer based models to deal with the text data. We will look into that in greater detail later. Now that we are done with the basic preprocessing and feature engineering, we can finally create our train, validation and test splits in separate csv files. 

Looking at the first 5 rows of the cleaned dataset, we easily see that some example taglines (under the tagline column) are missing. We'll need to separate these examples into a separate csv file as they do not have labels. Additionally, we can create our train, validation and test datasets concurrently and save them into separate csv files. This helps reproducibility later on. We also print the relative proportions of each dataset to see if we will need to redo the extraction process above. 

In [None]:
#export

def create_splits(df: DataFrame,
                  label: str,
                  splits: list,
                  seed: int,
                  keep_missing: bool,
                  save_path: str = "."):
    """Shuffle the labelled rows and write train / validation / test CSV files.

    Rows whose ``label`` value is missing (NaN or an empty string) never enter
    the three splits; with ``keep_missing`` they are saved to ``tagless.csv``.

    Parameters
    ----------
    df : DataFrame
        Cleaned dataset.
    label : str
        Name of the label column.
    splits : list
        Two fractions ``[train, validation]``; the rows left over form the
        test set.
    seed : int
        Random state passed to the shuffle, for reproducible splits.
    keep_missing : bool
        Whether to write the unlabelled rows to ``tagless.csv``.
    save_path : str, optional
        Directory receiving the CSV files; created if it does not exist.

    Returns
    -------
    None
        The splits are written to ``train.csv``, ``valid.csv`` and
        ``test.csv`` inside ``save_path``.
    """
    assert len(splits) == 2, "Train, validation and test splits must be provided, please provide 2 of them as fractions."
    if save_path:
        os.makedirs(save_path, exist_ok=True)

    # An empty label read back from CSV is NaN rather than '', so both forms
    # have to count as "missing"; otherwise unlabelled rows leak into the
    # training data.
    is_missing = df[label].isna() | (df[label] == '')
    if keep_missing:
        unlabelled_df = df[is_missing]
        # NOTE(review): unlike the three splits below, this file is written
        # together with its index column; kept as-is for existing readers.
        unlabelled_df.to_csv(os.path.join(save_path, "tagless.csv"))
        print(f"Tagless set size: {len(unlabelled_df)}")
        print("Tagless dataset created!")

    labelled_df = shuffle(df[~is_missing], random_state=seed).reset_index(drop=True)
    df_size = len(labelled_df)
    valid_start = int(df_size * splits[0])
    test_start = int(df_size * splits[0] + df_size * splits[1])
    train_df = labelled_df.iloc[:valid_start]
    valid_df = labelled_df.iloc[valid_start:test_start]
    test_df = labelled_df.iloc[test_start:]
    print(f"Train set size: {len(train_df)}\nValid set size: {len(valid_df)}\nTest set size: {len(test_df)}")
    train_df.to_csv(os.path.join(save_path, "train.csv"), index=False)
    valid_df.to_csv(os.path.join(save_path, "valid.csv"), index=False)
    test_df.to_csv(os.path.join(save_path, "test.csv"), index=False)
    print("Train, Validation and Test datasets created!")

In [None]:
# 80% train and 10% validation; the rows left over form the test set.
splits = [0.8, 0.1]
label, seed = "tagline", 42
create_splits(df=df, label=label, splits=splits, seed=seed, keep_missing=True)

Tagless set size: 0
Tagless dataset created!
Train set size: 2276
Valid set size: 285
Test set size: 285
Train, Validation and Test datasets created!
