In [None]:
# default_exp clean

# Data Cleaning and Feature Engineering
> Cleaning and feature engineering based on the insights gained from the previous step on EDA

In [None]:
#exporti
from pandas import DataFrame
from sklearn.utils import shuffle
from sklearn.preprocessing import MultiLabelBinarizer
from DSAI_proj.extract import *
import os
import pandas as pd
import json
import concurrent
import requests
import pandas as pd
import numpy as np

In [None]:
#hide 
# Read the API key from the environment instead of committing it with the notebook.
# NOTE(review): the key that was previously hard-coded here has been published and
# should be revoked. The variable name TMDB_API_KEY assumes this is a TMDB key
# (the image URLs used later point at image.tmdb.org) -- confirm.
api_key = os.environ["TMDB_API_KEY"]
max_ds_size = 20
max_threads = 8
movies = extract_dataset_threaded(api_key=api_key,
                                  max_ds_size=max_ds_size,
                                  max_threads=max_threads,
                                  save_path='.',
                                  fname='raw_data.csv')
# Display settings for the previews below: show every row, at most 10 columns,
# and let pandas auto-detect the output width.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', 10)
pd.set_option('display.width', None)

The first step will be to process the categorical variables of the dataset. In this case, we have genres as a categorical variable. We will use the MultiLabelBinarizer from scikit-learn to one-hot-encode the movie genres. This will create additional columns in our DataFrame, each corresponding to a separate genre type. 

In [None]:
#export

def clean_genre(df: DataFrame)-> DataFrame:
    """One-hot encode the ``genres`` column into one indicator column per genre.

    Each entry of ``df['genres']`` must be an iterable of mappings that carry a
    ``'name'`` key; the names are binarized with ``MultiLabelBinarizer``.

    Args:
        df: Movie data containing a ``genres`` column. It is NOT modified, so
            the calling cell can safely be re-run on the same frame.

    Returns:
        A new DataFrame without the ``genres`` column, joined (on the index)
        with one sparse indicator column per distinct genre name.
    """
    # Reduce every movie's list of genre records to a plain list of genre names.
    genre_names = [[genre['name'] for genre in genre_dicts] for genre_dicts in df['genres']]

    mlb = MultiLabelBinarizer(sparse_output=True)
    genre_indicators = pd.DataFrame.sparse.from_spmatrix(
        mlb.fit_transform(genre_names),
        index=df.index,
        columns=mlb.classes_)

    # drop() returns a copy, so the caller's frame keeps its 'genres' column.
    return df.drop(columns='genres').join(genre_indicators)

In [None]:
# One-hot encode the genres of the extracted movies, then preview the first rows.
df = clean_genre(movies)
df.head(n=5)

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,homepage,...,Horror,Mystery,Romance,Science Fiction,Thriller
0,False,/hQ4pYsIbP22TMXOUdSfC2mjWrO0.jpg,,0,,...,0,0,0,0,0
1,False,/l94l89eMmFKh7na2a1u5q67VgNx.jpg,,0,,...,0,0,0,0,0
2,False,/u0zMKKpEdDWpOKmFW2sLbKKICJH.jpg,,4000000,,...,0,0,0,0,0
3,False,/5aXp2s4l6g5PcMMesIj63mx8hmJ.jpg,,21000000,,...,0,0,0,0,1
4,False,,,42000,http://www.lifeinloops.com,...,0,0,0,0,0


Next, we will need to extract the images into a separate directory. This will allow for easier access during the training stage. The following code downloads the backdrop and poster images from their URLs into separate directories.

In [None]:
#export

def extract_backdrop_img(df: DataFrame, save_path: str):
    """Download each movie's backdrop image into ``<save_path>/backdrop_img``.

    Images are fetched from ``https://image.tmdb.org/t/p/original<backdrop_path>``
    and stored as ``<id>.jpg``. Rows whose ``backdrop_path`` is missing
    (``None`` or NaN) are skipped, and so are downloads that do not return a
    successful HTTP status, so no error page is ever saved as an image.

    Args:
        df: DataFrame with ``id`` and ``backdrop_path`` columns.
        save_path: Directory under which ``backdrop_img`` is created.

    Returns:
        None.
    """
    out_dir = os.path.join(save_path, "backdrop_img")
    os.makedirs(out_dir, exist_ok=True)

    # Pair ids with paths positionally: looking the id up by a running counter
    # only works when the index happens to be 0..n-1.
    for movie_id, img_path in zip(df['id'], df['backdrop_path']):
        # pd.isna catches NaN as well as None; NaN would otherwise be
        # stringified into the URL as "nan".
        if pd.isna(img_path):
            continue
        response = requests.get("https://image.tmdb.org/t/p/original"+str(img_path))
        if not response.ok:
            print(f"Skipping backdrop for id {movie_id}: HTTP {response.status_code}")
            continue
        with open(os.path.join(out_dir, str(movie_id)+".jpg"), "wb") as f:
            f.write(response.content)
    print("All backdrop images written successfully!")
    return

def extract_poster_img(df: DataFrame, save_path: str):
    """Download each movie's poster image into ``<save_path>/poster_img``.

    Images are fetched from ``https://image.tmdb.org/t/p/original<poster_path>``
    and stored as ``<id>.jpg``. Rows whose ``poster_path`` is missing
    (``None`` or NaN) are skipped, and so are downloads that do not return a
    successful HTTP status, so no error page is ever saved as an image.

    Args:
        df: DataFrame with ``id`` and ``poster_path`` columns.
        save_path: Directory under which ``poster_img`` is created.

    Returns:
        None.
    """
    out_dir = os.path.join(save_path, "poster_img")
    os.makedirs(out_dir, exist_ok=True)

    # Pair ids with paths positionally: looking the id up by a running counter
    # only works when the index happens to be 0..n-1.
    for movie_id, img_path in zip(df['id'], df['poster_path']):
        # pd.isna catches NaN as well as None; NaN would otherwise be
        # stringified into the URL as "nan".
        if pd.isna(img_path):
            continue
        response = requests.get("https://image.tmdb.org/t/p/original"+str(img_path))
        if not response.ok:
            print(f"Skipping poster for id {movie_id}: HTTP {response.status_code}")
            continue
        with open(os.path.join(out_dir, str(movie_id)+".jpg"), "wb") as f:
            f.write(response.content)
    print("All poster images written successfully!")
    return

In [None]:
# Download backdrops and posters into sub-directories of the current folder.
extract_backdrop_img(df, ".")
extract_poster_img(df, ".")

All backdrop images written successfully!
All poster images written successfully!


Lastly, from the previous notebook on EDA, we have already identified which features are relevant and which are irrelevant to our tagline prediction task. We will now drop the irrelevant columns. We can also drop the image URL paths, since we have already extracted the necessary images into separate folders.

In [None]:
#export

def drop_col(data: DataFrame,
             irrelevant_cols: list) -> DataFrame:
    """Remove unwanted columns and parse ``release_date`` into datetimes.

    Args:
        data: Input frame; it is left untouched (``drop`` returns a new frame).
        irrelevant_cols: Column labels to remove.

    Returns:
        A new DataFrame without ``irrelevant_cols``, with ``release_date``
        parsed using the ``%Y-%m-%d`` format, and with whichever column ended
        up first after the removal dropped as well.
    """
    trimmed = data.drop(columns=irrelevant_cols)
    parsed_dates = pd.to_datetime(trimmed['release_date'], format='%Y-%m-%d')
    trimmed['release_date'] = parsed_dates
    # NOTE(review): the leading column is dropped by position, not by name --
    # in this notebook's run it appears to be 'adult'; confirm this is intended.
    leading_col = trimmed.columns[0]
    return trimmed.drop(columns=leading_col)

In [None]:
# Columns to discard: the features named here plus the two image-path columns.
irrelevant_columns = [
    'belongs_to_collection',
    'homepage',
    'imdb_id',
    'production_companies',
    'popularity',
    'original_language',
    'original_title',
    'revenue',
    'runtime',
    'spoken_languages',
    'status',
    'video',
    'vote_average',
    'vote_count',
    'production_countries',
    'budget',
    'poster_path',
    'backdrop_path',
]
df = drop_col(df, irrelevant_columns)
df.head(n=5)

Unnamed: 0,id,overview,release_date,tagline,title,...,Horror,Mystery,Romance,Science Fiction,Thriller
0,2,Taisto Kasurinen is a Finnish coal miner whose...,1988-10-21,,Ariel,...,0,0,0,0,0
1,3,"An episode in the life of Nikander, a garbage ...",1986-10-17,,Shadows in Paradise,...,0,0,0,0,0
2,5,It's Ted the Bellhop's first night on the job....,1995-12-09,Twelve outrageous guests. Four scandalous requ...,Four Rooms,...,0,0,0,0,0
3,6,"While racing to a boxing match, Frank, Mike, J...",1993-10-15,Don't move. Don't whisper. Don't even breathe.,Judgment Night,...,0,0,0,0,1
4,8,Timo Novotny labels his new project an experim...,2006-01-01,A Megacities remix.,Life in Loops (A Megacities RMX),...,0,0,0,0,0


The text will not require much preprocessing, as we will be using transformer-based models to deal with the text data. We will look into that in greater detail later. Now that we are done with the basic preprocessing and feature engineering, we can finally create our train, validation and test splits in separate csv files.

Looking at the first 5 rows of the cleaned dataset, we can easily see that some taglines (under the tagline column) are missing. We'll need to move these examples into a separate csv file as they do not have labels. Additionally, we can create our train, validation and test datasets at the same time and save them into separate csv files. This helps reproducibility later on. We also print the size of each dataset to see if we will need to redo the extraction process above.

In [None]:
#export

def create_splits(df: DataFrame,
                  label: str,
                  splits: list,
                  seed: int,
                  keep_missing: bool,
                  save_path: str = "."):
    """Shuffle the labelled rows and write train/valid/test CSV files.

    A row counts as unlabelled when its ``label`` value is an empty string or
    is missing (NaN/None). Unlabelled rows never enter the three splits.

    Args:
        df: Cleaned dataset.
        label: Name of the label column.
        splits: Two fractions ``[train, valid]``; the rest becomes the test set.
        seed: Random state passed to ``shuffle``.
        keep_missing: When True, unlabelled rows are written to ``tagless.csv``.
        save_path: Output directory; created if it does not exist.

    Raises:
        AssertionError: If ``splits`` does not contain exactly two entries.
    """
    assert len(splits) == 2, "Train, validation and test splits must be provided, please provide 2 of them as fractions."

    # Create the output directory up front, as the image-extraction helpers do.
    # An empty save_path means "current directory" and needs no creation.
    if save_path:
        os.makedirs(save_path, exist_ok=True)

    # Treat NaN/None like '' so that unlabelled rows cannot leak into the splits.
    is_missing = df[label].isna() | (df[label] == '')
    if keep_missing:
        unlabelled_df = df[is_missing]
        unlabelled_df.to_csv(os.path.join(save_path, "tagless.csv"))
        print(f"Tagless set size: {len(unlabelled_df)}")
        print("Tagless dataset created!")
    labelled_df = df[~is_missing]
    df_size = len(labelled_df)
    labelled_df = shuffle(labelled_df, random_state=seed)

    # Positional boundaries: [0, valid_start) train, [valid_start, test_start) valid, rest test.
    valid_start, test_start = int(df_size*splits[0]), int(df_size*splits[0] + df_size*splits[1])
    train_df = labelled_df.iloc[:valid_start]
    valid_df = labelled_df.iloc[valid_start:test_start]
    test_df = labelled_df.iloc[test_start:]
    print(f"Train set size: {len(train_df)}\nValid set size: {len(valid_df)}\nTest set size: {len(test_df)}")
    train_df.to_csv(os.path.join(save_path, "train.csv"))
    valid_df.to_csv(os.path.join(save_path, "valid.csv"))
    test_df.to_csv(os.path.join(save_path, "test.csv"))
    print("Train, Validation and Test datasets created!")

In [None]:
# Fractions for the train and validation sets; the remainder becomes the test set.
splits = [0.7, 0.15]
label = "tagline"
seed = 42
create_splits(df, label, splits, seed, keep_missing=True)

Tagless set size: 3
Tagless dataset created!
Train set size: 8
Valid set size: 2
Test set size: 2
Train, Validation and Test datasets created!
