In [None]:
# default_exp clean

# Data Cleaning and Feature Engineering
> Cleaning and feature engineering based on the insights gained from the previous step on EDA

Looking at the first 5 rows of the scraped dataset, we can easily see that some examples are missing a tagline (under the `tagline` column). Since these examples have no labels, we'll need to move them into a separate csv file. Additionally, we can create our train, validation and test datasets at the same time and save each of them to its own csv file, which helps reproducibility later on. We also print the size of each dataset so we can check their relative proportions and see whether we need to redo the extraction process above.

In [None]:
#exporti
import ast
import concurrent
import json
import os

import numpy as np
import pandas as pd
import requests
from pandas import DataFrame
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils import shuffle

In [None]:
#export

def create_splits(df: DataFrame,
                  label: str,
                  splits: list,
                  seed: int,
                  keep_missing: bool,
                  save_path: str = "."):
    """Split the labelled rows of `df` into train/valid/test sets and save them as CSV.

    Rows whose `label` value is missing (NaN/None) or an empty string are
    treated as unlabelled and never end up in train/valid/test.

    Args:
        df: Full dataset, labelled and unlabelled rows together.
        label: Name of the column holding the label.
        splits: Two fractions, `[train_fraction, valid_fraction]`; whatever
            remains after those two becomes the test set.
        seed: Random state passed to the shuffle, for reproducible splits.
        keep_missing: If True, the unlabelled rows are written to
            `tagless.csv`; otherwise they are simply discarded.
        save_path: Directory the CSV files are written to. It is created if
            it does not exist yet.

    Returns:
        None. Writes `train.csv`, `valid.csv` and `test.csv` (plus
        `tagless.csv` when `keep_missing` is True) into `save_path` and prints
        the size of each set. The frame index is written as the first CSV column.
    """
    assert len(splits) == 2, "Train, validation and test splits must be provided, please provide 2 of them as fractions."

    # An empty save_path means "current directory"; os.makedirs('') would raise.
    if save_path:
        os.makedirs(save_path, exist_ok=True)

    # A label read back from CSV is NaN rather than '', so both count as missing.
    missing_mask = df[label].isna() | (df[label] == '')

    if keep_missing:
        unlabelled_df = df[missing_mask]
        unlabelled_df.to_csv(os.path.join(save_path, "tagless.csv"))
        print(f"Tagless set size: {len(unlabelled_df)}")
        print("Tagless dataset created!")

    labelled_df = df[~missing_mask]
    df_size = len(labelled_df)
    labelled_df = shuffle(labelled_df, random_state=seed).reset_index(drop=True)

    # Positional boundaries: [0, valid_start) train, [valid_start, test_start) valid, rest test.
    valid_start, test_start = int(df_size*splits[0]), int(df_size*splits[0] + df_size*splits[1])
    train_df = labelled_df.iloc[:valid_start]
    valid_df = labelled_df.iloc[valid_start:test_start]
    test_df = labelled_df.iloc[test_start:]
    print(f"Train set size: {len(train_df)}\nValid set size: {len(valid_df)}\nTest set size: {len(test_df)}")

    train_df.to_csv(os.path.join(save_path, "train.csv"))
    valid_df.to_csv(os.path.join(save_path, "valid.csv"))
    test_df.to_csv(os.path.join(save_path, "test.csv"))
    print("Train, Validation and Test datasets created!")

In [None]:
# Arguments for `create_splits`.
seed = 42              # random state used for the shuffle
label = "tagline"      # column that holds the label
splits = [0.7, 0.15]   # train and validation fractions; the remainder is the test set

# Example invocation, kept commented out; it needs a `movies` DataFrame in scope.
# create_splits(df=movies,
#               label=label,
#               splits=splits,
#               seed=seed,
#               keep_missing=True)

In [None]:

def drop_col(data: DataFrame)->DataFrame:
    """Return a trimmed copy of `data` with `release_date` parsed as a datetime.

    The input frame is left untouched. Three steps are applied in order:
    a fixed set of columns is dropped, `release_date` is parsed with the
    `%Y-%m-%d` format, and finally the first remaining column is dropped.
    """
    unused_columns = [
        'belongs_to_collection', 'homepage', 'imdb_id', 'production_companies',
        'popularity', 'original_language', 'original_title', 'revenue',
        'runtime', 'spoken_languages', 'status', 'video', 'vote_average',
        'vote_count', 'production_countries', 'budget',
    ]
    trimmed = data.drop(columns=unused_columns)
    trimmed = trimmed.assign(
        release_date=pd.to_datetime(trimmed['release_date'], format='%Y-%m-%d'))

    # NOTE(review): the first column is removed by position - presumably the
    # index column written by `to_csv`; confirm against the caller's input.
    leading_column = trimmed.columns[0]
    return trimmed.drop(columns=leading_column)

In [None]:
def clean_genre(df:DataFrame)-> DataFrame:
    """Replace the `genres` column with one sparse indicator column per genre.

    Each `genres` entry is expected to be a list of dicts with a `name` key,
    either already parsed or as its string representation. Missing entries
    (None/NaN) are treated as "no genres".

    Args:
        df: Frame with a `genres` column. It is not modified.

    Returns:
        A new frame without `genres`, joined with one column per distinct
        genre name (as produced by `MultiLabelBinarizer`), aligned on `df.index`.

    Raises:
        ValueError / SyntaxError: if a string entry is not a valid Python literal.
    """
    def _genre_names(raw):
        # Missing value: the movie simply has no genres.
        if raw is None or (isinstance(raw, float) and pd.isna(raw)):
            return []
        # literal_eval only accepts literals, so cell contents can never run
        # arbitrary code the way `eval` would.
        parsed = ast.literal_eval(raw) if isinstance(raw, str) else raw
        return [entry['name'] for entry in parsed]

    genre_lists = [_genre_names(raw) for raw in df['genres']]

    mlb = MultiLabelBinarizer(sparse_output=True)
    indicators = pd.DataFrame.sparse.from_spmatrix(
        mlb.fit_transform(genre_lists),
        index=df.index,
        columns=mlb.classes_)

    # Drop on a copy rather than popping from `df`, so the caller's frame keeps
    # its `genres` column and the function can be called again on it.
    return df.drop(columns='genres').join(indicators)

In [None]:
def extract_backdrop_img(df:DataFrame):
    """Download every movie's backdrop image into the `backdrop_img` directory.

    For each row with a non-missing `backdrop_path`, the image is fetched from
    `https://image.tmdb.org/t/p/original<backdrop_path>` and stored as
    `backdrop_img/<id>.jpg`, relative to the current working directory.
    Rows with a missing path, and downloads answered with an HTTP error
    status, are skipped.

    Args:
        df: Frame with `id` and `backdrop_path` columns.

    Returns:
        None.

    Raises:
        requests.RequestException: on connection failures or timeouts.
    """
    out_dir = "backdrop_img"
    os.makedirs(out_dir, exist_ok=True)

    # Pair ids with paths by position, so a non-default index cannot
    # attach the wrong id (or raise KeyError).
    for movie_id, path in zip(df['id'], df['backdrop_path']):

        # Missing values are usually NaN in pandas, not None.
        if pd.isna(path):
            continue

        response = requests.get("https://image.tmdb.org/t/p/original" + str(path), timeout=30)
        if not response.ok:
            # Do not save an error page under a .jpg name.
            print(f"Skipping backdrop for id {movie_id}: HTTP {response.status_code}")
            continue

        with open(os.path.join(out_dir, str(movie_id) + ".jpg"), "wb") as file:
            file.write(response.content)

    return
    

In [None]:
def extract_poster_img(df:DataFrame):
    """Download every movie's poster image into the `poster_img` directory.

    For each row with a non-missing `poster_path`, the image is fetched from
    `https://image.tmdb.org/t/p/original<poster_path>` and stored as
    `poster_img/<id>.jpg`, relative to the current working directory.
    Rows with a missing path, and downloads answered with an HTTP error
    status, are skipped.

    Args:
        df: Frame with `id` and `poster_path` columns.

    Returns:
        None.

    Raises:
        requests.RequestException: on connection failures or timeouts.
    """
    out_dir = "poster_img"
    os.makedirs(out_dir, exist_ok=True)

    # Pair ids with paths by position, so a non-default index cannot
    # attach the wrong id (or raise KeyError).
    for movie_id, path in zip(df['id'], df['poster_path']):

        # Missing values are usually NaN in pandas, not None.
        if pd.isna(path):
            continue

        response = requests.get("https://image.tmdb.org/t/p/original" + str(path), timeout=30)
        if not response.ok:
            # Do not save an error page under a .jpg name.
            print(f"Skipping poster for id {movie_id}: HTTP {response.status_code}")
            continue

        with open(os.path.join(out_dir, str(movie_id) + ".jpg"), "wb") as file:
            file.write(response.content)

    return