In [27]:
# import modules

import pandas as pd
import numpy as np
import os
from tqdm import tqdm
from PIL import Image
import json

In [28]:
# read files

# Absolute path of the directory the notebook is running from.
current_folder = os.path.abspath('')

# One metadata JSON file per release year, in ascending year order.
maps = [f"{current_folder}/meta/{year}.json" for year in (2008, 2009, 2010)]

def get_meta(map):
    """Read a JSON file and return its decoded content.

    Args:
        map: Path of the JSON file to read. The file is opened in binary
            mode and handed to the json module for decoding.

    Returns:
        The Python object the file's JSON content decodes to.
    """
    with open(map, 'rb') as meta_file:
        return json.load(meta_file)

# Load the three yearly metadata files (in the order listed in `maps`)
# into one DataFrame each.
meta_2008_df, meta_2009_df, meta_2010_df = (
    pd.DataFrame(get_meta(meta_path)) for meta_path in maps
)

# Show (rows, columns) of every yearly frame, one per line.
print(
    meta_2008_df.shape,
    meta_2009_df.shape,
    meta_2010_df.shape,
    sep="\n",
)

(357, 20)
(318, 20)
(302, 20)


In [29]:
# clean and concat data sets

# Stack the (imdbID, Genre) columns of the yearly frames into one frame.
# The 2009, 2008, 2010 order is kept as-is so row positions do not change.
first_df = [meta_2009_df[['imdbID', 'Genre']], meta_2008_df[['imdbID', 'Genre']], meta_2010_df[['imdbID', 'Genre']]]
df = pd.concat(first_df, ignore_index=True)

# One-hot encode the ', '-separated genre string: one 0/1 column per genre.
genres = df['Genre'].str.get_dummies(', ')
df_enc = pd.concat([df['imdbID'], genres], axis=1)

# Drop the indicator column for the 'N/A' placeholder so it is not treated as
# a genre. Only the column is removed: a row whose Genre is just 'N/A' stays
# in the frame with all-zero genre flags.
# errors='ignore' keeps the cell runnable when no row carries 'N/A' (the
# column then does not exist and a plain drop would raise KeyError).
df_enc = df_enc.drop(columns='N/A', errors='ignore')

print(df.shape)
print(df_enc.shape)
df_enc.head()

(977, 2)
(977, 27)


Unnamed: 0,imdbID,Action,Adult,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,...,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
0,tt0499549,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,tt1055369,1,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,tt0417741,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,tt1259571,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,tt1049413,0,0,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
# remove movies without image

# Directory that holds one image per movie, named <imdbID>.jpg.
# Deliberately not called `map`: that name shadows the builtin and is
# rebound by another cell, which would silently change what check_file sees.
image_dir = f"{current_folder}/raw_images/"

def check_file(row):
  """Tell whether the image file for a movie row exists.

  Args:
      row: Row-like object with an 'imdbID' entry (e.g. a DataFrame row).

  Returns:
      bool: True if `<image_dir><imdbID>.jpg` is an existing regular file.
  """
  image_path = f"{image_dir}{row['imdbID']}.jpg"
  return os.path.isfile(image_path)

# Keep only the first row for every imdbID.
unique_movies = df_enc.drop_duplicates(subset='imdbID', keep='first')

# Build the "image exists" mask as a separate boolean Series instead of
# writing a temporary column into the de-duplicated frame; assigning a column
# to that frame can raise SettingWithCopyWarning on pandas versions without
# copy-on-write, and the column had to be dropped again anyway.
has_image = unique_movies.apply(check_file, axis=1)

# Keep only the movies whose image file is present (original index labels
# are preserved).
df = unique_movies[has_image]

print(df.shape)
df.head()

(875, 27)


Unnamed: 0,imdbID,Action,Adult,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,...,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
0,tt0499549,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,tt1055369,1,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,tt0417741,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,tt1259571,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,tt1049413,0,0,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
# write to csv file

# Output directory for the generated CSV files (named data_dir rather than
# `map` so the builtin is not shadowed).
data_dir = f"{current_folder}/data/"

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(data_dir, exist_ok=True)

# Full cleaned table, written with a header row and without the index.
df.to_csv(f"{data_dir}cleaned_genres.csv", index=False)

# split data in train, validate and test
random_seed = 50
train_df = df.sample(frac=0.7, random_state=random_seed)  # 70% of all rows
tmp_df = df.drop(train_df.index)  # the remaining 30%
# 10% of the remaining rows go to test (about 3% of all rows); everything
# left over becomes the validation set.
test_df = tmp_df.sample(frac=0.1, random_state=random_seed)
valid_df = tmp_df.drop(test_df.index)

# The three subsets must partition the cleaned data; this fails if the index
# labels of df are not unique, because drop() would then remove extra rows.
assert len(train_df) + len(valid_df) + len(test_df) == len(df), "split does not partition df"

print("Cleaned =",len(df))
print("Train_df =",len(train_df))
print("Val_df =",len(valid_df))
print("Test_df =",len(test_df))

# The split files are written without header and without index.
train_df.to_csv(f"{data_dir}train.csv", header=False, index=False)
test_df.to_csv(f"{data_dir}test.csv", header=False, index=False)
valid_df.to_csv(f"{data_dir}valid.csv", header=False, index=False)

Cleaned = 875
Train_df = 612
Val_df = 237
Test_df = 26
