## Connect to Google Drive

In [1]:
# from google.colab import drive
# drive.mount('/content/gdrive')

## Constats

In [2]:
data_folder = 'Data/the-movies-dataset/'
image_folder_150_225 = 'Images_150x225/'
image_folder_300_450 = 'Images_300x450/'
train_folder = 'Train/'
validation_folder = 'Validation/'

image_extensions = ".jpg"
posters_base_http = 'https://image.tmdb.org/t/p/w600_and_h900_bestv2'
posters_base_http_150x225 = 'https://image.tmdb.org/t/p/w150_and_h225_bestv2'
posters_base_http_300x450 = 'https://image.tmdb.org/t/p/w300_and_h450_bestv2'

## Libraries

In [3]:
import os
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
from IPython.display import HTML, display
import tqdm
import urllib
import urllib.request
%matplotlib inline

## Preprocess data

In [4]:
posters_dataset = pd.read_csv(data_folder + 'movies_metadata.csv',  sep=',', low_memory=False)
# posters_dataset.head()

In [5]:
posters_dataset = posters_dataset[['imdb_id', 'original_language', 'original_title', 'title', 'poster_path', 'genres', 'vote_count']]
print(posters_dataset.shape)
posters_dataset.head()

(45466, 7)


Unnamed: 0,imdb_id,original_language,original_title,title,poster_path,genres,vote_count
0,tt0114709,en,Toy Story,Toy Story,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",5415.0
1,tt0113497,en,Jumanji,Jumanji,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",2413.0
2,tt0113228,en,Grumpier Old Men,Grumpier Old Men,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",92.0
3,tt0114885,en,Waiting to Exhale,Waiting to Exhale,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",34.0
4,tt0113041,en,Father of the Bride Part II,Father of the Bride Part II,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'id': 35, 'name': 'Comedy'}]",173.0


In [6]:
posters_dataset = posters_dataset.drop(posters_dataset[(posters_dataset["poster_path"].str.contains('/') == False) 
                                                        | (posters_dataset["poster_path"].isnull() == True)
                                                        | (posters_dataset["title"].isnull() == True)
                                                        | (posters_dataset["imdb_id"].str.startswith('tt') == False)
                                                        | (posters_dataset["vote_count"] < 40)
                                                        | (posters_dataset["genres"].str.contains('\[\{') == False)
                                                      ].index)
posters_dataset = posters_dataset.drop_duplicates(keep=False)
print(posters_dataset.shape)
posters_dataset.sort_values(by=['vote_count'], ascending=False).head(15)

(10419, 7)


Unnamed: 0,imdb_id,original_language,original_title,title,poster_path,genres,vote_count
15480,tt1375666,en,Inception,Inception,/qmDpIHrmpJINaRKAfWQfftjCdyi.jpg,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",14075.0
12481,tt0468569,en,The Dark Knight,The Dark Knight,/1hRoyzDtpgMU7Dz4JF22RANzQO7.jpg,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",12269.0
14551,tt0499549,en,Avatar,Avatar,/kmcqlZGaSh20zpTbuoF0Cdn07dT.jpg,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",12114.0
17818,tt0848228,en,The Avengers,The Avengers,/cezWGskPY5x7GaglTTRN4Fugfb8.jpg,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",12000.0
26564,tt1431045,en,Deadpool,Deadpool,/inVq3FRqcYIRl2la8iZikYYxFNR.jpg,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",11444.0
22879,tt0816692,en,Interstellar,Interstellar,/nBNZadXqJSdt05SHLqgT0HuC5Gm.jpg,"[{'id': 12, 'name': 'Adventure'}, {'id': 18, '...",11187.0
20051,tt1853728,en,Django Unchained,Django Unchained,/5WJnxuw41sddupf8cwOxYftuvJG.jpg,"[{'id': 18, 'name': 'Drama'}, {'id': 37, 'name...",10297.0
23753,tt2015381,en,Guardians of the Galaxy,Guardians of the Galaxy,/y31QB9kn3XSudA15tV7UWQ9XLuW.jpg,"[{'id': 28, 'name': 'Action'}, {'id': 878, 'na...",10014.0
2843,tt0137523,en,Fight Club,Fight Club,/adw6Lq9FiC9zjYEpOqfq03ituwp.jpg,"[{'id': 18, 'name': 'Drama'}]",9678.0
18244,tt1392170,en,The Hunger Games,The Hunger Games,/iLJdwmzrHFjFwI5lvYAT1gcpRuA.jpg,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",9634.0


In [7]:
def add_base_http(link):
    return str(posters_base_http_300x450 + str(link))

In [8]:
def get_first_genre(genres):
    genre = genres.split("}")[0]
    genre_name = genre.split("'name': ")[1][1:-1]
    return genre_name

In [9]:
posters_dataset['poster_path'] = posters_dataset['poster_path'].apply(add_base_http)
posters_dataset['genres'] = posters_dataset['genres'].apply(get_first_genre)
posters_dataset.head()

Unnamed: 0,imdb_id,original_language,original_title,title,poster_path,genres,vote_count
0,tt0114709,en,Toy Story,Toy Story,https://image.tmdb.org/t/p/w300_and_h450_bestv...,Animation,5415.0
1,tt0113497,en,Jumanji,Jumanji,https://image.tmdb.org/t/p/w300_and_h450_bestv...,Adventure,2413.0
2,tt0113228,en,Grumpier Old Men,Grumpier Old Men,https://image.tmdb.org/t/p/w300_and_h450_bestv...,Romance,92.0
4,tt0113041,en,Father of the Bride Part II,Father of the Bride Part II,https://image.tmdb.org/t/p/w300_and_h450_bestv...,Comedy,173.0
5,tt0113277,en,Heat,Heat,https://image.tmdb.org/t/p/w300_and_h450_bestv...,Action,1886.0


In [10]:
posters_dataset.genres.unique()

array(['Animation', 'Adventure', 'Romance', 'Comedy', 'Action', 'Family',
       'History', 'Drama', 'Crime', 'Fantasy', 'Science Fiction', 'Music',
       'Horror', 'Mystery', 'Documentary', 'Thriller', 'War', 'Western',
       'TV Movie'], dtype=object)

In [11]:
posters_dataset['genres'][8001:10418].value_counts()

Drama              2500
Comedy             2272
Action             1537
Horror              778
Adventure           627
Thriller            472
Crime               471
Animation           396
Fantasy             268
Documentary         208
Romance             202
Science Fiction     171
Mystery             137
Family              120
Music                62
War                  56
Western              56
History              49
TV Movie             37
Name: genres, dtype: int64

## Import data

In [16]:
for index, row in tqdm.tqdm(posters_dataset[0:8000].iterrows()):
    poster_url = row['poster_path']
    title = row['title'].replace("/", "-")
    imdb_id = row['imdb_id']
    genre_folder = str(row['genres'])
    if not os.path.exists(image_folder_300_450 + train_folder + genre_folder):
        os.makedirs(image_folder_300_450 + train_folder + genre_folder)
    poster_name = image_folder_300_450 + train_folder + genre_folder + "/" + imdb_id + '-' + title + image_extensions

    urllib.request.urlretrieve(poster_url, poster_name)


0it [00:00, ?it/s][A
1it [00:00,  6.23it/s][A
2it [00:00,  6.60it/s][A
3it [00:00,  7.20it/s][A
4it [00:00,  3.92it/s][A
5it [00:01,  4.58it/s][A
6it [00:01,  5.24it/s][A
7it [00:01,  5.55it/s][A
8it [00:01,  6.02it/s][A
9it [00:01,  6.29it/s][A
10it [00:01,  6.94it/s][A
11it [00:01,  7.21it/s][A
12it [00:01,  7.63it/s][A
13it [00:02,  7.62it/s][A
14it [00:02,  7.97it/s][A
15it [00:02,  8.09it/s][A
16it [00:02,  7.64it/s][A
17it [00:02,  7.56it/s][A
18it [00:02,  7.94it/s][A
19it [00:02,  7.38it/s][A
20it [00:03,  7.52it/s][A
21it [00:03,  6.99it/s][A
22it [00:03,  6.64it/s][A
23it [00:03,  6.93it/s][A
24it [00:03,  7.00it/s][A
25it [00:03,  7.01it/s][A
26it [00:03,  7.00it/s][A
27it [00:04,  7.31it/s][A
28it [00:04,  7.34it/s][A
29it [00:04,  7.61it/s][A
30it [00:04,  7.77it/s][A
31it [00:04,  7.32it/s][A
32it [00:04,  7.47it/s][A
33it [00:04,  7.69it/s][A
34it [00:05,  5.28it/s][A
35it [00:05,  5.72it/s][A
36it [00:05,  5.93it/s][A
37it [00:05,  

591it [01:26,  7.52it/s][A
592it [01:26,  7.64it/s][A
593it [01:26,  7.57it/s][A
594it [01:26,  7.52it/s][A
596it [01:27,  6.78it/s][A
597it [01:27,  7.24it/s][A
598it [01:27,  7.45it/s][A
599it [01:27,  7.45it/s][A
600it [01:27,  7.08it/s][A
601it [01:27,  7.04it/s][A
602it [01:27,  7.21it/s][A
[A