In [1]:
import pandas as pd
import numpy as np
import os as os
import requests
import json
from config import api_key

In [27]:
#Data cleaned
metadata = os.path.join('Resources','cleaned','movies_metadata__clean.csv')
detailed = os.path.join('Resources','cleaned','Detail_Movies.csv')
rotten = os.path.join('Resources','cleaned','rotten_tomatoes_clean.csv')
netflix = os.path.join('Resources','cleaned','netflix_clean.csv')
output = os.path.join('Resources','cleaned','IMBd movies 2.csv')

#Data normal
imbd = os.path.join('Resources','IMDb movies.csv')

#URL fro omdbapi
url = f'http://www.omdbapi.com/?apikey={api_key}&t='

In [3]:
metadata_df = pd.read_csv(metadata)
detailed_df = pd.read_csv(detailed)
rotten_df = pd.read_csv(rotten)
netflix_df = pd.read_csv(netflix)
imbd_df = pd.read_csv(imbd)

In [4]:
#Comparing columns
display(metadata_df.columns)
display(detailed_df.columns)

Index(['Unnamed: 0', 'imdb_id', 'original_title', 'original_language',
       'popularity', 'runtime', 'vote_average', 'vote_count', 'release_date',
       'revenue', 'genres', 'production_companies'],
      dtype='object')

Index(['Unnamed: 0', 'imdb_id', 'original_title', 'genres',
       'original_language', 'popularity', 'production_companies', 'runtime',
       'vote_average', 'vote_count', 'release_date', 'revenue'],
      dtype='object')

In [5]:
#Reordering columns in metadata_df
metadata_df_new = metadata_df[['Unnamed: 0', 'imdb_id', 'original_title', 'genres',
       'original_language', 'popularity', 'production_companies', 'runtime',
       'vote_average', 'vote_count', 'release_date', 'revenue']]

In [6]:
#Comparing DataFrames
display(metadata_df_new.head(3))
display(detailed_df.loc[detailed_df['original_title'] == 'Jumanji'])

Unnamed: 0.1,Unnamed: 0,imdb_id,original_title,genres,original_language,popularity,production_companies,runtime,vote_average,vote_count,release_date,revenue
0,0,tt0114709,Toy Story,Animation,en,21.946943,Pixar Animation Studios,81.0,7.7,5415.0,1995-10-30,373554033.0
1,1,tt0113497,Jumanji,Adventure,en,17.015539,TriStar Pictures,104.0,6.9,2413.0,1995-12-15,262797249.0
2,2,tt0113228,Grumpier Old Men,Romance,en,11.7129,Warner Bros.,101.0,6.5,92.0,1995-12-22,0.0


Unnamed: 0.1,Unnamed: 0,imdb_id,original_title,genres,original_language,popularity,production_companies,runtime,vote_average,vote_count,release_date,revenue
4288,4288,tt0113497,Jumanji,Adventure,en,3.594827,TriStar Pictures,104.0,6.8,2032,15/12/1995,262797249


In [7]:
# Concatenate dataframes without duplicates
meta_com_det = pd.concat([metadata_df_new,detailed_df]).drop_duplicates(subset='imdb_id').reset_index(drop=True)

#Print the rows we deleted
print(f'{(metadata_df_new.shape[0] + detailed_df.shape[0]) - meta_com_det.shape[0]} rows deleted')

123641 rows deleted


In [8]:
meta_com_det.head(2)

Unnamed: 0.1,Unnamed: 0,imdb_id,original_title,genres,original_language,popularity,production_companies,runtime,vote_average,vote_count,release_date,revenue
0,0,tt0114709,Toy Story,Animation,en,21.946943,Pixar Animation Studios,81.0,7.7,5415.0,1995-10-30,373554033.0
1,1,tt0113497,Jumanji,Adventure,en,17.015539,TriStar Pictures,104.0,6.9,2413.0,1995-12-15,262797249.0


In [9]:
#Netflix VS meta_com_det
display(netflix_df.head(2))


Unnamed: 0.1,Unnamed: 0,show_id,type,title,director,release_year,rating,duration,Genre,Genre 2,Genre 3
0,0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby",2019,TV-PG,90 min,Children & Family Movies,Comedies,
1,1,80117401,Movie,Jandino: Whatever it Takes,,2016,TV-MA,94 min,Stand-Up Comedy,,


In [10]:
#Re-structure the Netflix table 
netflix_df.drop(['Genre 2','Genre 3','show_id','type','duration','rating','release_year','director'], axis=1, inplace=True)
netflix_df.rename(columns={"title": "original_title", "Genre": "genres"}, inplace=True)
dict_net = netflix_df.to_dict('records')
netflix_df.head(2)

Unnamed: 0.1,Unnamed: 0,original_title,genres
0,0,Norm of the North: King Sized Adventure,Children & Family Movies
1,1,Jandino: Whatever it Takes,Stand-Up Comedy


In [11]:
#Preparing tables to merge
rotten_df.drop(['Rank','RatingTomatometer','No. of Reviews','Genre 2','Year'], axis=1, inplace=True)
rotten_df.rename(columns={"Title": "original_title", "Genre": "genres"}, inplace=True)
display(rotten_df.head(2))

Unnamed: 0.1,Unnamed: 0,original_title,genres
0,0,Black Panther,action
1,1,Mad Max: Fury Road,action


In [12]:
#Merge Netflix and Rotten dataframes
net_com_rott = pd.concat([netflix_df,rotten_df]).drop_duplicates(subset='original_title').reset_index(drop=True)

In [14]:
#Check if Netflix have movies that imbd dataframe dont have
dict_net_rott = net_com_rott.to_dict('records')

num1 = 0
num2 = 0

for item in dict_net:
    if len(meta_com_det.loc[meta_com_det['original_title'] == item['original_title']]) == 1:
        num1 += 1
    else:
        num2 += 1


print(f'Movies in imbd DataFrame: {num1}')
print(f'Movies not in imbd DataFrame: {num2}')

Movies in imbd DataFrame: 1702
Movies not in imbd DataFrame: 2563


In [23]:
#Get data from imbd dataframe and ombd API

for item in dict_net_rott:

    imbd_item = meta_com_det.loc[meta_com_det['original_title'] == item['original_title']]

    if len(imbd_item) == 1:
        item.update({'imdb_id': imbd_item['imdb_id'].tolist()[0]})
        item.update({'original_language': imbd_item['original_language'].tolist()[0]})
        item.update({'popularity': imbd_item['popularity'].tolist()[0]})
        item.update({'production_companies': imbd_item['production_companies'].tolist()[0]})
        item.update({'runtime': imbd_item['runtime'].tolist()[0]})
        item.update({'vote_average': imbd_item['vote_average'].tolist()[0]})
        item.update({'vote_count': imbd_item['vote_count'].tolist()[0]})
        item.update({'release_date': imbd_item['release_date'].tolist()[0]})
        item.update({'revenue': imbd_item['revenue'].tolist()[0]})
    
    else:
        try:
            response = requests.get(url + item['original_title'])
            data = response.json()
            if data['Response'] == 'True':
                item.update({'imdb_id': data['imdbID']})
                item.update({'original_language': data['Language']})
                item.update({'popularity': np.nan})
                item.update({'production_companies': np.nan})
                item.update({'runtime': data['Runtime']})
                item.update({'vote_average': data['imdbRating']})
                item.update({'vote_count': data['imdbVotes']})
                item.update({'release_date': data['Released']})
                item.update({'revenue': np.nan})
            else:
                continue
        except:
            continue

In [24]:
#Converting dict to dataframe
net_rott_df = pd.DataFrame(dict_net_rott)

#Merge net_com_rott with meta_com_det
all_merge = pd.concat([meta_com_det,net_com_rott]).drop_duplicates(subset='imdb_id').reset_index(drop=True)
all_merge.shape

(250869, 12)

In [25]:
all_merge.head(2)

Unnamed: 0.1,Unnamed: 0,imdb_id,original_title,genres,original_language,popularity,production_companies,runtime,vote_average,vote_count,release_date,revenue
0,0,tt0114709,Toy Story,Animation,en,21.946943,Pixar Animation Studios,81.0,7.7,5415.0,1995-10-30,373554033.0
1,1,tt0113497,Jumanji,Adventure,en,17.015539,TriStar Pictures,104.0,6.9,2413.0,1995-12-15,262797249.0


In [28]:
all_merge.to_csv(output)