# Data gathering from the Box Office Mojo website and the OMDb API

In [None]:
# scraping movie box-office data from www.boxofficemojo.com

import re
import pandas as pd
import requests
from bs4 import BeautifulSoup

final_list = []

# Scrape the worldwide lifetime-gross chart page by page with BeautifulSoup.
# NOTE(review): the offset starts at 1, so the chart's first entry is never
# fetched and is added by hand further down — confirm whether starting at
# offset 0 would make that manual row unnecessary.
for offset in range(1, 48000, 200):
    page = 'https://www.boxofficemojo.com/chart/ww_top_lifetime_gross/?area=XWW&offset=' + str(offset)
    try:
        resp = requests.get(page)
    except requests.RequestException as err:
        # Keep whatever was collected so far, but say why the scrape stopped.
        print(f'Stopping scrape at offset {offset}: {err}')
        break
    soup = BeautifulSoup(resp.text, 'lxml')
    # Only the first 1600 cells are kept: 200 rows of 8 cells each.
    table_data = [cell.text for cell in soup.select('tr td')[0:1600]]
    # Group the flat cell list into rows of 8 values; a trailing partial row
    # (fewer than 8 cells) is dropped.
    final_list.extend(
        table_data[start:start + 8]
        for start in range(0, len(table_data) - 7, 8)
    )

# Matches the characters stripped from scraped values: '$', ',' and '^'.
# Raw strings keep the backslash escapes from being read as string escapes.
regex = '|'.join([r'\$', ',', r'\^'])

# Build a dataframe from the scraped rows (one list of 8 cell texts per row).
boxoffice_df = pd.DataFrame({
    'rank': [x[0] for x in final_list],  # kept as text here; converted to int below
    'title': [x[1] for x in final_list],  # titles as is
    'Worldwide_Lifetime_Gross': [int(re.sub(regex, '', x[2])) for x in final_list],  # strip symbols, convert to int
    'Domestic_Lifetime_Gross': [re.sub(regex, '', x[3]) for x in final_list],  # strip symbols, keep as text
    'Domestic_%': [re.sub(regex, '', x[4]) for x in final_list],
    'Foreign_Lifetime_Gross': [int(re.sub(regex, '', x[5])) for x in final_list],
    'Foreign_%': [re.sub(regex, '', x[6]) for x in final_list],
    'year': [int(re.sub(regex, '', str(x[7]))) for x in final_list],  # strip symbols, convert to int
})

# Add the first chart row manually. Numeric fields are ints so they match
# the dtypes of the scraped columns instead of mixing str and int.
line1 = pd.DataFrame(
    {
        'rank': '1',
        'title': 'Avatar',
        'Worldwide_Lifetime_Gross': 2847246203,
        'Domestic_Lifetime_Gross': '760507625',
        'Domestic_%': '26.7%',
        'Foreign_Lifetime_Gross': 2086738578,
        'Foreign_%': '73.3%',
        'year': 2009,
    },
    index=[0],
)
boxoffice = pd.concat([line1, boxoffice_df], ignore_index=True)

# Remove thousands separators from the rank, then fix the column dtypes.
# The result is assigned back: an in-place replace on a column selection is
# a chained mutation that may never reach the parent frame.
boxoffice['rank'] = boxoffice['rank'].replace(',', '', regex=True)
boxoffice = boxoffice.astype({
    'rank': int,
    'title': str,
    'Worldwide_Lifetime_Gross': int,
    'Foreign_Lifetime_Gross': int,
    'year': int,
})

# Delete unwanted columns.
boxoffice = boxoffice.drop(columns=['rank', 'Domestic_%', 'Foreign_%'])

# Save to csv.
boxoffice.to_csv('/Users/Amin/Documents/GitHub/Movie_boxoffice_reviews/data/interim/mojo_clean.csv')


In [None]:
# Getting the data for Metacritic from OMDB api
import pandas as pd
import requests

# Read the IMDb ids (titles from 2010 onwards, newest first) from the cleaned
# IMDb file; they are the lookup keys for the API requests.
imdb = pd.read_csv('/Users/Amin/Documents/GitHub/Movie_boxoffice_reviews/data/interim/IMDB_clean.csv')
imdb = imdb[imdb['year'] >= 2010]
imdb = imdb[['imdb_title_id', 'year']]
imdb = imdb.sort_values('year', ignore_index=True, ascending=False)
id_list = imdb['imdb_title_id'].tolist()

# Columns needed for the merge plus the Metascore value. They are fixed up
# front, so the frame has them even when no id is requested or a response
# lacks some of the keys.
omdb_columns = ['imdbID', 'Title', 'Director', 'Year', 'Released', 'Runtime', 'Metascore']

# Request every id from the API and collect the JSON payloads in a list.
# The dataframe is built once afterwards: DataFrame.append no longer exists
# in pandas 2.x, and growing a frame row by row copies it on every step.
# NOTE(review): the API key is hard-coded in the source — consider loading it
# from configuration instead of committing it.
records = []
for imdb_id in id_list:
    r = requests.get('http://www.omdbapi.com/', params={'i': imdb_id, 'apikey': 'e8935e9'})
    records.append(r.json())

# Keys missing from a response become NaN in that row.
omdb = pd.DataFrame(records, columns=omdb_columns)

# Drop rows without a Metascore.
# NOTE(review): OMDb appears to report a missing score as the string 'N/A',
# which dropna does not remove — confirm against real responses.
omdb = omdb.dropna(subset=['Metascore'])


# Save the dataframe to a csv file.
omdb.to_csv('/Users/Amin/Documents/GitHub/Movie_boxoffice_reviews/data/interim/Omdb_clean.csv')
    