### Disney Dataset Creation (BS4)

#### Scrape and clean a list of Disney Wikipedia pages to create a dataset for analysis
#### One box

In [1]:
from bs4 import BeautifulSoup as bs
import requests 

In [None]:
# Download the Toy Story 3 Wikipedia article.
page=requests.get('https://en.wikipedia.org/wiki/Toy_Story_3')
# Parse the raw response body; no parser name is passed to bs() here.
soup=bs(page.content)

In [None]:
# Scrape the infobox table
# Look the infobox up through class_='infobox vevent'.
info_box=soup.find(class_='infobox vevent')

# Every <tr> inside the infobox.
info_rows=info_box.find_all('tr')

# Print each row's markup to inspect its structure.
for x in info_rows:
    print(x.prettify())

In [None]:
#Building dictionaries

movie_info={}

def get_content_value(row_data):
    """Extract the text of an infobox cell.

    Returns a list with one string per ``<li>`` element when the cell contains
    list items, otherwise the whole cell text as a single string. In both
    cases non-breaking spaces (``\\xa0``) are replaced by regular spaces.
    """
    def _text_of(node):
        # Join nested text fragments with a space and trim the ends.
        return node.get_text(" ", strip=True).replace("\xa0", " ")

    if not row_data.find("li"):
        return _text_of(row_data)
    return list(map(_text_of, row_data.find_all('li')))


# Walk the infobox rows: row 0 supplies the title, row 1 is skipped, and every
# later row is stored in movie_info as header text -> cell value.
for index, row in enumerate(info_rows):  # enumerate is used so the indices are generated automatically
    if index == 0:
        movie_info['title']=row.find('th').get_text(" ",strip=True)
        
    elif index == 1:
        continue
    else:
        content_key=row.find('th').get_text(" ",strip=True)
        content_value=get_content_value(row.find('td'))
        movie_info[content_key]=content_value

        
# Bare expression: displayed as the cell output when run in a notebook.
movie_info

#### Scraping the infoboxes of all Disney movies

In [None]:
# Download the Wikipedia list of Walt Disney Pictures films and parse it.
page=requests.get('https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films')
soup=bs(page.content)

In [None]:
# Select every <i> element nested inside elements carrying both the
# 'wikitable' and 'sortable' classes.
movies=soup.select('.wikitable.sortable i')
# Show the first match to inspect its markup.
movies[0]

In [None]:
def get_content_value(row_data):
    """Return the cleaned text held by an infobox cell.

    A cell that contains ``<li>`` elements yields a list of strings, one per
    item; any other cell yields a single string. Non-breaking spaces
    (``\\xa0``) are turned into regular spaces.
    """
    if row_data.find("li"):
        texts = []
        for item in row_data.find_all('li'):
            item_text = item.get_text(" ", strip=True)
            texts.append(item_text.replace("\xa0", " "))
        return texts

    cell_text = row_data.get_text(" ", strip=True)
    return cell_text.replace("\xa0", " ")


def get_info_box(url):
    """Download the page at *url* and return its infobox as a dict.

    The infobox is located through ``class_='infobox vevent'``. The first
    table row provides the ``'title'`` entry, the second row is skipped, and
    each remaining row contributes one entry keyed by its ``<th>`` text with
    the value produced by ``get_content_value`` for its ``<td>``.
    """
    response = requests.get(url)
    document = bs(response.content)
    rows = document.find(class_='infobox vevent').find_all('tr')

    movie_info = {}
    if rows:
        movie_info['title'] = rows[0].find('th').get_text(" ", strip=True)

    # rows[1] is deliberately left out; data rows start at index 2.
    for data_row in rows[2:]:
        label = data_row.find('th').get_text(" ", strip=True)
        movie_info[label] = get_content_value(data_row.find('td'))
    return movie_info

In [None]:
from urllib.parse import urljoin

# Fetch the list of Walt Disney Pictures films and collect the links found
# inside italic entries of the sortable wikitables.
page=requests.get('https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films')
soup=bs(page.content)
movies=soup.select('.wikitable.sortable i a')

base_path="https://en.wikipedia.org/"

# One infobox dict per successfully scraped movie page.
movie_info_list=[]

for movie in movies:
    try:
        relative_path=movie['href']
        # urljoin resolves the href against the base, so a leading "/" in the
        # href no longer produces "https://en.wikipedia.org//wiki/...".
        full_path=urljoin(base_path, relative_path)
        # Kept from the original: an anchor without a title attribute raises
        # here, so it is reported and skipped exactly as before.
        title=movie['title']

        movie_info_list.append(get_info_box(full_path))

    except Exception as e:
        # Best-effort crawl: report the failing entry and carry on.
        print(movie.get_text())
        print(e)

#### Save/Reload Movie Data in json file

In [None]:
import json

def save_data(title, data):
    """Write *data* to the file *title* as indented, UTF-8 encoded JSON.

    Args:
        title: path of the file to create or overwrite.
        data: JSON-serializable object to store.
    """
    with open(title,'w',encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII characters as-is instead of
        # writing \uXXXX escapes.
        json.dump(data,f,ensure_ascii=False, indent=2)

In [None]:
import json 

def load_data(title):
    """Read the UTF-8 encoded JSON file *title* and return the decoded object."""
    with open(title, encoding='utf-8') as source:
        raw_text = source.read()
        return json.loads(raw_text)
    
#save_data('disney_data.json',movie_info_list)

## Clean Data

In [None]:
movies_data=load_data('disney_data2.json')

##### Subtasks
- Clean up references [1]
- Convert running time into an integer
- Convert dates into datetime objects
- Split up the long strings - done
- Convert budget and box office to numbers

In [None]:
#Clean up references [1], [2], etc. / duplicated film release dates
#Split up the long strings 

In [None]:
def get_content_value(row_data):
    """Extract the text of an infobox cell.

    Returns:
        * a list with one string per ``<li>`` element when the cell contains
          list items;
        * a list with one string per text fragment when the cell separates its
          entries with ``<br>`` (splits long strings such as cast lists that
          are not marked up as lists);
        * otherwise the whole cell text as a single string.

    Non-breaking spaces (``\\xa0``) are replaced by regular spaces in all three
    cases; the ``<br>`` branch previously skipped this step.
    """
    def _normalize(text):
        return text.replace("\xa0", " ")

    if row_data.find("li"):
        return [_normalize(li.get_text(" ", strip=True)) for li in row_data.find_all('li')]
    if row_data.find('br'):
        return [_normalize(text) for text in row_data.stripped_strings]
    return _normalize(row_data.get_text(" ", strip=True))

def clean_tags(soup):
    """Remove every ``<sup>`` and ``<span>`` element from *soup* in place.

    Per the original author's note this drops reference markers such as
    [1], [2] and the duplicated film release dates.
    """
    unwanted = ["sup", "span"]
    for element in soup.find_all(unwanted):
        element.decompose()
    

def get_info_box(url):
    """Download the page at *url* and return its infobox as a dict.

    The infobox is located through ``class_='infobox vevent'``. The first
    table row provides the ``'title'`` entry. Every later row that has both a
    ``<th>`` and a ``<td>`` contributes one entry keyed by the header text,
    with the value produced by ``get_content_value``. Rows missing either cell
    are skipped; previously a header-only row crashed inside
    ``get_content_value``.

    Args:
        url: address of the article to scrape.

    Returns:
        dict mapping ``'title'`` and each infobox header to its cleaned value.
    """
    page = requests.get(url)
    soup = bs(page.content)

    info_box = soup.find(class_='infobox vevent')
    info_rows = info_box.find_all('tr')

    # Called after the rows are collected, as in the original; the rows are
    # expected to reflect the removal because they belong to the same tree.
    clean_tags(soup)

    movie_info = {}
    for index, row in enumerate(info_rows):
        header = row.find('th')
        if index == 0:
            movie_info['title'] = header.get_text(" ", strip=True)
            continue

        data = row.find('td')
        if header is None or data is None:
            # Not a key/value row (e.g. image or header-only row).
            continue

        content_key = header.get_text(" ", strip=True)
        movie_info[content_key] = get_content_value(data)
    return movie_info

In [None]:
get_info_box('https://en.wikipedia.org/wiki/Toy_Story_3')

In [None]:
#Convert running time into a integer

In [None]:
[movie.get('Running time','N/A') for movie in movies_data]

In [None]:
" 85 minutes"
def minute_to_intiger(running_time):
    """Convert a running-time value such as ``"85 minutes"`` to an int.

    Args:
        running_time: a string, a list of strings (only the first element is
            used), ``'N/A'`` or ``None``.

    Returns:
        The leading whitespace-separated token as an ``int``, or ``None`` when
        the value is missing (``'N/A'``, ``None``, empty list or string) or
        its first token is not an integer. Returning ``None`` for unparsable
        input matches the other converters in this file.
    """
    if running_time is None or running_time == 'N/A':
        return None

    if isinstance(running_time, list):
        if not running_time:
            return None
        running_time = running_time[0]

    # split() without arguments ignores leading whitespace, so " 85 minutes"
    # parses as well as "85 minutes".
    tokens = running_time.split()
    if not tokens:
        return None
    try:
        return int(tokens[0])
    except ValueError:
        return None
    
# Add an integer running time to every movie; movies without a
# 'Running time' entry are passed the 'N/A' sentinel.
for movie in movies_data:
    movie['Running time (int)']=minute_to_intiger(movie.get('Running time','N/A'))

In [None]:
[movie.get('Running time (int)','N/A') for movie in movies_data]

In [None]:
#Convert budget and box office to numbers

In [None]:
import re

# A number: digits, optional ",ddd" groups, then any dots and trailing digits
# (e.g. "12", "1,234,567", "3.5").
number=r"\d+(,\d{3})*\.*\d*"
# Scale words recognised after a number.
amount = r"thousand|million|billion"
# Plain dollar amount: "$" followed by a number.
value_re = rf'\${number}'
# Dollar amount with a scale word: "$", a number, an optional range separator
# ("-", " to " or an en dash), an optional second number, one whitespace
# character and then a scale word.
word_re = rf'\${number}(-|\sto\s|–)?({number})?\s({amount})'

def word_to_value(word):
    """Return the integer multiplier for a scale word.

    Accepts ``'thousand'``, ``'million'`` or ``'billion'``; any other key
    raises ``KeyError``.
    """
    multipliers = {
        'thousand': 10 ** 3,
        'million': 10 ** 6,
        'billion': 10 ** 9,
    }
    return multipliers[word]


def parse_word_syntax(string):
    """Convert text such as ``"$12.5 million"`` to a float.

    The first match of ``number`` in *string* (commas removed) is multiplied
    by the value of the first scale word found, matched case-insensitively.
    """
    numeric_part = re.search(number, string).group().replace(',', '')
    magnitude = float(numeric_part)
    scale_word = re.search(amount, string, flags=re.I).group().lower()
    return magnitude * word_to_value(scale_word)


def parse_value_syntax(string):
    """Convert text such as ``"$1,234,567"`` to a float.

    Uses the first match of ``number`` in *string* with commas removed.
    """
    matched_digits = re.search(number, string).group()
    return float(matched_digits.replace(',', ''))

def money_conversion(money):
    """Convert a scraped money value to a float, or ``None`` if not parsable.

    *money* may be the ``'N/A'`` sentinel (returns ``None``), a string, or a
    list of strings, in which case only the first element is examined. The
    scale-word form (``word_re``) takes precedence over the plain dollar form
    (``value_re``).
    """
    if money == 'N/A':
        return None

    text = money[0] if isinstance(money, list) else money

    # Each candidate pairs a match attempt with the parser for that syntax,
    # listed in order of precedence.
    candidates = (
        (re.search(word_re, text, flags=re.I), parse_word_syntax),
        (re.search(value_re, text), parse_value_syntax),
    )
    for match, parser in candidates:
        if match:
            return parser(match.group())
    return None

In [None]:
# Add float versions of the budget and box-office fields; movies missing
# either field are passed the 'N/A' sentinel.
for movie in movies_data:
    movie['Budget (float)']=money_conversion(movie.get('Budget','N/A'))
    movie['Box office (float)']=money_conversion(movie.get('Box office','N/A'))  

In [2]:
#Convert dates into datetime object

In [None]:
[movie.get('Release date','N/A') for movie in movies_data]

In [None]:
from datetime import datetime

dates=[movie.get('Release date','N/A') for movie in movies_data]

def clean_date(date):
    """Return the part of *date* before the first "(", without surrounding whitespace."""
    return date.split("(")[0].strip()


def date_conversion(date):
    """Convert a scraped release date to a ``datetime``.

    Args:
        date: a date string, a list of date strings (only the first element is
            used), ``"N/A"`` or ``None``.

    Returns:
        A ``datetime`` parsed from the text before any parenthesised suffix,
        or ``None`` when the value is missing or matches none of the formats.
    """
    if isinstance(date, list):
        if not date:
            return None
        date = date[0]
    if date is None or date == "N/A":
        return None
    date_str = clean_date(date)

    # Month-first ("June 18, 2010") and day-first ("18 June 2010") spellings.
    # The day-first form with a comma is kept for backward compatibility.
    fmts = ["%B %d, %Y", "%d %B %Y", "%d %B, %Y"]
    for fmt in fmts:
        try:
            return datetime.strptime(date_str, fmt)
        except ValueError:
            # The text does not match this format; try the next one.
            continue
    return None

In [None]:
# Add a parsed release date to every movie; movies without a
# 'Release date' entry are passed the 'N/A' sentinel.
for movie in movies_data:
    movie['Release date (datetime)']=date_conversion(movie.get('Release date','N/A'))

#### Save new data in `.pickle` format

In [None]:
import pickle

def save_data_pickle(name,data):
    """Serialize *data* with pickle into the file *name*, opened in binary write mode."""
    with open(name, 'wb') as sink:
        pickle.Pickler(sink).dump(data)

In [None]:
import pickle

def load_data_pickle(name):
    """Return the object unpickled from the file *name*, opened in binary read mode.

    NOTE: unpickling can execute arbitrary code; only load files you trust.
    """
    with open(name, 'rb') as source:
        restored = pickle.Unpickler(source).load()
    return restored

In [None]:
save_data_pickle('disney_data3.pickle',movies_data)

In [None]:
movies_data2=load_data_pickle('disney_data3.pickle')

### Attach IMDb/Rotten Tomatoes scores

#### Working with API

In [None]:
import requests
import urllib

In [None]:
# Presumably placeholders standing in for real API keys.
api_key='key'

api_key2='key'


def get_ombd_info(title, api_key='3e689679'):
    """Query the OMDb API by title and return the decoded JSON response.

    Args:
        title: movie title sent as the ``t`` query parameter.
        api_key: value sent as the ``apikey`` query parameter. Defaults to the
            key that used to be hard-coded in the function body, so existing
            one-argument calls behave as before.

    Returns:
        The JSON body of the response, as returned by ``Response.json()``.
    """
    # NOTE(review): the default above is a credential committed to source;
    # consider passing the key in from configuration instead.
    base_url = 'http://www.omdbapi.com/'
    parameters = {'apikey': api_key, 't': title}
    # requests URL-encodes the query string itself, so the function no longer
    # relies on urllib.parse being loaded by a bare `import urllib`.
    return requests.get(base_url, params=parameters).json()

def get_rotten_tomato_score(ombd_info):
    """Return the 'Value' of the first 'Rotten Tomatoes' rating, or ``None``.

    Ratings are read from ``ombd_info['Ratings']``; a missing key is treated
    as an empty list.
    """
    matching_values = (
        entry['Value']
        for entry in ombd_info.get('Ratings', [])
        if entry['Source'] == 'Rotten Tomatoes'
    )
    return next(matching_values, None)

get_ombd_info('avengers endgame')

#get_rotten_tomato_score(info)

In [None]:
# For every movie, query OMDb by title and attach three score fields.
for movie in movies_data2:
    title=movie['title']
    ombd_info = get_ombd_info(title)
    # .get(..., None) leaves the field as None when the response lacks the key.
    movie['imbd'] = ombd_info.get('imdbRating',None)
    movie['metascore']=ombd_info.get('Metascore',None)
    movie['rotten_tomatoes'] = get_rotten_tomato_score(ombd_info)

In [None]:
save_data_pickle('disney_data_final.pickle',movies_data2)

### Saving data as JSON & CSV

In [None]:
movies_data2_copy=[movie.copy() for movie in movies_data2]

In [None]:
# Replace each datetime in the copies with a formatted string before the
# JSON export below; falsy values are stored as None.
for movie in movies_data2_copy:
    current_date=movie['Release date (datetime)']
    if current_date:
        movie['Release date (datetime)'] = current_date.strftime("%B %d, %Y")
    else:
        movie['Release date (datetime)'] = None

In [None]:
import json

def save_data(title, data):
    """Write *data* to the file *title* as UTF-8 encoded JSON.

    Output is indented by two spaces and non-ASCII characters are written
    unescaped.
    """
    dump_options = {'ensure_ascii': False, 'indent': 2}
    with open(title, mode='w', encoding='utf-8') as out:
        json.dump(data, out, **dump_options)

In [None]:
save_data('disney_data_final.json', movies_data2_copy)