# Database Film Disney

**1. Iniziamo raccogliendo da Wikipedia le informazioni su Toy Story 3 e inserendole in un dizionario:**

In [3]:
from bs4 import BeautifulSoup as bs
import requests

In [4]:
# Download the Toy Story 3 article. The timeout keeps the cell from hanging
# indefinitely on a stalled connection, and raise_for_status() reports an HTTP
# error here instead of letting an error page be parsed as if it were the article.
r = requests.get('https://en.wikipedia.org/wiki/Toy_Story_3', timeout=30)
r.raise_for_status()
# NOTE(review): no parser is named, so the result may depend on which parser
# libraries are installed — confirm and pin one if reproducibility matters.
soup = bs(r.content)

In [5]:
def get_content_value(row_data):
    """Extract the text held by one infobox cell.

    Returns a list with one cleaned string per ``<li>`` when the cell contains
    list items; otherwise returns a single cleaned string for the whole cell.
    Cleaning means: text taken with a space separator and ``strip=True``, then
    every non-breaking space removed.
    """
    def clean(node):
        return node.get_text(' ', strip=True).replace('\xa0', '')

    list_items = row_data.find_all('li')
    if list_items:
        return [clean(item) for item in list_items]
    return clean(row_data)

In [6]:
def dictionarize(soup):
    """Turn the infobox of a parsed film page into a dictionary.

    The table with class ``infobox vevent`` is looked up in *soup* and walked
    row by row:

    * row 0: the text of its ``<th>`` is stored under ``'title'``;
    * row 1: skipped (presumably the poster row — TODO confirm);
    * later rows: skipped when they have no ``<th>``; otherwise the header
      text becomes the key and the ``<td>`` content (via
      ``get_content_value``) the value, or ``None`` when there is no ``<td>``.

    Returns the resulting dict. Raises ``AttributeError`` if the infobox table
    (or the first row's header) is missing.
    """
    rows = soup.find('table', attrs={'class': 'infobox vevent'}).find_all('tr')

    movie_info = {}
    for position, row in enumerate(rows):
        header = row.find('th')
        if position == 0:
            movie_info['title'] = header.get_text()
            continue
        if position == 1 or header is None:
            continue

        label = header.get_text(' ', strip=True)
        cell = row.find('td')
        movie_info[label] = None if cell is None else get_content_value(cell)
    return movie_info

**2. Torniamo alla lista di film Disney**

Vogliamo creare una lista di dizionari, ognuno dei quali contenga info/statistiche su un particolare film.

In [7]:
# Download the list of Walt Disney Pictures films. The timeout prevents an
# indefinite hang, and raise_for_status() stops the notebook on an HTTP error
# instead of parsing an error page.
r = requests.get('https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films', timeout=30)
r.raise_for_status()
page = bs(r.content)

Facciamolo per i film della prima tabella della pagina (le tabelle da elaborare saranno 9 in totale):

In [8]:
# Collect every <table> of the list page and keep only the first one.
page_tables = page.find_all('table')
first_table = page_tables[0]

In [9]:
# Relative URL of each film in the first table (the header row is skipped).
# Rows that contain no italicised link are ignored rather than raising
# IndexError on the `[0]` lookup.
movie_links = []
for row in first_table.select('tr')[1:]:
    anchors = row.select('td i a')
    if anchors:
        movie_links.append(anchors[0]['href'])

In [149]:
# Scrape the infobox of every film linked from the first table.
Movies = []
for movie in movie_links:
    link = 'https://en.wikipedia.org' + movie
    # Bound each request and fail on HTTP errors, so a stalled or missing page
    # is reported at the request rather than as a parsing error further down.
    response = requests.get(link, timeout=30)
    response.raise_for_status()
    soup = bs(response.content)
    Movies.append(dictionarize(soup))

Ripetiamo il processo per tutte le tabelle contenenti i film già usciti (9 in totale):

In [213]:
# Only the leading tables are processed; per the notebook text these are the
# nine tables that list films already released.
released_table_count = 9
tables = page.find_all('table')[:released_table_count]

In [214]:
# Rebuild Movies from scratch: one infobox dictionary per film linked from
# any of the selected tables, in table order and then row order.
Movies = []
for table in tables:
    # Gather the relative film URLs of this table (the header row is skipped).
    movie_links = []
    for row in table.select('tr')[1:]:
        # Evaluate the selector once; rows without an italicised link are skipped.
        anchors = row.select('td i a')
        if anchors:
            movie_links.append(anchors[0]['href'])
    for movie in movie_links:
        link = 'https://en.wikipedia.org' + movie
        # Bound each request and fail on HTTP errors, so a stalled or missing
        # page does not hang the long scrape or get parsed as a film page.
        response = requests.get(link, timeout=30)
        response.raise_for_status()
        soup = bs(response.content)
        Movies.append(dictionarize(soup))

Salviamo i dati in un file JSON:

In [10]:
import json

def save_data(title, data):
    """Write *data* as JSON to the file at path *title*.

    The file is opened in write mode (existing content is replaced) with UTF-8
    encoding. Output is indented by two spaces and non-ASCII characters are
    written as-is rather than escaped.
    """
    with open(title, mode='w', encoding='utf-8') as sink:
        json.dump(data, sink, indent=2, ensure_ascii=False)

def load_data(title):
    """Read the UTF-8 JSON file at path *title* and return the decoded object."""
    with open(title, encoding='utf-8') as source:
        return json.loads(source.read())

In [222]:
# Persist the freshly scraped records before any cleaning is applied.
save_data(title='Disney_data.json', data=Movies)

**3. Pulizia dati**

- Eliminiamo i campi senza valore e ci sbarazziamo dei riferimenti a piè di pagina (come [1], [2]):

In [302]:
# Remove, in place, every field whose scraped value is None.
for movie in Movies:
    # Collect the keys first so the dict is not resized while being iterated.
    empty_keys = [key for key, value in movie.items() if value is None]
    for key in empty_keys:
        del movie[key]

In [393]:
import re

def remove_tags(string):
    """Return *string* without bracketed footnote markers.

    Any non-empty bracketed group that contains no nested brackets is removed,
    so multi-character markers such as ``[12]`` or ``[nb 1]`` are handled as
    well as ``[1]``. The result is stripped of leading/trailing whitespace.
    """
    return re.sub(r'\[[^\[\]]+\]', '', string).strip()
# Apply remove_tags to every string field, and to every element of every
# list field, of each record. Other value types are left untouched.
for movie in Movies:
    for field in list(movie):
        raw = movie[field]
        if type(raw) is str:
            movie[field] = remove_tags(raw)
        elif type(raw) is list:
            movie[field] = [remove_tags(entry) for entry in raw]

- Togliamo la data tra parentesi:

In [348]:
def drop_date(string):
    """Remove the first parenthesised group from *string*.

    The string is split on the first two parenthesis characters. The text
    before the first one is kept verbatim and the text after the second one is
    appended after stripping, e.g.
    ``'June 18, 2010 ( 2010-06-18 ) (US)'`` -> ``'June 18, 2010 (US)'``.

    A string without any parenthesis is returned unchanged.
    """
    # Only '(' and ')' are delimiters; a literal '|' must not split the text.
    parts = re.split(r'[()]', string, maxsplit=2)
    if len(parts) == 1:
        # Nothing to drop: returning head + tail here would duplicate the text.
        return string
    return parts[0] + parts[-1].strip()

In [391]:
# Strip the parenthesised date from the release-date fields of every record.
for movie in Movies:
    for key in ('Release date', 'Release dates'):
        value = movie.get(key)
        if isinstance(value, str):
            if '(' in value:
                movie[key] = drop_date(value).strip()
        elif isinstance(value, list):
            # Entries without a parenthesis are kept unchanged, matching the
            # string branch above, instead of being dropped from the list.
            movie[key] = [
                drop_date(item).strip() if '(' in item else item
                for item in value
            ]
    

In [395]:
# Checkpoint: store the records after the footnote and release-date cleanup.
save_data(title='Disney_data.json', data=Movies)

- Trasformiamo la durata in int:

In [470]:
# Replace 'Running time' with the integer formed by its leading digits.
# For a list value only the first element is considered.
for movie in Movies:
    running_time = movie.get('Running time')
    if type(running_time) is list:
        running_time = running_time[0]
    elif type(running_time) is not str:
        # Field missing or already converted: nothing to do.
        continue
    # Splitting on non-digits leaves the leading digit run as first element.
    movie['Running time'] = int(re.split(r'\D', running_time)[0])

In [475]:
# Checkpoint: store the records after the running-time conversion.
save_data(title='Disney_data.json', data=Movies)

- Trasformiamo Budget e Box office in int o float:

**Budget**

In [66]:
# Reload the records from the JSON checkpoint before working on 'Budget'.
Movies = load_data(title='Disney_data.json')

In [499]:
# Inspection step: list index and value of every 'Budget' whose first element
# (first character for strings, first item for lists) is not '$'.
for position, movie in enumerate(Movies):
    if 'Budget' not in movie:
        continue
    if movie['Budget'][0] != '$':
        print(position, movie['Budget'])

40 under $1 million  or $1,250,000
122 AU$1 million
164 A$8.7 million
172 60 million Norwegian Kroner (around $8.7 million in 1989)
316 ¥ 2.4 billion US$24 million
370 ¥ 3.4 billion ( US$ 34 million)
390 ₹26 crore
397 ['$410.6 million (gross)', '$378.5 million (net)']
404 ['$306.6 million (gross)', '$263.7 million (net)']
427 ₹ 23 crore (US$2.9million)
437 ₹50 crore
449 ₹ 70 crore (US$8.8million)
456 ₹ 131 crore
457 ~$8 million ₽ 370 million
489 ₽650 million
497 ₽454 million
504 ~$70 million [nb 1]


In [67]:
# Hand-picked budgets for the irregular entries found by the inspection step,
# keyed by position in Movies. The trailing comment quotes the raw value that
# was printed for that index. NOTE(review): non-USD amounts appear to have been
# converted to USD by hand — confirm the exchange rates used.
budget_overrides = {
    40: 1250000,     # under $1 million  or $1,250,000
    122: 650000,     # AU$1 million
    164: 5630000,    # A$8.7 million
    172: 8700000,    # 60 million Norwegian Kroner (around $8.7 million in 1989)
    316: 24000000,   # ¥ 2.4 billion US$24 million
    370: 34000000,   # ¥ 3.4 billion ( US$ 34 million)
    390: 3130000,    # ₹26 crore
    397: 378500000,  # ['$410.6 million (gross)', '$378.5 million (net)']
    404: 263700000,  # ['$306.6 million (gross)', '$263.7 million (net)']
    427: 2900000,    # ₹ 23 crore (US$2.9million)
    437: 6020000,    # ₹50 crore
    449: 8800000,    # ₹ 70 crore (US$8.8million)
    456: 15800000,   # ₹ 131 crore
    457: 8000000,    # ~$8 million ₽ 370 million
    489: 7140000,    # ₽650 million
    497: 4990000,    # ₽454 million
    504: 70000000,   # ~$70 million [nb 1]
}
for position, amount in budget_overrides.items():
    Movies[position]['Budget'] = amount

In [68]:
# Convert string budgets that mention 'million' into an integer amount.
for movie in Movies:
    budget = movie.get('Budget')
    if type(budget) is str and 'million' in budget:
        # The first character is dropped, then the text is cut at every
        # non-digit: the first digit run is taken as the integer part and the
        # second as the fractional part (empty when the number has no decimals).
        # NOTE(review): a value shaped like '$15–20 million' would therefore be
        # read as 15.20 — confirm no such ranges are present in the data.
        digit_runs = re.split(r'\D', budget[1:])
        millions = float(digit_runs[0]) + float('0.' + digit_runs[1])
        movie['Budget'] = round(millions * 1000000)

In [78]:
# Any budget that is still a string is reduced to its digits and parsed as an
# integer (e.g. thousands separators and the currency sign are discarded).
for movie in Movies:
    budget = movie.get('Budget')
    if type(budget) is str:
        digits_only = re.sub(r'\D', '', budget)
        movie['Budget'] = int(digits_only)

In [79]:
# Checkpoint: store the records after the budget conversion.
save_data(title='Disney_data.json', data=Movies)

**Box Office**

In [91]:
# Reload the records from the JSON checkpoint before working on 'Box office'.
Movies = load_data(title='Disney_data.json')

In [83]:
# Inspection step: list index and value of every 'Box office' whose first
# element (first character for strings, first item for lists) is not '$'.
for position, movie in enumerate(Movies):
    if 'Box office' not in movie:
        continue
    if movie['Box office'][0] != '$':
        print(position, movie['Box office'])

4 >$1.3 million (est. United States/Canada rentals, 1941)
17 ['$2.4 million (1951, domestic)', '$3.5 million (1974, domestic)']
41 Original release : $2.6 million (est. US/ Canada rentals)  1969 re-release : $2.3 million  (US/ Canada rentals)
68 est. $1,600,000 (US/ Canada)
71 est. $3,500,000 (US/ Canada)
165 >$121 million
316 ¥ 23.2 billion US$236 million (worldwide)
344 3 130 137
370 US$ 204.8 million
427 est. ₹ 79.43 crore (US$9.9million)
437 est. ₹167crore
449 ₹2,024 crore ( US$340 million )
456 ₹ 86.85 crore
489 ['₽2.046 billion', '$28.3 million (Worldwide)', '$28.6 million (Russia/ CIS )']
497 ['₽2.196 billion', '$27 million']


In [93]:
# Normalise the irregular box-office entries found by the inspection step so
# that each one becomes a single string starting with '$'.

# These two values only carry a one-character prefix before the '$'.
for position in (4, 165):
    Movies[position]['Box office'] = Movies[position]['Box office'][1:]

# The remaining ones are replaced by a hand-picked string, keyed by position
# in Movies. NOTE(review): non-USD amounts appear to have been converted to
# USD by hand — confirm the figures.
box_office_overrides = {
    17: '$2.4 million',
    41: '$2.6 million',
    68: '$1,600,000',
    71: '$3,500,000',
    316: '$236 million',
    344: '$3130137',
    370: '$204.8 million',
    427: '$9.9 million',
    437: '$20.1 million',
    449: '$340 million',
    456: '$10.36 million',
    489: '$28.3 million',
    497: '$27 million',
}
for position, gross in box_office_overrides.items():
    Movies[position]['Box office'] = gross

In [95]:
# Convert string box-office values that mention 'million' into an integer
# amount. A record that fails to convert is reported (index and error) and
# left unchanged, so one bad value does not stop the loop.
for position, movie in enumerate(Movies):
    try:
        gross = movie.get('Box office')
        if type(gross) is str and 'million' in gross:
            # Drop the first character, cut at every non-digit: first digit
            # run = integer part, second digit run = fractional part.
            digit_runs = re.split(r'\D', gross[1:])
            millions = float(digit_runs[0]) + float('0.' + digit_runs[1])
            movie['Box office'] = round(millions * 1000000)
    except Exception as err:
        print(position, err)

In [99]:
# Convert string box-office values that mention 'billion' into an integer
# amount. A record that fails to convert is reported (index and error) and
# left unchanged, so one bad value does not stop the loop.
for position, movie in enumerate(Movies):
    try:
        gross = movie.get('Box office')
        if type(gross) is str and 'billion' in gross:
            # Drop the first character, cut at every non-digit: first digit
            # run = integer part, second digit run = fractional part.
            digit_runs = re.split(r'\D', gross[1:])
            billions = float(digit_runs[0]) + float('0.' + digit_runs[1])
            movie['Box office'] = round(billions * 1000000000)
    except Exception as err:
        print(position, err)

In [100]:
# Checkpoint: store the records after the box-office conversion.
save_data(title='Disney_data.json', data=Movies)