## Scrape info box of Toy story

## Scrape info box of every Disney movie

In [8]:
from bs4 import BeautifulSoup as bs
import requests
import json
# python -m pip install "pymongo[srv]"==3.11

In [19]:
import re

def clean_tag(soup: bs):
    for tag in soup.find_all(['sup', 'b', 'span', 'br']):
        tag.decompose()


def clean_parenthesis(text: str) -> str:
    pattern = r'\s*\([^)]*\)'
    cleaned_text = re.sub(pattern, '', text)
    return cleaned_text      

        
def scrape_info_box(url: str) -> dict:
    movie_info = {}
    r = requests.get(url)
    soup = bs(r.content, 'html.parser')
    info_box = soup.find(class_='infobox vevent')

    movie_info['Title'] = info_box.find(class_='infobox-above').string
    movie_info['Poster'] = 'https:' + info_box.find(class_='infobox-image').find('img')['src']
    clean_tag(info_box)
    theads = info_box.find_all('th', {'class': 'infobox-label'})
    tdata = info_box.find_all('td', {'class': 'infobox-data'})

    for label, data in zip(theads, tdata):
        label_text = label.get_text(' ', strip=True).replace('\xa0', ' ')

        if label_text == 'Directed by':
           clean_tag(data)
           movie_info[label_text] = [re.sub(r'\s*\(.*?\)\s*', '', a['title']) for a in data.find_all("a")]
           movie_info['Directors url'] = [a['href'] for a in data.find_all("a")]
        elif label_text == 'Based on':
            data_text = data.get_text(' ', strip=True).replace('\xa0', ' ')
            movie_info[label_text] = clean_parenthesis(data_text)
        elif label_text == 'Distributed by':
            data_text = data.get_text(' ', strip=True).replace('\xa0', ' ')
            movie_info[label_text] = clean_parenthesis(data_text)
        elif label_text == 'Release dates':
            data_text = data.get_text('-', strip=True).replace('\xa0', ' ')
            movie_info[label_text] = data_text.split('-')[0]
        else:
            movie_info[label_text] = clean_parenthesis(data.get_text(', ', strip=True).replace('\xa0', ' '))
        
    return  movie_info

# https://en.wikipedia.org/wiki/The_Reluctant_Dragon_(1941_film)

In [20]:
print(scrape_info_box('https://en.wikipedia.org/wiki/The_Reluctant_Dragon_(1941_film)'))

{'Title': 'The Reluctant Dragon', 'Poster': 'https://upload.wikimedia.org/wikipedia/en/thumb/f/fe/Reluctant_Dragon.jpg/220px-Reluctant_Dragon.jpg', 'Directed by': ['Alfred Werker', 'Hamilton Luske', 'Jack Cutting', 'Ub Iwerks', 'Jack Kinney'], 'Directors url': ['/wiki/Alfred_Werker', '/wiki/Hamilton_Luske', '/wiki/Jack_Cutting_(animator)', '/wiki/Ub_Iwerks', '/wiki/Jack_Kinney'], 'Written by': 'Live-action:, Ted Sears, Al Perkins, Larry Clemmons, Bill Cottrell, Harry Clork, Robert Benchley, The Reluctant Dragon, segment:, Kenneth Grahame,, Erdman Penner, T. Hee, Baby Weems, segment:, Joe Grant, Dick Huemer, John Miller', 'Produced by': 'Walt Disney', 'Starring': 'Robert Benchley, Frances Gifford, Buddy Pepper, Nana Bryant', 'Cinematography': 'Bert Glennon', 'Edited by': 'Paul Weatherwax', 'Music by': 'Frank Churchill, Larry Morey', 'Production company': 'Walt Disney Productions', 'Distributed by': 'RKO Radio Pictures', 'Release date': 'June 27, 1941', 'Running time': '74 minutes', 'Cou

In [21]:
response = requests.get('https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films')
soup = bs(response.content)
eras = soup.find_all('h3')
# era_tables = soup.find_all('table', attrs={'class':'wikitable sortable'})
# movies = soup.select('.wikitable.sortable i')

base_path = 'https://en.wikipedia.org'
movie_info_list = []
for era in eras[:-2]:
    table = era.find_next_sibling('table', class_='wikitable sortable')
    for movie in table.select('i'):
        try:
            print(f"Scraping info of movie {movie.a['title']}")
            full_path = base_path +  movie.a['href']
            movie_info_list.append(scrape_info_box(full_path))
        except Exception as e:
            print(e)     

Scraping info of movie Snow White and the Seven Dwarfs (1937 film)
Scraping info of movie Pinocchio (1940 film)
Scraping info of movie Fantasia (1940 film)
Scraping info of movie The Reluctant Dragon (1941 film)
Scraping info of movie Dumbo
Scraping info of movie Bambi
Scraping info of movie Saludos Amigos
Scraping info of movie Victory Through Air Power (film)
Scraping info of movie The Three Caballeros
Scraping info of movie Make Mine Music
Scraping info of movie Song of the South
Scraping info of movie Fun and Fancy Free
Scraping info of movie Melody Time
Scraping info of movie So Dear to My Heart
Scraping info of movie The Adventures of Ichabod and Mr. Toad
Scraping info of movie Cinderella (1950 film)
Scraping info of movie Treasure Island (1950 film)
Scraping info of movie Alice in Wonderland (1951 film)
Scraping info of movie The Story of Robin Hood (film)
Scraping info of movie Peter Pan (1953 film)
Scraping info of movie The Sword and the Rose
Scraping info of movie The Living

In [39]:
movie_info_list[-1]

{'Title': 'Diary of a Wimpy Kid Christmas: Cabin Fever',
 'Poster': 'https://upload.wikimedia.org/wikipedia/en/thumb/c/c0/Diary_of_a_Wimpy_Kid_Christmas_Cabin_Fever_poster.jpg/220px-Diary_of_a_Wimpy_Kid_Christmas_Cabin_Fever_poster.jpg',
 'Directed by': [],
 'Directors url': [],
 'Screenplay by': 'Jeff Kinney',
 'Based on': 'Diary of a Wimpy Kid: Cabin Fever by Jeff Kinney',
 'Produced by': 'Jeff Kinney',
 'Starring': 'Wesley Kimmel, Spencer Howell, Erica Cerra, Hunter Dillon, Chris Diamantopoulos',
 'Music by': 'John Paesano',
 'Production companies': 'Walt Disney Pictures, Bardel Entertainment',
 'Distributed by': 'Disney+',
 'Release date': 'December 8, 2023',
 'Running time': '62 minutes',
 'Countries': 'United States, Canada',
 'Language': 'English'}

## Save and reload movie data

In [9]:
def save_json(title, data):
    with open(title, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
        
def load_json(title):
    with open(title, encoding='utf-8') as f:
        return json.loads(f.read())

In [31]:
save_json('movies.json', movie_info_list)

In [46]:
movie_info_list = load_json('movies.json')

### Get directors's info

In [32]:
def get_director_info(director:str, rel_url:str):
    director_info = {}
    base_url = "https://en.wikipedia.org"
    full_url = base_url + rel_url
    r = requests.get(full_url)
    soup = bs(r.content, 'html.parser')
    info_box = soup.find('table', {'class': 'biography'})

    print(director)
    director_info['Director Name'] = director   
    if info_box:
        theads = info_box.find_all('th', {'class': 'infobox-label'})
        tdata = info_box.find_all('td', {'class': 'infobox-data'})
        for label, data in zip(theads, tdata):
            label_text = label.get_text(' ', strip=True).replace('\xa0', ' ')
            if label_text == 'Born':
                director_info[label_text] = born.string if (born:= data.find('span', {'class': 'bday'})) else 'Unknown'
            elif label_text == "Occupations":
                director_info[label_text] = data.get_text(', ', strip=True).replace('\xa0', ' ')
            elif label_text == "Years active":
                director_info[label_text] = data.get_text(', ', strip=True).replace('\xa0', ' ')
            elif label_text == "Spouse":
                director_info[label_text] = data.get_text(', ', strip=True).replace('\xa0', ' ').split(', ')[0]
            elif label_text == "Awards":
                for tag in data.find_all(['i']):
                    tag.decompose()
                director_info[label_text] = data.get_text(', ', strip=True).replace('\xa0', ' ')
        
    else:
        director_info.update({key: 'Unknown' for key in ['Born', 'Died', 'Occupations', "Years active", "Spouse", "Awards"]})

    return director_info

director_info_list = []

director_url_set = {(director, url) for movie in movie_info_list for director, url in zip(movie.get('Directed by', []), movie.get('Directors url', []))}
# 316 unique        
print(len(director_url_set))

for director, url in director_url_set:
    director_info_list.append(get_director_info(director=director, rel_url=url))




316
George Butler
Reginald Hudlin
Brenda Chapman
Craig Gillespie
Brian Fee
Donovan Cook
Swinton O. Scott III
Hamilton Luske
Andy Knight
Fletcher Markle
Lasse Hallström
Andrew Adamson
Kenneth Branagh
Gore Verbinski
John Pasquin
Chris Buck
Rob Minkoff
Gary Trousdale
Josh Cooley
Hayao Miyazaki
Peter Sohn
Robert Butler
Paul and Gaëtan Brizzi
Barry Cook
Sara Sugarman
Nitesh Tiwari
Pete Docter
Brian Henson
Hiromasa Yonebayashi
Rick Calabash
Bob Spiers
Charlie Bean
Mark Waters
Ford Beebe
Nick Marck
Bruce Bilson
Don Chaffey
Tim Hunter
Nancy Meyers
Tim Story
Geoff Burrowes
Robert Altman
Timothy Björklund
Victor Cook
Andrew Davis
Mira Nair
Matthew O'Callaghan
Bill Paxton
Joe Camp
Larry Morey
Ron Underwood
George Scribner
Keith Scholey
Jim Fall
Rob Marshall
James Neilson
Ericson Core
Steve Miner
Julia Hart
Lewis R. Foster
Norm Ferguson
Domee Shi
Michael O'Herlihy
Alfred Werker
Steven Hilliard Stern
Jugal Hansraj
Ralph Zondag
Stanley Tong
Bruce Hendricks
Russ Mayberry
Fawn Veerasunthorn
Gary Nelso

In [33]:
save_json('directors.json', director_info_list)

## cleaning data  

In [None]:
# budget money pattern '$1.5 million' '$2.28 million' '$600,000', None, under $1 million $1,250,000 $3.6–4 million AU$1 million
# $3.5 to 4 million $6-8 million $40–120 million ¥, 2.4 billion, US$24 million $30—$35 million' '' 
# ~$8 million, ₽, 370 million
# ~$70 million $130.4, million $35 thousand

In [None]:
# box office money with new pattern $76.4–$83.3 million >$1.3 million $3.275 million
# $21,745,500 $1.066 billion $146, million 

In [49]:
def convert_running_time_to_int(x):
    if x == 'N/A':
        return 'N/A'
    else:
        minutes = x.split(' ')[0]
        return int(minutes)

In [50]:
for movie in movie_info_list:
    runtime = movie.get('Running time', "N/A")
    movie["Running time (int)"] = convert_running_time_to_int(runtime)
    if runtime != "N/A":
        del movie["Running time"]

In [51]:
import re

amounts = r"thousand|million|billion"
number = r"\d+(,\d{3})*(\.\d+)?"
money_string = fr"\${number}(–|\sto\s|–\$)?({number})?(\s|,\s)({amounts})"

def word_to_value(word):
	value_dict = {"thousand": 1000, "million": 1000_000, "billion": 1000_000_000}
	return value_dict.get(word.lower(), 1)

def parse_word_syntax(string):
	stripped_string = string.replace(",", "")
	value = float(re.search(number, stripped_string).group())
	modifier = word_to_value(re.search(amounts, string, flags=re.I).group())
	return value*modifier

def parse_value_syntax(string):
	stripped_string = string.replace(",", "")
	return float(re.search(number, stripped_string).group())

def money_converter(money):
	string_syntax = re.search(money_string, money, flags=re.I)
	value_syntax = re.search(fr"\${number}", money)

	if string_syntax:
		return parse_word_syntax(string_syntax.group())
	elif value_syntax:
		return parse_value_syntax(value_syntax.group())
	else:
		return ""

In [52]:
for movie in movie_info_list:
    budget = movie.get('Budget', 'N/A')
    box_office = movie.get('Box office', 'N/A')
    movie["Budget (float)"] = money_converter(budget)
    movie["Box office (float)"] = money_converter(box_office)

    if budget != "N/A":
        del movie["Budget"]
    if box_office != "N/A":
        del movie['Box office']

In [53]:
movie_info_list

[{'Title': 'Snow White and the Seven Dwarfs',
  'Poster': 'https://upload.wikimedia.org/wikipedia/en/thumb/4/49/Snow_White_1937_poster.png/220px-Snow_White_1937_poster.png',
  'Directed by': ['David Hand',
   'Perce Pearce',
   'Larry Morey',
   'Wilfred Jackson',
   'Ben Sharpsteen'],
  'Directors url': ['/wiki/David_Hand_(animator)',
   '/wiki/Perce_Pearce',
   '/wiki/Larry_Morey',
   '/wiki/Wilfred_Jackson',
   '/wiki/Ben_Sharpsteen'],
  'Story by': 'Ted Sears, Richard Creedon, Otto Englander, Dick Rickard, Earl Hurd, Merrill De Maris, Dorothy Ann Blank, Webb Smith',
  'Based on': '" Snow White " by the Brothers Grimm',
  'Produced by': 'Walt Disney',
  'Starring': 'Adriana Caselotti, Roy Atwell, Pinto Colvig, Otis Harlan, Scotty Mattraw, Billy Gilbert, Eddie Collins',
  'Music by': 'Frank Churchill, Leigh Harline, Paul Smith',
  'Production company': 'Walt Disney Productions',
  'Distributed by': 'RKO Radio Pictures',
  'Release dates': 'December 21, 1937',
  'Country': 'United Sta

#### removes comma from date string

In [47]:
for movie in movie_info_list:
    date_str = movie.get('Release date', "")
    if date_str.endswith(","):
        movie['Release date'] = date_str[:-1]



In [45]:
print([movie.get('Release date') for movie in movie_info_list if movie.get('Release date', "").startswith('2')])


['2009', '22 April 2011', '25 May 2012', '23 December 2021']


In [236]:
# new_dataset = []
# for movie in dataset:
#     movie = {k.replace("Release dates", "Release date"): v for k, v in movie.items()}
#     movie = {k.replace("Original release", "Release date"): v for k, v in movie.items()}
#     new_dataset.append(movie)

In [48]:
from datetime import datetime
def reformat_date(date_str):
    # Convert string to date
    date_object = datetime.strptime(date_str, '%d %B %Y')

    # Convert date to the desired format
    formatted_date = date_object.strftime('%B %d, %Y')
    return formatted_date

for movie in movie_info_list:
    date_str = movie.get('Release date', "")
    if date_str.startswith('1') or date_str.startswith('2'):
        movie['Release date']  = reformat_date(date_str=date_str)




In [54]:
save_json('cleaned_movies.json', movie_info_list)

In [252]:
# import datetime
# pattern1 = r'(\w+)\s(\d{1,2}),?\s(\d{4})'
# pattern2 = r'(\d{1,2})\s(\w+),?\s(\d{4})'
# pattern_code = {pattern1:[1, 2, 3], pattern2:[2, 1, 3]}

# def convert_to_datetime(date_text, code):
#     cleaned_date_text = f'{date_text.group(code[0])} {int(date_text.group(code[1])):02} {date_text.group(code[2])}'
#     return datetime.datetime.strptime(cleaned_date_text, '%B %d %Y').date()

# def get_date_text(x):
#     if x == 'N/A' or x==[]:
#         return 'N/A'
#     if isinstance(x, list):
# #         date_text = None
#         for pattern in pattern_code.keys():
#             date_text = re.search(pattern, x[0], flags=re.I)
#             if date_text:
#                 return convert_to_datetime(date_text, pattern_code[pattern])
                
#     if isinstance(x,str):
#         for pattern in pattern_code.keys():
#             date_text = re.search(pattern, x, flags=re.I)
#             if date_text:
#                 return convert_to_datetime(date_text, pattern_code[pattern])
        
    

In [254]:
# for movie in new_dataset:
#     movie['Release date (datetime)'] = get_date_text(movie.get('Release date', 'N/A'))