### Import Necessary libraries

In [None]:
from bs4 import BeautifulSoup as bs
import requests

### Load the webpage

In [None]:
# Download the Toy Story 3 article and parse the raw response bytes into a
# BeautifulSoup tree. No parser name is passed to bs(), so BeautifulSoup
# picks one itself.
r= requests.get('https://en.wikipedia.org/wiki/Toy_Story_3')
soup = bs(r.content)


## Task 1: Get Movie info for 1 Movie

In [None]:
# Grab the element carrying the "infobox vevent" classes and collect all of
# its table rows.
info_box_content = soup.find(class_='infobox vevent')
info_row = info_box_content.find_all("tr")

# Print each row's position next to its pretty-printed markup so the layout
# of the infobox can be inspected.
for index, row in enumerate(info_row):
    row_markup = row.prettify()
    print(index, row_markup)

In [None]:
def get_content_info(row_data):
    """Extract the value held in one infobox cell.

    Args:
        row_data: The tag to read (callers pass ``row.find('td')``), or
            ``None`` when the row has no such cell.

    Returns:
        ``None`` when *row_data* is ``None``; a list with one string per
        ``li`` when the cell contains list items; a list of the cell's
        stripped text fragments when it contains a ``br``; otherwise the
        cell's text as a single string.
    """
    # find('td') yields None for rows without a data cell; report "no value"
    # instead of failing with AttributeError on None.
    if row_data is None:
        return None
    if row_data.find('li'):
        # NOTE: non-breaking spaces are deleted, not replaced by a space, so
        # "$200\xa0million" comes out as "$200million".
        return [
            li.get_text(' ', strip=True).replace('\xa0', '')
            for li in row_data.find_all('li')
        ]
    if row_data.find('br'):
        # Fragments are returned untouched (non-breaking spaces included).
        return list(row_data.stripped_strings)
    return row_data.get_text(' ', strip=True).replace('\xa0', '')

In [None]:
movie_info = {}

for position, table_row in enumerate(info_row):
    if position == 1:
        # The second row is deliberately left out of the dictionary.
        continue
    heading = table_row.find('th').get_text(' ', strip=True)
    if position == 0:
        # The first row's header text is stored as the film title.
        movie_info['title'] = heading
    else:
        # Every later row maps its header text to the parsed cell content.
        movie_info[heading] = get_content_info(table_row.find('td'))
movie_info



## Task 2: Get info content for all the movies

In [None]:
def clean_tag(soup):
    """Remove every ``sup`` and ``span`` tag from *soup* in place.

    Args:
        soup: Parsed document (or tag) offering ``find_all``.

    Returns:
        None. The tree is modified directly.
    """
    unwanted_tags = soup.find_all(['sup', 'span'])
    for node in unwanted_tags:
        # decompose() removes the tag, and everything inside it, from the tree.
        node.decompose()

In [None]:
def get_movie_info(url):
    """Scrape the infobox of one film article into a dictionary.

    Args:
        url: Address of the article to download.

    Returns:
        A dict holding the first row's header text under ``'title'`` plus one
        entry per later row that has both a header (``th``) and a data
        (``td``) cell, keyed by the header text.

    Raises:
        ValueError: If the page has no element with class ``infobox vevent``.
    """
    r = requests.get(url)
    soup = bs(r.content)
    info_box_content = soup.find(class_='infobox vevent')
    if info_box_content is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError(f"no 'infobox vevent' element found at {url}")
    info_row = info_box_content.find_all("tr")
    # Drop <sup>/<span> tags from the tree before any text is read; the rows
    # collected above belong to the same tree, so they are cleaned as well.
    clean_tag(soup)

    movie_info = {}
    for index, row in enumerate(info_row):
        header = row.find('th')
        if index == 0:
            movie_info['title'] = header.get_text(' ', strip=True)
            continue
        cell = row.find('td')
        # Rows missing either cell carry no key/value pair; skip them rather
        # than letting one odd row fail the whole film.
        if header is None or cell is None:
            continue
        movie_info[header.get_text(' ', strip=True)] = get_content_info(cell)
    return movie_info



In [None]:
# Run the scraper on a single article; as the cell's last expression the
# returned dict is what the notebook displays.
get_movie_info('https://en.wikipedia.org/wiki/So_Dear_to_My_Heart')

In [None]:
from urllib.parse import urljoin

r = requests.get('https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films')
soup = bs(r.content)
# Anchors nested in <i> inside tables that carry both the "wikitable" and
# "sortable" classes.
movie_data = soup.select('.wikitable.sortable i a')
print(len(movie_data))
movie_list = []
base_path = 'https://en.wikipedia.org/'

for movies in movie_data:
    try:
        url = movies['href']
        # Kept from the original: an anchor without a title attribute raises
        # KeyError here and is reported below instead of being scraped.
        title = movies['title']
        # urljoin resolves the href against the site root, so an href that
        # starts with "/" no longer yields "https://en.wikipedia.org//wiki/...".
        full_path = urljoin(base_path, url)
        movie_list.append(get_movie_info(full_path))
    except Exception as e:
        # Best-effort crawl: name the entry that failed and keep going.
        print(movies.get_text())
        print(e)
    

In [None]:
# Number of films whose infobox was scraped without an exception.
len(movie_list)

## Task 3: Save/Reload Movie Data

In [None]:
import json

def save_data(title, data):
    """Write *data* to the file *title* as indented UTF-8 JSON.

    Args:
        title: Path of the file to create or overwrite.
        data: JSON-serialisable object to store.
    """
    # ensure_ascii=False keeps non-ASCII characters as-is instead of \u escapes.
    dump_options = {'indent': 2, 'ensure_ascii': False}
    with open(title, mode='w', encoding='utf-8') as out_file:
        json.dump(data, out_file, **dump_options)

In [None]:
import json

def load_data(title):
    """Read the UTF-8 JSON file *title* and return the decoded object.

    Args:
        title: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's content.
    """
    with open(title, mode='r', encoding='utf-8') as in_file:
        parsed = json.load(in_file)
    return parsed

In [None]:
# Persist the scraped list as JSON so later cells can reload it without re-crawling.
save_data('movie_metadata.json', movie_list)

## Task 4: Clean our data

In [None]:
# Reload the saved scrape; from here on movie_info is the list of film dicts
# (it rebinds the name used for the single-film dict in Task 1).
movie_info = load_data('movie_metadata.json')

### Subtask
- Clean up references ([1], [2], etc.)
- Convert dates to datetime objects
- Split up the long strings
- Convert running time into an integer
- Convert budget and box-office figures to numbers

In [None]:
# Clean up the references([1] [2] etc)

In [None]:
# Split up the long strings

In [None]:
def clean_movie_Runtime(running_time):
    """Extract the leading integer from a scraped running-time value.

    Args:
        running_time: ``None``, the placeholder ``'N/A'``, an int, a string
            such as ``"85 minutes"``, or a list of such strings.

    Returns:
        *running_time* itself when it is already an int; otherwise the first
        run of digits found (for a list, in the first entry that contains
        one) as an int; ``None`` when there is no number or the type is not
        supported.
    """
    import re  # local import: the notebook only imports re in a later cell

    if running_time is None or running_time == 'N/A':
        return None
    if isinstance(running_time, int):
        return running_time
    if isinstance(running_time, str):
        candidates = [running_time]
    elif isinstance(running_time, list):
        candidates = running_time
    else:
        return None

    for entry in candidates:
        # Take only the first number: joining every digit would turn
        # "74 minutes ... 68 minutes" into 7468.
        found = re.search(r"\d+", str(entry))
        if found:
            return int(found.group())
    return None

# print(clean_movie_Runtime("85 minutes"))

# Attach a numeric running time to every film; a missing 'Running time'
# key is passed on as the "N/A" placeholder.
for movies in movie_info:
    raw_runtime = movies.get('Running time', "N/A")
    movies['Running time (int)'] = clean_movie_Runtime(raw_runtime)

In [None]:
# Show the film dict 50 positions from the end of the list.
movie_info[-50]

In [None]:
# Display every film's raw 'Budget' value ('N/A' where the key is absent).
[movie.get('Budget','N/A') for movie in movie_info]

In [None]:
import re

In [None]:
amounts = r"thousand|million|billion"
number = r"\d+(,\d{3})*\.*\d*"

# "$<number>", an optional range separator and upper bound, then a magnitude
# word. Whitespace between the parts is optional so that "$1 to 2 million",
# "$1.5 – 2 million" and "$200million" all match.
word_re = rf"\${number}\s*(-|–|to)?\s*({number})?\s*({amounts})"
# A bare dollar figure such as "$3,500,000".
value_re = rf"\${number}"

_MAGNITUDES = {"thousand": 1000, "million": 1000000, "billion": 1000000000}


def word_to_value(word):
    """Return the multiplier for a lower-case magnitude word.

    Raises:
        KeyError: If *word* is not "thousand", "million" or "billion".
    """
    return _MAGNITUDES[word]


def parse_word_syntax(string):
    """Convert text like "$12.5 million" to a float.

    The first number in *string* is multiplied by the first magnitude word
    found, so for a range such as "$2–3 million" the lower bound is used.
    """
    value_string = re.search(number, string).group()
    value = float(value_string.replace(",", ""))
    word = re.search(amounts, string, flags=re.I).group().lower()
    return value * word_to_value(word)


def parse_value_syntax(string):
    """Convert text like "$3,500,000" to a float (thousands separators removed)."""
    value_string = re.search(number, string).group()
    return float(value_string.replace(",", ""))


def money_conversion(money):
    """Turn a scraped money value into a float.

    Args:
        money: A string, a list of strings (only the first entry is used),
            the placeholder ``'N/A'``, or ``None``.

    Returns:
        The amount as a float, or ``None`` when the value is missing, is not
        text, or contains no dollar figure.
    """
    if isinstance(money, list):
        # An empty list has nothing to parse.
        money = money[0] if money else None
    if not isinstance(money, str) or money == 'N/A':
        return None

    # Prefer the "number + magnitude word" form; fall back to a plain figure.
    word_syntax = re.search(word_re, money, flags=re.I)
    if word_syntax:
        return parse_word_syntax(word_syntax.group())

    value_syntax = re.search(value_re, money)
    if value_syntax:
        return parse_value_syntax(value_syntax.group())
    return None
    
# # print(re.search(word_re,"$12.2 million").group())
# print(money_conversion('$790 Million'))

In [None]:
# Source key -> derived numeric key, processed in this order for every film.
money_fields = (
    ('Budget', 'Budget (float)'),
    ('Box office', 'Box office (float)'),
)
for movie in movie_info:
    for source_key, target_key in money_fields:
        # A missing source key is passed on as the 'N/A' placeholder.
        movie[target_key] = money_conversion(movie.get(source_key, 'N/A'))

In [None]:
# Show the film dict 10 positions from the end to check the new float fields.
movie_info[-10]

In [None]:
# Display every film's raw 'Release date' value ('N/A' where the key is absent).
[movie.get('Release date','N/A') for movie in movie_info]

In [None]:
from datetime import datetime

In [None]:
def clean_date(date):
    """Reduce a scraped release-date string to a compact form for strptime.

    Everything from the first "(" onwards is dropped and all whitespace
    (non-breaking spaces included) is removed, so
    "June 18, 2010 (United States)" becomes "June18,2010", which the
    space-free patterns tried by date_conversion ("%B%d,%Y", "%d%B%Y",
    "%B%Y") can parse.

    NOTE(review): the original cell was an unfinished stub; this behaviour is
    inferred from the call in date_conversion and the formats used there.
    Confirm it matches the intended cleaning.

    Args:
        date: Raw date text.

    Returns:
        The cleaned string (possibly empty).
    """
    before_parenthesis = date.split("(")[0]
    # str.split() with no argument splits on any Unicode whitespace.
    return "".join(before_parenthesis.split())

In [None]:
def date_conversion(date):
    """Parse a scraped release date into a ``datetime``.

    Args:
        date: A date string, a list of date strings (only the first entry is
            used), the placeholder ``'N/A'``, or ``None``.

    Returns:
        The parsed ``datetime``, or ``None`` when the value is missing or
        matches none of the supported formats.
    """
    if isinstance(date, list):
        # An empty list has no date to parse.
        if not date:
            return None
        date = date[0]
    if date is None or date == 'N/A':
        return None
    date_str = clean_date(date)
    fmts = ["%B%d,%Y", "%d %B %Y", "%B%Y", "%d%B%Y"]
    for fmt in fmts:
        try:
            return datetime.strptime(date_str, fmt)
        except (ValueError, TypeError):
            # ValueError: the text does not fit this format; TypeError: the
            # cleaned value is not a string. Either way, try the next format.
            continue
    return None

# Prefer the 'Release date' field and fall back to 'Release dates'; films
# with neither key get None.
for movie in movie_info:
    raw_release = movie.get('Release date', 'N/A')
    if raw_release == 'N/A':
        raw_release = movie.get('Release dates', 'N/A')

    if raw_release == 'N/A':
        movie['Release date (datetime)'] = None
    else:
        movie['Release date (datetime)'] = date_conversion(raw_release)

In [None]:
# Show the first ten film dicts to check the parsed release dates.
movie_info[0:10]

In [None]:
import pickle

In [None]:
def save_json_data(name, data):
    """Serialise *data* with pickle into the file *name*.

    Despite the function's name, the output is a binary pickle, not JSON.

    Args:
        name: Path of the file to create or overwrite.
        data: Any picklable object.
    """
    with open(name, mode='wb') as sink:
        pickle.dump(data, sink)

In [None]:
def load_json_data(name):
    """Load and return the object pickled in the file *name*.

    Despite the function's name, the file is read as a pickle, not JSON.
    Unpickling can execute arbitrary code, so only open files this notebook
    wrote itself.

    Args:
        name: Path of the pickle file to read.

    Returns:
        The unpickled object.
    """
    with open(name, mode='rb') as source:
        restored = pickle.load(source)
    return restored

In [None]:
# Pickle the cleaned list (pickle keeps the datetime objects intact).
save_json_data('modified_cleaned_data.pickle', movie_info)

In [None]:
# Read the pickle straight back for a round-trip check.
a= load_json_data('modified_cleaned_data.pickle')

In [None]:
# True when the reloaded data equals the in-memory list.
a == movie_info

## Attach IMDb/Rotten Tomatoes/Metascore Ratings

In [None]:
# Start this section from the pickled, cleaned data.
movie_info = load_json_data('modified_cleaned_data.pickle')

In [None]:
import urllib
def get_omdb_info(title):
    """Look up a film by title in the OMDb API and return the decoded reply.

    Args:
        title: Film title, sent as the ``t`` query parameter.

    Returns:
        The JSON body of the response as decoded by ``Response.json()``.
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not by
    # itself guarantee that ``urllib.parse`` is loaded.
    import os
    from urllib.parse import urlencode

    base_url = "http://www.omdbapi.com/?"
    # SECURITY: an API key is hard-coded in this notebook. Set the
    # OMDB_API_KEY environment variable to override it; the literal remains
    # only as a fallback so existing runs keep working.
    api_key = os.environ.get("OMDB_API_KEY", "ff86d6db")
    parameter = {"apikey": api_key, "t": title}
    full_url = base_url + urlencode(parameter)
    return requests.get(full_url).json()

# Sample lookup; `info` is reassigned by the second lookup later in this cell.
info = get_omdb_info("into the woods")


def get_rotten_tomato_rating(omdb_info):
    """Return the Rotten Tomatoes score from an OMDb response dict.

    Args:
        omdb_info: Dict that may hold a ``'Ratings'`` list whose entries are
            dicts with ``'Source'`` and ``'Value'`` keys.

    Returns:
        The ``'Value'`` of the first entry whose ``'Source'`` is
        ``'Rotten Tomatoes'``, or ``None`` when there is no such entry.
    """
    matching_values = (
        entry['Value']
        for entry in omdb_info.get('Ratings', [])
        if entry['Source'] == 'Rotten Tomatoes'
    )
    return next(matching_values, None)

# Fetch the OMDb record for another title and print the raw response for inspection.
info = get_omdb_info("The Moon-Spinners")
print(info)
# get_rotten_tomato_rating(info)

In [None]:
# One OMDb request per film; keys missing from the response are stored as None.
for movie in movie_info:
    omdb_info = get_omdb_info(movie['title'])
    movie['imdbRating'] = omdb_info.get('imdbRating')
    movie['metascore'] = omdb_info.get('Metascore')
    movie['rotten_tomato'] = get_rotten_tomato_rating(omdb_info)

In [None]:
# Show the third film dict to check the attached ratings.
movie_info[2]

In [None]:
# Shallow-copy every film dict so that replacing values in the copies leaves
# the dicts in movie_info untouched.
movie_info_copy = [movie.copy() for movie in movie_info]

In [None]:
# Replace each datetime in the copies with text such as "June 18, 2010";
# falsy values (no parsed date) are stored as None.
for movie in movie_info_copy:
    parsed_date = movie["Release date (datetime)"]
    movie["Release date (datetime)"] = (
        parsed_date.strftime("%B %d, %Y") if parsed_date else None
    )

In [None]:
# NOTE(review): save_json_data pickles its input, so this file is a binary
# pickle despite the "_json" in its name. Confirm whether save_data (real
# JSON) was intended, given that the dates were just converted to strings.
save_json_data("final_movie_info_json", movie_info_copy)

## Convert to csv

In [None]:
import pandas as pd

# Build a DataFrame from the list of film dicts (movie_info, not the
# string-dated copy) and write it to a CSV file.
df = pd.DataFrame(movie_info)

df.to_csv("final_movie_metadata.csv")