### Scraping data from wikipedia

In [14]:
# Importing necessary libraries
from bs4 import BeautifulSoup as bs
import requests

BASE_URL = "https://en.wikipedia.org"

In [3]:
import pickle
# Save with pickle
# Save data in a file
def save_data_pickle(title, data):
    with open(title, "wb") as fp:
        pickle.dump(data, fp)

# Load data from file
def load_data_pickle(title):
    with open(title, "rb") as fp:
        return pickle.load(fp)

In [4]:
def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0"," ") for li in row_data.find_all("li")]
    elif row_data.find("br"):
        return [text for text in row_data.stripped_strings]
    else:
        return row_data.get_text(" ",strip = True).replace("\xa0"," ")

# get all information from the info box in wikipedia
def get_info_box(url):
    r = requests.get(url)
    soup = bs(r.content)
    info_box = soup.find(class_="infobox vevent")
    info_rows = info_box.find_all("tr")
    
    clean_tags(soup)
    
    movie_info = {}
    for index, row in enumerate(info_rows):
        if index == 0:
            movie_info["title"] = row.find("th").get_text(" ", strip=True)
        else:
            if row.find("th"):
                content_key = row.find("th").get_text(" ", strip=True)
                content_value =  get_content_value(row.find("td"))
                movie_info[content_key] = content_value
    return movie_info

# clean references and extra dates
def clean_tags(soup):
    for tag in soup.find_all(["sup", "span"]):
        tag.decompose()

In [5]:
url_list = []
def get_urls():
  r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")
  soup = bs(r.content)
  movies = soup.select(".wikitable.sortable i a")
  for index,movie in enumerate(movies):
      try:
        relative_path = movie['href']
        url_list.append(relative_path)
      except Exception as e:
        print(e)
get_urls()
url_chunks = [url_list[x:x+100] for x in range(0, len(url_list), 100)]

In [6]:
import time
import threading
movies_data = []

def get_movies_info(uri_list):
  for urls in uri_list:
    try:
      movies_data.append(get_info_box(BASE_URL+urls))
    except Exception as e:
      print(f"Error in {urls}")

t1 = threading.Thread(target=get_movies_info, args=([url_chunks[0]]))
t2 = threading.Thread(target=get_movies_info, args=([url_chunks[1]]))
t3 = threading.Thread(target=get_movies_info, args=([url_chunks[2]]))
t4 = threading.Thread(target=get_movies_info, args=([url_chunks[3]]))
t5 = threading.Thread(target=get_movies_info, args=([url_chunks[4]]))

t1.start()
t2.start()
t3.start()
t4.start()
t5.start()

t1.join()
t2.join()
t3.join()
t4.join()
t5.join()

Error in /wiki/True-Life_Adventures
Error in /wiki/The_Omega_Connection
Error in /wiki/Zorro_(1957_TV_series)#Theatrical
Error in /wiki/Zorro_(1957_TV_series)#Theatrical
Error in /wiki/Tim_Federle#Fiction


#### Reformat Data

In [7]:
# Get numerical value from 'Running time' String
def minute_to_integer(running_time):
  if running_time == "N/A":
    return None
  if isinstance(running_time, list):
    return running_time[0].split(" ")[0]
  else:
    return int(running_time.split(" ")[0])

for movie in movies_data:
  movie["Running time (int)"] = minute_to_integer(movie.get("Running time", "N/A"))

In [8]:
# Convert budget box and box office to floats value eg: $79.9 million to 79900000
# with regex
import re

amounts = r"thousand|million|billion"
number = r"\d+(,\d{3})*\.*\d*"
standard = fr"\${number}(-|\sto\s)?({number})?\s({amounts})"

def word_to_value(word):
	value_dict = {"thousand": 1000, "million": 1000000, "billion": 1000000000}
	return value_dict.get(word.lower(), 1)

def parse_word_syntax(string):
	stripped_string = string.replace(",", "")
	value = float(re.search(number, stripped_string).group())
	modifier = word_to_value(re.search(amounts, string, flags=re.I).group())
	return value*modifier

def parse_value_syntax(string):
	stripped_string = string.replace(",", "")
	return float(re.search(number, stripped_string).group())

def money_conversion(money):
  if money == "N/A":
    return None

  if type(money) == list:
	  money = money[0]

  word_syntax = re.search(standard, money, flags=re.I)
  value_syntax = re.search(fr"\${number}", money)

  if word_syntax:
    return parse_word_syntax(word_syntax.group())
  elif value_syntax:
    return parse_value_syntax(value_syntax.group())
  else:
    return None

for movie in movies_data:
  movie["Budget (float)"] = money_conversion(movie.get("Budget", "N/A"))
  movie["Box Office (float)"] = money_conversion(movie.get("Box office", "N/A"))

In [9]:
# Convert Dates into datetimes
from datetime import datetime

dates = [movie.get("Release date", "N/A") for movie in movies_data]

def clean_date(date):
  return date.split("(")[0].strip()

def date_conversion(date):
  if type(date) == list:
    date = date[0]
  if date == "N/A":
    return None

  date_str = clean_date(date)
  fmts = ["%B %d, %Y", "d %B %Y"]
  for fmt in fmts:
    try:
      return datetime.strptime(date_str,fmt)
    except:
      pass
  return None
  
# Add new column for reformated date time
for movie in movies_data:
  movie["Release date (datetime)"] = date_conversion(movie.get("Release date", "N/A"))

In [21]:
# Add IMDB scores to data
import urllib
import os
from dotenv import load_dotenv
load_dotenv()

def get_omdb_info(title):
  URL = "http://www.omdbapi.com/?"
  parameters = {"t": title,"apikey": os.environ["api_key"],}
  params_encoded = urllib.parse.urlencode(parameters)
  full_url = URL + params_encoded
  return requests.get(full_url).json()

def get_rotten_tomato_score(omdb_info):
  ratings = omdb_info.get("Ratings", [])
  for r in ratings:
    if r["Source"] == "Rotten Tomatoes":
      return r["Value"]
  return None

for movie in movies_data:
    title = movie["title"]
    omdb_info = get_omdb_info(title)
    movie["imdb"] = omdb_info.get("imdbRating", None)
    movie["metascore"] = omdb_info.get("Metascore", None)
    movie["rotten_tomatoes"] = get_rotten_tomato_score(omdb_info)

In [22]:
# stringify datetime to store in json
finalized_movie = [movie.copy() for movie in movies_data]
for movie in finalized_movie:
  current_date = movie["Release date (datetime)"]
  if current_date:
    movie["Release date (datetime)"] = current_date.strftime("%B %d, %Y")
  else:
    movie["Release date (datetime)"] = None

In [23]:
import json

# Save data in a file
def save_data(title, data):
    with open(title, "w", encoding="utf-8") as fp:
        json.dump(data, fp, ensure_ascii=False, indent=2)

# Load data from file
def load_data(title):
    with open(title, "r",encoding="utf-8") as fp:
        return json.load(fp)

In [24]:
# Save in json
save_data("movies_datasets_final.json", finalized_movie)

In [25]:
import pandas as pd

df = pd.DataFrame(movies_data)
df.to_csv("movie_dataset.csv")

### Movie Dataset

In [26]:
# Load reformed Data
movie_info_datasets = load_data("movies_datasets_final.json")

In [30]:
movie_info_datasets[1]

{'title': 'Academy Award Review of',
 'Production company': 'Walt Disney Productions',
 'Release date': ['May 19, 1937'],
 'Running time': '41 minutes (74 minutes 1966 release)',
 'Country': 'United States',
 'Language': 'English',
 'Box office': '$45.472',
 'Running time (int)': 41,
 'Budget (float)': None,
 'Box Office (float)': 45.472,
 'Release date (datetime)': 'May 19, 1937',
 'imdb': '7.1',
 'metascore': 'N/A',
 'rotten_tomatoes': None}

In [29]:
df = pd.read_csv("movie_dataset.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,title,Directed by,Produced by,Written by,Narrated by,Music by,Cinematography,Edited by,Production companies,...,Starring,Adaptation by,Animation by,Traditional,Simplified,Countries,Languages,Japanese,Hepburn,Color process
0,0,March of the Penguins,Luc Jacquet,"['Yves Darondeau', 'Christophe Lioud', 'Emmanu...","['Luc Jacquet', 'Michel Fessler', 'Jordan Robe...","['Amitabh Bachchan (Hindi)', 'Charles Berling ...","['Émilie Simon (France)', 'Alex Wurman (US)']","['Laurent Chalet', 'Jérôme Maison']",Sabine Emiliani,"['Wild Bunch', 'National Geographic Films', 'B...",...,,,,,,,,,,
1,1,Academy Award Review of,,,,,,,,,...,,,,,,,,,,
2,2,Rudyard Kipling's The Jungle Book,Stephen Sommers,"['Edward S. Feldman', 'Raju Patel']",,,Basil Poledouris,Juan Ruiz Anchía,Bob Ducsay,"['Baloo Productions', 'Jungle Book Films', 'Wa...",...,"['Jason Scott Lee', 'Cary Elwes', 'Lena Headey...",,,,,,,,,
3,3,The Aristocats,Wolfgang Reitherman,"['Winston Hibler', 'Wolfgang Reitherman']",,,George Bruns,,Tom Acosta,,...,"['Phil Harris', 'Eva Gabor', 'Sterling Hollowa...",,,,,,,,,
4,4,Pirates of the Caribbean: Dead Man's Chest,Gore Verbinski,Jerry Bruckheimer,"['Ted Elliott', 'Terry Rossio']",,Hans Zimmer,Dariusz Wolski,"['Craig Wood', 'Stephen Rivkin']","['Walt Disney Pictures', 'Jerry Bruckheimer Fi...",...,"['Johnny Depp', 'Orlando Bloom', 'Keira Knight...",,,,,,,,,
