In [1]:
import requests, re, html, csv
from bs4 import BeautifulSoup


In [2]:
# Function to search for a movie by title
def search_movie_by_title(title):
    """Search Box Office Mojo for ``title`` and return the first result's link.

    Args:
        title: Movie title to search for. It is sent as the ``q`` query
            parameter and percent-encoded by ``requests``, so titles that
            contain ``&``, ``#``, ``?``, ``+`` or ``%`` are searched verbatim
            instead of corrupting the query string.

    Returns:
        The ``href`` of the first ``<a>`` whose class attribute is
        ``a-size-medium a-link-normal a-text-bold`` (a site-relative path such
        as ``/title/tt1637688/?ref_=bo_se_r_1``), or ``None`` when the request
        fails, the status code is not 200, or no such link is present.
    """
    try:
        # Passing the title through ``params`` lets requests do the encoding
        # (spaces still become ``+``). The timeout keeps one stalled
        # connection from hanging a long batch run forever.
        response = requests.get(
            'https://www.boxofficemojo.com/search/',
            params={'q': title},
            timeout=30,
        )
    except requests.RequestException as exc:
        # Network-level failures are reported like HTTP-level ones: log and
        # give up on this title rather than aborting the caller.
        print(f'Failed to retrieve search results. Error: {exc}')
        return None

    if response.status_code != 200:
        print(f'Failed to retrieve search results. Status Code: {response.status_code}')
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    # Find the first search result link
    result_link = soup.find('a', {'class': 'a-size-medium a-link-normal a-text-bold'})
    print("results: ", result_link)
    if result_link is None:
        return None
    return result_link['href']

def extract_values_from_html_array(html_array):
    """Pull integer dollar amounts out of a sequence of HTML fragments.

    Each item is converted with ``str()`` and searched for the first
    ``<span class="money">$N</span>`` occurrence, where ``N`` consists of
    digits and commas. Items without such a span are skipped.

    Args:
        html_array: Iterable of objects whose ``str()`` form is HTML markup.

    Returns:
        A list of ints, one per matching item, in input order, with the
        thousands separators removed.
    """
    money_span = re.compile(r'<span class="money">\$([0-9,]+)</span>')
    hits = (money_span.search(str(fragment)) for fragment in html_array)
    return [int(hit.group(1).replace(',', '')) for hit in hits if hit]


# Function to scrape box office revenue and budget from a movie's page
def scrape_movie_data(movie_url):
    """Fetch a Box Office Mojo title page and collect its money ``<span>`` tags.

    Args:
        movie_url: Site-relative path of the title page; it is appended to
            ``https://www.boxofficemojo.com``.

    Returns:
        A ``(revenue, budget)`` pair. ``revenue`` holds the
        ``<span class="money">`` elements found inside the
        ``mojo-performance-summary-table`` container and ``budget`` those
        inside the ``mojo-summary-values`` container. When the budget
        container is absent, ``budget`` is an empty list so revenue that was
        found is still returned. ``(None, None)`` is returned when the request
        fails, the status code is not 200, or the revenue container is
        missing.
    """
    page_url = 'https://www.boxofficemojo.com' + movie_url
    print("movie url::", page_url)

    try:
        # The timeout keeps one stalled connection from hanging a batch run.
        response = requests.get(page_url, timeout=30)
    except requests.RequestException as exc:
        # Treat network-level failures like a bad status code: log and give
        # up on this page rather than aborting the caller.
        print(f'Failed to retrieve data. Error: {exc}')
        return None, None

    if response.status_code != 200:
        print(f'Failed to retrieve data. Status Code: {response.status_code}')
        return None, None

    soup = BeautifulSoup(response.text, 'html.parser')
    revenue_element = soup.find('div', {'class': 'a-section a-spacing-none mojo-performance-summary-table'})
    budget_element = soup.find('div', {'class': 'a-section a-spacing-none mojo-summary-values mojo-hidden-from-mobile'})

    if revenue_element is None:
        # Without the revenue container there is nothing useful to report.
        return None, None

    money_spans = {'class': 'money'}
    revenue = revenue_element.find_all('span', money_spans)
    if budget_element is None:
        # A page without the summary-values container simply has no budget
        # figures; do not throw away the revenue because of it.
        budget = []
    else:
        budget = budget_element.find_all('span', money_spans)
    return revenue, budget


def revenue_budget(title):
    """Look up a movie and return its box-office figures and budget.

    Searches for ``title`` with ``search_movie_by_title``, scrapes the result
    page with ``scrape_movie_data`` and converts the scraped elements to
    integers with ``extract_values_from_html_array``.

    Args:
        title: Movie title to look up.

    Returns:
        A tuple ``(domestic, international, worldwide, budget)`` of ints.
        ``(0, 0, 0, 0)`` is returned when the movie is not found, no revenue
        elements were scraped, or the number of revenue figures is neither
        two nor three.
    """
    not_found = (0, 0, 0, 0)

    print("Title: ", title)
    movie_url = search_movie_by_title(title)
    print(movie_url)
    if not movie_url:
        print('Movie not found in Box Office Mojo.')
        return not_found

    revenue, budget = scrape_movie_data(movie_url)
    if not revenue:
        print('Data not found.')
        return not_found

    data = extract_values_from_html_array(revenue)
    if len(data) == 2:
        # Two figures are read as (domestic, worldwide) with no international
        # gross -- presumably a domestic-only release; TODO confirm against
        # pages that list only international and worldwide.
        domestic, international, worldwide = data[0], 0, data[1]
    elif len(data) == 3:
        domestic, international, worldwide = data
    else:
        # Any other count cannot be mapped onto the three columns. Report it
        # as missing instead of failing on the tuple unpack.
        print('Data not found.')
        return not_found

    if budget:
        budget_values = extract_values_from_html_array(budget)
        # A single figure is not treated as a budget (presumably it is another
        # summary value such as the opening gross -- TODO confirm); otherwise
        # the last figure is used. An empty extraction also yields 0 rather
        # than raising IndexError.
        budget_amount = budget_values[-1] if len(budget_values) > 1 else 0
        print("Data: ", data)
    else:
        budget_amount = 0

    return domestic, international, worldwide, budget_amount




In [9]:
def append_movie_data_to_csv(input_file, output_file):
    """Copy a CSV file, appending box-office columns looked up per movie.

    The header row gains the columns ``domestic_revenue``,
    ``international_revenue``, ``worldwide_revenue`` and ``budget``. For each
    data row, the value in the second column is passed to ``revenue_budget``
    and the four returned values are appended to the row.

    Args:
        input_file: Path of the CSV to read. The first row is treated as the
            header and the movie name is taken from the second column.
        output_file: Path of the CSV to write; an existing file is replaced.

    Returns:
        None. If the input has no header row, the output file is left empty.
        Rows with fewer than two columns (including blank lines) are copied
        unchanged, because they carry no movie name to look up.
    """
    # Both files use newline='' as the csv module requires, so quoted fields
    # containing line breaks are read and written intact.
    with open(input_file, 'r', newline='') as csv_input, \
            open(output_file, 'w', newline='') as csv_output:
        reader = csv.reader(csv_input)
        writer = csv.writer(csv_output)

        # Read the header row and add new columns to it
        header = next(reader, None)
        if header is None:
            # Empty input: nothing to enrich.
            return
        header.extend(["domestic_revenue", "international_revenue", "worldwide_revenue", "budget"])
        writer.writerow(header)

        # Iterate through the rows and append movie data
        for row in reader:
            if len(row) < 2:
                # csv.reader yields [] for blank lines; such rows have no
                # title column, so pass them through instead of crashing.
                writer.writerow(row)
                continue

            movie_name = row[1]  # Assuming movie name is in the second column
            domestic_revenue, international_revenue, worldwide_revenue, budget = revenue_budget(movie_name)
            print("csv data: ", domestic_revenue, international_revenue, worldwide_revenue, budget)
            row.extend([domestic_revenue, international_revenue, worldwide_revenue, budget])
            writer.writerow(row)

# Example usage: write a copy of the input CSV with the box-office columns
# appended. Both paths are relative to the current working directory.
input_file = './sample_data/comb_rtdata3.csv'
output_file = './sample_data/output_with_movie_data.csv'

# Each data row triggers a search request and, when a result is found, a
# title-page request, so this call is slow for large inputs. Any existing
# output file is replaced.
append_movie_data_to_csv(input_file, output_file)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
movie url:: https://www.boxofficemojo.com/title/tt8664988/?ref_=bo_se_r_1
csv data:  2794392 0 2794392 0
Title:  In Time
results:  <a class="a-size-medium a-link-normal a-text-bold" href="/title/tt1637688/?ref_=bo_se_r_1">In Time</a>
/title/tt1637688/?ref_=bo_se_r_1
movie url:: https://www.boxofficemojo.com/title/tt1637688/?ref_=bo_se_r_1
Data:  [37520095, 136410501, 173930596]
csv data:  37520095 136410501 173930596 40000000
Title:  Flinch
results:  <a class="a-size-medium a-link-normal a-text-bold" href="/title/tt3420504/?ref_=bo_se_r_1">Finch</a>
/title/tt3420504/?ref_=bo_se_r_1
movie url:: https://www.boxofficemojo.com/title/tt3420504/?ref_=bo_se_r_1
Data not found.
csv data:  0 0 0 0
Title:  Transamerica
results:  <a class="a-size-medium a-link-normal a-text-bold" href="/title/tt0407265/?ref_=bo_se_r_1">Transamerica</a>
/title/tt0407265/?ref_=bo_se_r_1
movie url:: https://www.boxofficemojo.com/title/tt0407265/?ref_=b

In [7]:
import os

import pandas as pd

# Create an empty placeholder for the results file, but only when it does not
# exist yet. This cell sits after the scraping cell in the notebook, so writing
# unconditionally would replace the scraped results with an empty file whenever
# the notebook is run from top to bottom.
_placeholder_path = './sample_data/output_with_movie_data.csv'
df = pd.DataFrame()
if not os.path.exists(_placeholder_path):
    df.to_csv(_placeholder_path, index=False)