In [105]:
import requests
import csv
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

In [249]:
def movie_scrapper(url, m_results, timeout=30):
    """Download ``url`` and append one dict per results-table row to ``m_results``.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    m_results : list
        Mutated in place. Each qualifying table row is appended as a dict with
        the keys Movie_Rank, Movie_Name, Worldwide_Gross, Domestic_Gross,
        Domestic_Percent, Foreign_Gross and Foreign_Percent, taken from the
        first seven ``<td>`` cells of the row (whitespace-stripped text).
    timeout : float, optional
        Seconds to wait on the server before ``requests`` raises. Without a
        timeout a stalled connection would block the notebook indefinitely.

    Returns
    -------
    str or None
        ``'status failed with <code>'`` when the HTTP status is not 200,
        otherwise ``None``.
    """
    columns = (
        "Movie_Rank",
        "Movie_Name",
        "Worldwide_Gross",
        "Domestic_Gross",
        "Domestic_Percent",
        "Foreign_Gross",
        "Foreign_Percent",
    )

    results = requests.get(url, timeout=timeout)

    if results.status_code != 200:
        message = f'status failed with {results.status_code}'
        # The message is still returned for compatibility, but it is also
        # printed so a failed page is visible even when the caller ignores
        # the return value.
        print(f"{url}: {message}")
        return message

    # use BeautifulSoup to parse the contents of the request
    soup = BeautifulSoup(results.content, "html.parser")

    movie_results = soup.find(id="table")
    if not movie_results:
        print("Table element not found.")
        return None

    table = movie_results.find('table', class_="a-bordered a-horizontal-stripes a-size-base a-span12 mojo-body-table mojo-table-annotated")
    if not table:
        print("Inner table not found.")
        return None

    # The first <tr> is skipped (presumably the header row -- confirm against the page).
    for row in table.find_all("tr")[1:]:
        cells = row.find_all("td")
        # Guard against short rows so the column mapping never misaligns.
        if len(cells) < len(columns):
            print("Row has insufficient data.")
            continue
        # zip stops after the seven named columns; extra cells are ignored.
        m_results.append(
            {column: cell.text.strip() for column, cell in zip(columns, cells)}
        )
    return None
                

In [259]:
# Scrape one page per year and collect every row into a single DataFrame.
target_url = 'https://www.boxofficemojo.com/year/world/'
# URL suffixes "2000/" through "2025/".
years = [f"{year_number}/" for year_number in range(2000, 2026)]
m_res = []
for year in years:
    url = target_url + year
    movie_scrapper(url, m_res)
df = pd.DataFrame(m_res)
print(df.head())

  Movie_Rank              Movie_Name Worldwide_Gross Domestic_Gross  \
0          1  Mission: Impossible II    $546,388,108   $215,409,889   
1          2               Gladiator    $460,583,960   $187,705,427   
2          3               Cast Away    $429,632,142   $233,632,142   
3          4         What Women Want    $374,111,707   $182,811,707   
4          5                Dinosaur    $349,822,765   $137,748,063   

  Domestic_Percent Foreign_Gross Foreign_Percent  
0            39.4%  $330,978,219           60.6%  
1            40.8%  $272,878,533           59.2%  
2            54.4%  $196,000,000           45.6%  
3            48.9%  $191,300,000           51.1%  
4            39.4%  $212,074,702           60.6%  


In [265]:
# Persist the scraped rows (without the index) so cleaning can start from disk.
df.to_csv(path_or_buf="movie_dataset_grossing.csv", index=False)

In [291]:
# Reload the scraped data from the CSV written above.
df = pd.read_csv(filepath_or_buffer="movie_dataset_grossing.csv")

In [293]:
# Cells whose entire value is '-' are treated as missing.
df = df.replace(to_replace='-', value=np.nan)

In [333]:
# Summarise column dtypes and non-null counts.
# NOTE(review): this cell's execution count (333) falls between the
# Foreign_Gross rounding cell (331) and the Foreign_Percent conversion (336),
# so the dtypes printed below reflect a partially cleaned frame, not the
# freshly loaded CSV. Re-run the notebook top to bottom to refresh it.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5200 entries, 0 to 5199
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Movie_Rank        5200 non-null   int64  
 1   Movie_Name        5200 non-null   object 
 2   Worldwide_Gross   5200 non-null   object 
 3   Domestic_Gross    5200 non-null   float64
 4   Domestic_Percent  5200 non-null   float64
 5   Foreign_Gross     5200 non-null   float64
 6   Foreign_Percent   5148 non-null   object 
dtypes: float64(3), int64(1), object(3)
memory usage: 284.5+ KB


# Cleaning of values

Convert `Domestic_Gross` to a numeric column and replace missing values with the column mean.

In [299]:
# Strip the currency formatting and parse Domestic_Gross as a number.
# regex=False: '$' is a regex end-of-string anchor, so the literal match must
# be requested explicitly instead of relying on the pandas-version default.
# astype(str): keeps the cell re-runnable once the column is already numeric
# (the .str accessor raises on a float column); 'nan' parses back to NaN.
df['Domestic_Gross'] = pd.to_numeric(
    df['Domestic_Gross']
    .astype(str)
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False),
    errors='coerce',  # anything still unparseable becomes NaN
)
# print(df['Domestic_Gross'])


0       215409889.0
1       187705427.0
2       233632142.0
3       182811707.0
4       137748063.0
           ...     
5195            NaN
5196            NaN
5197            NaN
5198            NaN
5199       230515.0
Name: Domestic_Gross, Length: 5200, dtype: float64


In [301]:
# Mean of the parsed Domestic_Gross values.
# The name `mean` is overwritten by the later per-column mean cells, so run
# this immediately before the Domestic_Gross fillna cell.
mean = df.loc[:, 'Domestic_Gross'].mean()
print(mean)

58006871.413312696


In [303]:
# Impute missing Domestic_Gross values with the column mean held in `mean`.
# NOTE(review): in the outputs shown in this notebook, rows 5195-5198 have a
# missing domestic gross, a foreign gross of roughly 0.3M and Foreign_Percent
# of 100.0; mean imputation assigns them a ~58M domestic gross. Confirm that
# the mean (rather than 0) is the intended fill value.
df['Domestic_Gross'] = df['Domestic_Gross'].fillna(value=mean)
print(df['Domestic_Gross'])

0       2.154099e+08
1       1.877054e+08
2       2.336321e+08
3       1.828117e+08
4       1.377481e+08
            ...     
5195    5.800687e+07
5196    5.800687e+07
5197    5.800687e+07
5198    5.800687e+07
5199    2.305150e+05
Name: Domestic_Gross, Length: 5200, dtype: float64


In [306]:
# Keep two decimal places on Domestic_Gross.
df['Domestic_Gross'] = df['Domestic_Gross'].round(decimals=2)

In [308]:
# Show the rounded Domestic_Gross column.
print(df.loc[:, 'Domestic_Gross'])

0       2.154099e+08
1       1.877054e+08
2       2.336321e+08
3       1.828117e+08
4       1.377481e+08
            ...     
5195    5.800687e+07
5196    5.800687e+07
5197    5.800687e+07
5198    5.800687e+07
5199    2.305150e+05
Name: Domestic_Gross, Length: 5200, dtype: float64


Convert `Domestic_Percent` to a numeric column and replace missing values with the column mean.

In [312]:
# Strip the percent sign / thousands separators and parse Domestic_Percent.
# astype(str) keeps the cell re-runnable once the column is already numeric
# (the .str accessor raises on a float column); 'nan' parses back to NaN.
# regex=False states explicitly that the patterns are literal characters.
df['Domestic_Percent'] = pd.to_numeric(
    df['Domestic_Percent']
    .astype(str)
    .str.replace('%', '', regex=False)
    .str.replace(',', '', regex=False),
    errors='coerce',  # anything still unparseable becomes NaN
)
print(df['Domestic_Percent'])

0       39.4
1       40.8
2       54.4
3       48.9
4       39.4
        ... 
5195     NaN
5196     NaN
5197     NaN
5198     NaN
5199    74.4
Name: Domestic_Percent, Length: 5200, dtype: float64


In [314]:
# Mean of the parsed Domestic_Percent values.
# `mean` is shared with the other columns' imputation steps, so run this
# immediately before the Domestic_Percent fillna cell.
mean = df.loc[:, 'Domestic_Percent'].mean()
print(mean)

47.414158829676076


In [316]:
# Impute missing Domestic_Percent values with the column mean held in `mean`.
df['Domestic_Percent'] = df['Domestic_Percent'].fillna(value=mean)
print(df['Domestic_Percent'])

0       39.400000
1       40.800000
2       54.400000
3       48.900000
4       39.400000
          ...    
5195    47.414159
5196    47.414159
5197    47.414159
5198    47.414159
5199    74.400000
Name: Domestic_Percent, Length: 5200, dtype: float64


In [320]:
# Keep two decimal places on Domestic_Percent (the imputed mean has more).
df['Domestic_Percent'] = df['Domestic_Percent'].round(decimals=2)
print(df['Domestic_Percent'])

0       39.40
1       40.80
2       54.40
3       48.90
4       39.40
        ...  
5195    47.41
5196    47.41
5197    47.41
5198    47.41
5199    74.40
Name: Domestic_Percent, Length: 5200, dtype: float64


Convert `Foreign_Gross` to a numeric column and replace missing values with the column mean.

In [325]:
# Strip the currency formatting and parse Foreign_Gross as a number.
# regex=False: '$' is a regex end-of-string anchor, so the literal match must
# be requested explicitly instead of relying on the pandas-version default.
# astype(str): keeps the cell re-runnable once the column is already numeric
# (the .str accessor raises on a float column); 'nan' parses back to NaN.
df['Foreign_Gross'] = pd.to_numeric(
    df['Foreign_Gross']
    .astype(str)
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False),
    errors='coerce',  # anything still unparseable becomes NaN
)
print(df['Foreign_Gross'])

0       330978219.0
1       272878533.0
2       196000000.0
3       191300000.0
4       212074702.0
           ...     
5195       323655.0
5196       315475.0
5197       313178.0
5198       311335.0
5199        79257.0
Name: Foreign_Gross, Length: 5200, dtype: float64


In [327]:
# Mean of the parsed Foreign_Gross values.
# `mean` is shared with the other columns' imputation steps, so run this
# immediately before the Foreign_Gross fillna cell.
mean = df.loc[:, 'Foreign_Gross'].mean()
print(mean)

73930646.70532246


In [329]:
# Impute missing Foreign_Gross values with the column mean held in `mean`.
df['Foreign_Gross'] = df['Foreign_Gross'].fillna(value=mean)
print(df['Foreign_Gross'])

0       330978219.0
1       272878533.0
2       196000000.0
3       191300000.0
4       212074702.0
           ...     
5195       323655.0
5196       315475.0
5197       313178.0
5198       311335.0
5199        79257.0
Name: Foreign_Gross, Length: 5200, dtype: float64


In [331]:
# Keep two decimal places on Foreign_Gross.
df['Foreign_Gross'] = df['Foreign_Gross'].round(decimals=2)
print(df['Foreign_Gross'])

0       330978219.0
1       272878533.0
2       196000000.0
3       191300000.0
4       212074702.0
           ...     
5195       323655.0
5196       315475.0
5197       313178.0
5198       311335.0
5199        79257.0
Name: Foreign_Gross, Length: 5200, dtype: float64


Convert `Foreign_Percent` to a numeric column and replace missing values with the column mean.

In [336]:
# Strip the percent sign / thousands separators and parse Foreign_Percent.
# astype(str) keeps the cell re-runnable once the column is already numeric
# (the .str accessor raises on a float column); 'nan' parses back to NaN.
# regex=False states explicitly that the patterns are literal characters.
df['Foreign_Percent'] = pd.to_numeric(
    df['Foreign_Percent']
    .astype(str)
    .str.replace('%', '', regex=False)
    .str.replace(',', '', regex=False),
    errors='coerce',  # anything still unparseable becomes NaN
)

In [338]:
# Mean of the parsed Foreign_Percent values.
# `mean` is shared with the other columns' imputation steps, so run this
# immediately before the Foreign_Percent fillna cell.
mean = df.loc[:, 'Foreign_Percent'].mean()
print(mean)

65.87843518878941


In [340]:
# Impute missing Foreign_Percent values with the column mean held in `mean`.
df['Foreign_Percent'] = df['Foreign_Percent'].fillna(value=mean)
print(df['Foreign_Percent'])

0        60.6
1        59.2
2        45.6
3        51.1
4        60.6
        ...  
5195    100.0
5196    100.0
5197    100.0
5198    100.0
5199     25.6
Name: Foreign_Percent, Length: 5200, dtype: float64


In [342]:
# Keep two decimal places on Foreign_Percent (the imputed mean has more).
df['Foreign_Percent'] = df['Foreign_Percent'].round(decimals=2)
print(df['Foreign_Percent'])

0        60.6
1        59.2
2        45.6
3        51.1
4        60.6
        ...  
5195    100.0
5196    100.0
5197    100.0
5198    100.0
5199     25.6
Name: Foreign_Percent, Length: 5200, dtype: float64


In [344]:
# Write the cleaned frame (without the index) to its own CSV.
# NOTE(review): none of the cells visible here convert Worldwide_Gross, so it
# is presumably still written as formatted text -- confirm whether that is intended.
df.to_csv(path_or_buf="movie_dataset_grossing_cleaned.csv", index=False)