In [1]:
from bs4 import BeautifulSoup as bts
import requests
import pandas as pd
import pickle 
import numpy as np
from datetime import datetime
import re

In [2]:
# Load the collection of page URLs that get_movie_data() requests one by one.
# NOTE: unpickling can execute arbitrary code, so only load a file you trust.
links = pd.read_pickle('links.pickle')

In [3]:
# Total number of links available to scrape.
len(links)

3152

In [4]:
def convert_raters(rating):
    """Convert an abbreviated vote-count string into a float.

    Parameters
    ----------
    rating : str or None
        Count text such as ``'59K'``, ``'1.5M'`` or a plain number such as
        ``'532'`` (thousands separators like ``'1,234'`` are accepted).

    Returns
    -------
    float or None
        The count as a float, or ``None`` when ``rating`` is ``None`` or is
        not recognisable as a number.

    Raises
    ------
    ValueError
        If a ``K``/``M`` suffix is present but the remainder is not numeric.
    """
    if rating is None:
        return None

    count = rating.strip()

    # 'M' is checked before 'K', matching the original precedence.
    for suffix, factor in (('M', 1000000), ('K', 1000)):
        if suffix in count:
            # float() ignores the blank left behind by the replaced suffix.
            return float(count.replace(suffix, ' ')) * factor

    # Counts below one thousand carry no suffix; previously these were
    # discarded as None even though they are valid numbers.
    plain = count.replace(',', '')
    if plain.replace('.', '', 1).isdigit():
        return float(plain)
    return None

In [5]:
def convert_runtime(runtime):
    """Convert a runtime string into a total number of minutes.

    Each number is interpreted according to the unit that follows it, so
    ``'1 hour 53 minutes'`` gives 113, ``'2 hours'`` gives 120 and
    ``'45 minutes'`` gives 45. (Previously a minutes-only string was treated
    as hours and came back multiplied by 60.) Compact forms such as
    ``'1h 53m'`` are accepted as well.

    Parameters
    ----------
    runtime : str
        Runtime text containing one or more ``<number> <unit>`` pairs where
        the unit starts with ``h`` (hours) or ``m`` (minutes).

    Returns
    -------
    int
        Total runtime in minutes.

    Raises
    ------
    ValueError
        If no ``<number> <unit>`` pair can be found in ``runtime``.
    """
    minutes_per_unit = {'h': 60, 'm': 1}

    # Capture every number together with the first letter of its unit.
    parts = re.findall(r'(\d+)\s*([hm])', runtime.lower())
    if not parts:
        raise ValueError('unrecognised runtime: %r' % (runtime,))

    return sum(int(value) * minutes_per_unit[unit] for value, unit in parts)

In [6]:
def _value_or_none(extract):
    """Call ``extract()`` and return its result, or ``None`` if it raises.

    Only ``Exception`` subclasses are absorbed (a missing tag surfaces as
    AttributeError/IndexError/TypeError, a malformed number as ValueError),
    so KeyboardInterrupt and SystemExit still stop a long-running scrape.
    """
    try:
        return extract()
    except Exception:
        return None


def _tag_texts(tags):
    """Return the ``.text`` of every element in ``tags`` as a list."""
    return [tag.text for tag in tags]


def _parse_dollars(amount):
    """Convert a string like '$12,381,585' to the int 12381585."""
    return int(amount.replace('$', '').replace(',', ''))


def _parse_release_date(label):
    """Parse text shaped like 'January 11, 2008 ...' into a datetime.

    The first three space-separated tokens are read as full month name,
    day (anything after a comma is dropped) and four-digit year.
    """
    pieces = label.split(" ")
    day = pieces[1].split(",")[0]
    return datetime.strptime(day + "-" + pieces[0] + "-" + pieces[2], '%d-%B-%Y')


def _scrape_movie(movie):
    """Build one record dict from a parsed page (a BeautifulSoup object).

    Every field is extracted independently; a field whose markup is missing
    or unparseable is stored as ``None`` without affecting the others.
    """
    # These lookups return None / an empty list when nothing matches, so they
    # are safe to do once up front; the failures happen (and are absorbed)
    # when a field later dereferences them.
    principal_credits = movie.find_all('li', {'data-testid': 'title-pc-principal-credit'})
    details = movie.find('section', {'data-testid': 'Details'})
    box_office = movie.find('div', {'data-testid': 'title-boxoffice-section'})
    tech_specs = movie.find('div', {'data-testid': 'title-techspecs-section'})

    def detail_items(pattern):
        # Texts of the <li> items in the element following the matched label.
        return _tag_texts(details.find(string=pattern).find_next().find_all('li'))

    def box_office_amount(pattern):
        # Dollar figure held in the <span> of the element following the label.
        return _parse_dollars(box_office.find(string=pattern).find_next().span.text)

    # NOTE(review): the 'sc-…' class names below look build-generated and may
    # go stale when the site is redeployed — confirm against the live page.
    return {
        "title": _value_or_none(
            lambda: movie.find('div', class_='sc-94726ce4-1 iNShGo').h1.text),
        "rating": _value_or_none(
            lambda: movie.find('div', class_='sc-7ab21ed2-2 kYEdvH').span.text),
        "mpaa": _value_or_none(
            lambda: movie.find('li', {'data-testid': 'storyline-certificate'}).a.find_next().text),
        "raters": _value_or_none(
            lambda: convert_raters(movie.find('div', class_='sc-7ab21ed2-3 dPVcnq').text)),
        # Iterates the direct children of the genres <ul>.
        "genres": _value_or_none(
            lambda: _tag_texts(movie.find('li', {"data-testid": "storyline-genres"}).ul)),
        # The principal-credit items are read positionally:
        # [0] director, [1] writers, [2] stars.
        "director": _value_or_none(lambda: principal_credits[0].a.text),
        "writers": _value_or_none(
            lambda: _tag_texts(principal_credits[1].ul.find_all('li'))),
        "stars": _value_or_none(
            lambda: _tag_texts(principal_credits[2].ul.find_all('li'))),
        "release date": _value_or_none(
            lambda: _parse_release_date(details.find(string='Release date').find_next().text)),
        "origin": _value_or_none(lambda: detail_items(re.compile('origin'))),
        "language": _value_or_none(lambda: detail_items(re.compile('Lang'))),
        # Only the first space-separated token of the budget text is parsed.
        'budget': _value_or_none(
            lambda: _parse_dollars(
                box_office.find(string='Budget').find_next().text.split(' ')[0])),
        'opening weekend': _value_or_none(lambda: box_office_amount(re.compile('Opening'))),
        'gross usa': _value_or_none(lambda: box_office_amount(re.compile('Gross US'))),
        'gross ww': _value_or_none(lambda: box_office_amount(re.compile('Gross worldwide'))),
        'runtime(min)': _value_or_none(
            lambda: convert_runtime(tech_specs.find(string=re.compile('Run')).find_next().text)),
        'production co': _value_or_none(
            lambda: _tag_texts(
                movie.find('li', {'data-testid': 'title-details-companies'}).ul.find_all('li'))),
    }


def get_movie_data(links, timeout=30):
    """Download each page in ``links`` and extract one record per page.

    Parameters
    ----------
    links : iterable of str
        URLs to fetch, one HTTP GET per entry.
    timeout : float, optional
        Seconds to wait on each request before ``requests`` gives up
        (default 30). Without a timeout a stalled connection would block
        the whole scrape indefinitely.

    Returns
    -------
    list of dict
        One dict per link, in input order, with the keys ``title``,
        ``rating``, ``mpaa``, ``raters``, ``genres``, ``director``,
        ``writers``, ``stars``, ``release date``, ``origin``, ``language``,
        ``budget``, ``opening weekend``, ``gross usa``, ``gross ww``,
        ``runtime(min)`` and ``production co``. A field that cannot be
        extracted is ``None``.

    Raises
    ------
    requests.exceptions.RequestException
        Network failures (including timeouts) propagate to the caller.

    Notes
    -----
    The HTTP status code is not checked: whatever body comes back is parsed,
    and fields that cannot be located in it are recorded as ``None``.
    KeyboardInterrupt is no longer swallowed by the per-field error handling,
    so the loop can be stopped from the keyboard.
    """
    data = []
    for link in links:
        result = requests.get(link, timeout=timeout)
        movie = bts(result.text, 'html.parser')
        data.append(_scrape_movie(movie))
    return data

In [7]:
# Scrape the last batch of links. The slice end (3153) is past len(links) == 3152;
# Python clamps it, so this covers the 152 links from index 3000 to the end.
movies_data = get_movie_data(links[3000:3153])

In [8]:
# Build a table with one row per scraped page; the record keys become the columns.
movies = pd.DataFrame(movies_data)
movies.head()

Unnamed: 0,title,rating,mpaa,raters,genres,director,writers,stars,release date,origin,language,budget,opening weekend,gross usa,gross ww,runtime(min),production co
0,"4 luni, 3 saptamâni si 2 zile",7.9,Not Rated,59000.0,[Drama],Cristian Mungiu,[Cristian Mungiu],"[Anamaria Marinca, Laura Vasiliu, Vlad Ivanov]",2008-01-11,"[Romania, Belgium]",[Romanian],,51712.0,1198208.0,10174839.0,113,"[Mobra Films, Centrul National al Cinematograf..."
1,Charlie St. Cloud,6.4,Rated PG-13 for language including some sexual...,62000.0,"[Drama, Fantasy, Romance]",Burr Steers,"[Craig Pearce(screenplay), Lewis Colick(screen...","[Zac Efron, Kim Basinger, Charlie Tahan]",2010-09-24,"[Canada, United States]",[English],44000000.0,12381585.0,31162545.0,48190704.0,100,"[Universal Pictures, Relativity Media, Marc Pl..."
2,Get the Gringo,6.9,"Rated R for strong bloody violence, pervasive ...",107000.0,"[Action, Crime, Drama]",Adrian Grunberg,"[Mel Gibson, Stacy Perskie, Adrian Grunberg]","[Mel Gibson, Kevin Balmore, Daniel Giménez Cacho]",2012-06-22,"[United States, Mexico]","[English, Spanish]",20000000.0,,,8826837.0,96,"[Icon Productions, Icon Productions, Airborne ..."
3,Gridiron Gang,7.0,Rated PG-13 for some startling scenes of viole...,62000.0,"[Biography, Crime, Drama, Sport]",Phil Joanou,"[Jeff Maguire, Jac Flanders(film ""Gridiron Gan...","[Dwayne Johnson, Xzibit, L. Scott Caldwell]",2021-03-01,[United States],[English],30000000.0,14414630.0,38432823.0,41480851.0,125,"[Columbia Pictures Industries, Original Film, ..."
4,The X Files: I Want to Believe,5.9,Rated PG-13 for violent and disturbing content...,91000.0,"[Crime, Drama, Horror, Mystery, Sci-Fi, Thriller]",Chris Carter,"[Frank Spotnitz, Chris Carter(television serie...","[David Duchovny, Gillian Anderson, Billy Conno...",2008-09-12,"[United States, Canada]","[English, Russian, Czech]",30000000.0,10021753.0,20982478.0,69363381.0,104,"[Twentieth Century Fox, Ten Thirteen Productio..."


In [9]:
# Column dtypes and non-null counts, to see which fields were frequently missing.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152 entries, 0 to 151
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   title            152 non-null    object        
 1   rating           152 non-null    object        
 2   mpaa             151 non-null    object        
 3   raters           152 non-null    float64       
 4   genres           152 non-null    object        
 5   director         152 non-null    object        
 6   writers          152 non-null    object        
 7   stars            152 non-null    object        
 8   release date     151 non-null    datetime64[ns]
 9   origin           152 non-null    object        
 10  language         152 non-null    object        
 11  budget           96 non-null     float64       
 12  opening weekend  117 non-null    float64       
 13  gross usa        120 non-null    float64       
 14  gross ww         144 non-null    float64  

In [10]:
# Inspect the rows whose release date could not be extracted or parsed.
movies[movies['release date'].isnull()]

Unnamed: 0,title,rating,mpaa,raters,genres,director,writers,stars,release date,origin,language,budget,opening weekend,gross usa,gross ww,runtime(min),production co
63,Dara iz Jasenovca,8.8,Rated R for strong and disturbing violent cont...,81000.0,"[Drama, War]",Predrag Antonijevic,[Natasa Drakulic],"[Biljana Cekic, Zlatan Vidovic, Anja Stanic]",NaT,[Serbia],"[Serbian, German]",3600000.0,16710.0,52786.0,52786.0,130,"[Dandelion Production Inc, Film Danas, Komuna]"


In [11]:
# Persist this batch's DataFrame to disk as a pickle file.
with open('movies_7.pickle', 'wb') as out_file:
    pickle.dump(movies, out_file)

In [12]:
# df[df['opening weekend'].isnull()]
# df['opening weekend'].fillna(np.nan, inplace=True)
# df