In [1]:
from requests import get, ReadTimeout, ConnectTimeout, HTTPError, Timeout, ConnectionError
from bs4 import BeautifulSoup
import re
import pandas as pd
from time import sleep, time
import time
from random import randint
from IPython.core.display import clear_output
from warnings import warn
import numpy as np
from functools import wraps
from colorama import Fore

In [None]:
def retry(ExceptionToCheck, tries=20, delay=3, backoff=2, logger=None):
  """
    Modified from source: http://wiki.python.org/moin/PythonDecoratorLibrary#Retry

    Objective
    ----------
    Decorator factory that re-calls the wrapped function with exponential
    backoff while it keeps raising `ExceptionToCheck`.

    Parameters
    ----------
    ExceptionToCheck : Exception or tuple
      the exception to check. may be a tuple of exceptions to check
      Possible values:
        ConnectionResetError, (TimeoutError, ConnectionError)
    tries : int
      total number of calls to attempt (not the number of retries) before
      giving up; the last attempt is made without a handler, so its
      exception propagates to the caller
    delay : int
      initial delay between retries in seconds
    backoff: int
      backoff multiplier
      E.g. value of 2 will double the delay each retry
    logger : logging.Logger instance
      logger to use; when omitted the retry message is printed

    Returns
    ----------
    A decorator. The decorated function returns whatever the wrapped
    function returns on its first successful call.
  """
  def deco_retry(f):
    @wraps(f)
    def f_retry(*args, **kwargs):
      mtries, mdelay = tries, delay
      # The first (tries - 1) attempts are guarded; each failure sleeps and
      # lengthens the delay before the next attempt.
      while mtries > 1:
        try:
          return f(*args, **kwargs)
        except ExceptionToCheck as e:
          # Report the exception that was actually raised, not the class tuple.
          msg = "%s, Retrying in %d seconds..." % (repr(e), mdelay)
          if logger:
            logger.warning(msg)
          else:
            print(msg)
          time.sleep(mdelay)
          mtries -= 1
          mdelay *= backoff
      # Final attempt: must sit outside the loop so that every guarded
      # attempt is used up first; any exception now reaches the caller.
      return f(*args, **kwargs)
    # Must be returned from deco_retry (not from inside f_retry), otherwise
    # the decorator replaces the function with None.
    return f_retry  # true decorator
  return deco_retry

In [3]:
# Store data in lists
def _report(found, label):
  """Print `label` in green when the field was scraped, in red when it is missing."""
  print((Fore.GREEN if found else Fore.RED) + label)


def _missing(value):
  """Return True when `value` is the NaN placeholder stored for unscraped fields."""
  return isinstance(value, float) and value != value


def _attempt(extract, default=None):
  """Call `extract()`; on any Exception return `default` (NaN when omitted).

  Only `Exception` is caught, so Ctrl-C / kernel interrupts still stop the scrape.
  """
  try:
    return extract()
  except Exception:
    return np.nan if default is None else default


def _strip_brackets(text):
  """Remove "(...)" and "[...]" annotations from `text`."""
  return re.sub(r"[\(\[].*?[\)\]]", "", text)


def _fetch_html(url):
  """GET `url`, retrying the listed network errors, and return the parsed page."""
  @retry((ReadTimeout, ConnectTimeout, HTTPError, Timeout, ConnectionError), tries=20, delay=2, backoff=2)
  def _download():
    return get(url)
  return BeautifulSoup(_download().text, 'html.parser')


def _try_fetch_html(url):
  """Like `_fetch_html`, but return None when the page cannot be loaded.

  Returning None (instead of leaving a variable from the previous movie in
  place) makes every dependent section fall back to NaN for this movie.
  """
  try:
    return _fetch_html(url)
  except Exception:
    return None


def _labelled_text(details_html, label):
  """Return the stripped text of the block that contains the string `label`.

  The first <span> that follows the label's parent is removed from
  `details_html` beforehand, so this call mutates the parsed page.
  """
  details_html.find(text=label).parent.findNext('span').decompose()
  return details_html.find(text=label).parent.parent.text.strip()


def _parse_sound_mix(details_html):
  """Return the "|"-separated sound-mix entries with bracketed notes removed."""
  raw = details_html.find(text='Sound Mix:').parent.parent.text
  raw = raw.replace("Sound Mix:", "").replace("\n", "").strip()
  return [_strip_brackets(part.strip()) for part in raw.split("|")]


def _parse_stars(details_html):
  """Return (names, hrefs) of the links next to "Stars:", minus "See full cast"."""
  links = details_html.find('h4', text='Stars:').parent.find_all('a')
  people = [link for link in links if 'See full cast' not in link.text]
  return [link.text for link in people], [link['href'] for link in people]


def _parse_cast(cast_html):
  """Return (names, hrefs) from the `cast_list` table of the credits page.

  The first <tr> after the table start is removed from the document first.
  Rows with class "odd" are listed before rows with class "even", matching
  the order the scraper has always written.
  """
  table = cast_html.find('table', class_='cast_list')
  table.findNext('tr').decompose()
  rows = list(table.findAll('tr', class_='odd')) + list(table.findAll('tr', class_='even'))
  members, anchors = [], []
  for row in rows:
    cell = row.findAll('td')[1]
    members.append(cell.text.strip())
    anchors.append(cell.a['href'])
  return members, anchors


def _parse_directors(cast_html):
  """Return (names, hrefs, credits) from the first table of `fullcredits_content`.

  Rows whose name contains no ASCII letter get the placeholder 'null' in
  `names` and contribute nothing to `hrefs` / `credits`.
  """
  names, anchors, credits = [], [], []
  rows = cast_html.find('div', attrs={'id': 'fullcredits_content'}).table.find_all('tr')
  for row in rows:
    name_cell = row.find('td', class_='name')
    director = name_cell.a.text.replace("\n", "").strip()
    if re.search('[a-zA-Z]', director):
      names.append(director)
      anchor = name_cell.a['href']
      credits.append(_attempt(lambda: row.find('td', class_='credit').text.strip()))
      anchors.append(anchor)
    else:
      names.append('null')
  return names, anchors, credits


def _first_credit(cast_html, heading):
  """Return (name, href) of the first linked person under the <h4> containing `heading`.

  If several headers match, the last one wins. (NaN, NaN) is returned when no
  header matches or the expected table markup is absent.
  """
  try:
    name, anchor = np.nan, np.nan
    for header in cast_html.find_all('h4'):
      if heading in header.text:
        link = header.find_next('table').tbody.tr.td.a
        name = link.text.replace("\n", "").strip()
        anchor = link['href']
    return name, anchor
  except Exception:
    return np.nan, np.nan


def _parse_sequels(sequel_html):
  """Return the titles listed in the "Follows" section of the connections page.

  The number of titles is taken from the digits on the "Follows" line next to
  the `#follows` jump link. Raises when that line or section is absent.
  """
  jump_lines = sequel_html.find('a', attrs={'href': '#follows'}).parent.text.split("\n")
  totals = [re.sub(r"\D", "", line) for line in jump_lines if "Follows" in line]
  if not totals:
    raise LookupError("no 'Follows' entry on the connections page")
  total = int(totals[0])
  blocks = sequel_html.find("a", {"id": "follows"}).parent.findAll('div')
  return [block.a.text for block in blocks[:total]]


def _credit_table(cast_html, is_heading, name_from_link):
  """Return (names, hrefs, credits) from the table after a credits header.

  `is_heading(text)` selects the <h4 class="dataHeaderWithBorder"> header (the
  last match wins). Raises LookupError when nothing matches, so a previous
  section's header can never be reused by accident. Rows that contain a
  `colspan="3"` cell are skipped. `name_from_link` chooses between the text of
  the name cell's link and the text of the whole name cell.
  """
  headers = cast_html.find_all('h4', class_='dataHeaderWithBorder')
  matching = [header for header in headers if is_heading(header.text)]
  if not matching:
    raise LookupError("credit section not found")
  names, anchors, credits = [], [], []
  for row in matching[-1].find_next('table').tbody.findAll('tr'):
    if row.find('td', attrs={'colspan': '3'}):
      continue
    name_cell = row.find('td', class_='name')
    names.append((name_cell.a.text if name_from_link else name_cell.text).strip())
    anchors.append(_attempt(lambda: name_cell.a['href']))
    credits.append(_attempt(lambda: row.find('td', class_='credit').text.strip()))
  return names, anchors, credits


def _section_label(header):
  """Return a header's text without bracketed notes and without "By" / "by"."""
  return _strip_brackets(header.text.strip()).replace("By", "").replace("by", "").strip()


def _crew_counts(cast_html):
  """Return {section label: number of people} for every credits header.

  The header with id "cast" is counted from the odd/even rows of the
  `cast_list` table; every other header counts the `td.name` cells of the
  table that follows it. A section that cannot be counted is stored as NaN
  under its label.
  """
  counts = {}
  for header in cast_html.find_all('h4', class_='dataHeaderWithBorder'):
    try:
      if header.get('id') == 'cast':
        cast_table = cast_html.find('table', class_='cast_list')
        odd_rows = cast_table.find_all('tr', {'class': 'odd'})
        even_rows = cast_table.find_all('tr', {'class': 'even'})
        counts['Cast'] = len(odd_rows) + len(even_rows)
      else:
        people = header.find_next('table').find_all('td', {'class': 'name'})
        counts[_section_label(header)] = len(people)
    except Exception as e:
      print(f"Error: {e}")
      # Key by the label text, not by the Tag object itself.
      counts[_section_label(header)] = np.nan
      _report(False, 'Cast Count')
  return counts


def _company_counts(production_html):
  """Return {section label: number of <li> entries} for every company-credits header."""
  counts = {}
  for header in production_html.find_all('h4', class_='dataHeaderWithBorder'):
    entries = header.find_next('ul').find_all('li')
    counts[_section_label(header)] = len(entries)
  return counts


def _connection_count(sequel_html, keyword):
  """Return the text between "(" and ")" of the "Jump to" entry containing `keyword`.

  Returns None when the jump bar or the entry is missing. If several entries
  match, the last one wins.
  """
  try:
    jump_bar = sequel_html.find('div', class_='jumpto').text.strip()
    entries = jump_bar.replace("\n", "").replace("\xa0", "").replace("Jump to:", "").split("|")
    found = None
    for entry in entries:
      if keyword in entry:
        found = entry[entry.find("(") + 1:entry.find(")")]
    return found
  except Exception:
    return None


def _scrape_movie(container, count):
  """Scrape one search-result `container` and return its fields as a dict (one CSV row).

  `count` is only used for the progress line. Sections run in a fixed order
  because some of them remove nodes from the parsed title page.
  """
  base_url = "https://www.imdb.com/"
  profile = container.h3.a['href']
  # The title page is mandatory: if it cannot be fetched the error propagates.
  details_html = _fetch_html("https://www.imdb.com" + profile)

  # Movie name
  name = container.h3.a.text
  print(Fore.GREEN + f"{name}({count})")

  # Plot synopsis (NaN when the page only shows IMDb's "no synopsis" notice)
  def _synopsis():
    synopsis_html = _fetch_html(base_url + profile + "plotsummary")
    text = synopsis_html.find("ul", {"id": "plot-synopsis-content"}).li.text.strip()
    if text.startswith("It looks like we don't have a Synopsis for this title yet."):
      return np.nan
    return text
  plot_synopsis_content = _attempt(_synopsis)
  _report(not _missing(plot_synopsis_content), 'Plot Synopsis')

  # Summary
  summary = _attempt(lambda: details_html.find('div', class_='summary_text').text.strip())
  _report(not _missing(summary), 'Summary')

  # Languages
  languages = _attempt(
    lambda: details_html.find(text='Language:').parent.parent.text.strip().replace("Language:", "").replace("\n", ""))
  _report(not _missing(languages), 'Languages')

  # Domestic box office gross
  dom_gross = _attempt(
    lambda: _labelled_text(details_html, 'Gross USA:').replace("Gross USA:", "").replace(',', '').strip())
  _report(not _missing(dom_gross),
          'Box Office Gross: {}'.format('Null' if _missing(dom_gross) else dom_gross))

  # Worldwide box office gross
  int_gross = _attempt(
    lambda: _labelled_text(details_html, 'Cumulative Worldwide Gross:')
    .replace("Cumulative Worldwide Gross:", "").replace(',', '').strip())
  _report(not _missing(int_gross),
          'WorldWide Box Office Gross: {}'.format('Null' if _missing(int_gross) else int_gross))

  # Runtime (taken from the search-result card)
  runtime = _attempt(lambda: container.find('span', class_='runtime').text)
  _report(not _missing(runtime), 'Runtime : {}'.format('Null' if _missing(runtime) else runtime))

  # Sound mix
  sound_mix = _attempt(lambda: _parse_sound_mix(details_html))
  _report(not _missing(sound_mix), 'Soundmix')

  # Stars
  stars, star_anchs = _attempt(lambda: _parse_stars(details_html), (np.nan, np.nan))
  _report(not _missing(stars), 'Stars')

  # Full credits page: cast, directors and the individual crew roles
  cast_html = _try_fetch_html(base_url + profile + "fullcredits")
  cast_members, cast_anchors = _attempt(lambda: _parse_cast(cast_html), (np.nan, np.nan))
  _report(not _missing(cast_members), 'Cast')

  directors, director_anchors, director_credits = _attempt(
    lambda: _parse_directors(cast_html), (np.nan, np.nan, np.nan))
  if _missing(directors):
    _report(False, 'Directors')
  else:
    print(f"Anchors: {director_anchors}")

  cinematographer, cinematographer_anchor = _first_credit(cast_html, 'Cinematography by')
  _report(not _missing(cinematographer), 'Cinematographer')

  musician, musician_anchor = _first_credit(cast_html, 'Music by')
  _report(not _missing(musician), 'Musician')

  prod_designer, prod_designer_anchor = _first_credit(cast_html, 'Production Design by')
  _report(not _missing(prod_designer), 'Production Designer')

  # NOTE(review): the set decorator is scraped and reported but is not part of
  # the output columns; the schema is left unchanged here. Confirm if intended.
  set_decorator, set_decorator_anchor = _first_credit(cast_html, 'Set Decoration by')
  _report(not _missing(set_decorator), 'Set Decorator')

  costume_designer, costume_designer_anchor = _first_credit(cast_html, 'Costume Design by')
  _report(not _missing(costume_designer), 'Costume Designer')

  # Sequels (connections page)
  sequel_html = _try_fetch_html(base_url + profile + "movieconnections")
  sequels = _attempt(lambda: _parse_sequels(sequel_html))
  _report(not _missing(sequels), 'Sequel')

  # Production companies. This page is mandatory too: a fetch failure propagates.
  production_html = _fetch_html(base_url + profile + "companycredits")
  production_companies = _attempt(
    lambda: [link.text for link in
             production_html.find("div", {"id": "company_credits_content"}).ul.findAll('a')])
  _report(not _missing(production_companies), 'Production COs')

  # Screen writers
  try:
    screen_writers, screen_anchors, screen_credits = _credit_table(
      cast_html,
      lambda text: _strip_brackets(text.replace("\n", "").strip()) == 'Writing Credits',
      name_from_link=True)
  except Exception as e:
    print(f"Error: {e}")
    screen_writers, screen_anchors, screen_credits = np.nan, np.nan, np.nan
  _report(not _missing(screen_writers), 'Screen Writers')

  # Producers
  producers, prod_anchors, prod_credits = _attempt(
    lambda: _credit_table(cast_html, lambda text: text.strip() == 'Produced by', name_from_link=False),
    (np.nan, np.nan, np.nan))
  _report(not _missing(producers), 'Producers')

  # Budget
  movie_budget = _attempt(
    lambda: _labelled_text(details_html, 'Budget:').replace(',', '').replace("Budget:", ""))
  _report(not _missing(movie_budget),
          'Budget' if _missing(movie_budget) else 'Budget: {}'.format(movie_budget))

  # Release date: drop the label and any "(...)" note, then the last character
  release_date = _attempt(
    lambda: re.sub(r'\([^)]*\)', '',
                   _labelled_text(details_html, 'Release Date:').replace("Release Date:", ""))[:-1])
  _report(not _missing(release_date),
          'Release' if _missing(release_date) else 'Release: {}'.format(release_date))

  # Genre (from the search-result card)
  movie_genre = _attempt(
    lambda: [genre.replace(",", "").strip() for genre in
             container.find('div', class_='lister-item-content').p
             .find('span', class_='genre').text.strip().split()])
  _report(not _missing(movie_genre),
          'Genre' if _missing(movie_genre) else 'Genre: {}'.format(", ".join(movie_genre)))

  # Special effects companies
  special_effects = _attempt(
    lambda: [item.a.text for item in
             production_html.find("h4", {"id": "specialEffects"}).findNext('ul').findAll('li')])
  _report(not _missing(special_effects), 'Special Effects COs')

  # MPAA rating (from the search-result card)
  mpaa = _attempt(
    lambda: container.find('div', class_='lister-item-content').p.find('span', class_='certificate').text)
  _report(not _missing(mpaa), 'MPAA' if _missing(mpaa) else 'MPAA: {}'.format(mpaa))

  # Distributor (first one listed)
  distributor = _attempt(
    lambda: production_html.find('h4', {'id': 'distributors'}).find_next('ul').li.a.text)
  _report(not _missing(distributor), 'Distributor')

  # Number of people per cast & crew section
  full_cast_dict = _attempt(lambda: _crew_counts(cast_html))
  _report(not _missing(full_cast_dict), 'Cast Count')

  # Number of companies per company-credits section
  full_comp_dict = _attempt(lambda: _company_counts(production_html))
  _report(not _missing(full_comp_dict), 'Company Count')

  # Number of versions / references listed on the connections page (0 when absent)
  version_count = _connection_count(sequel_html, 'Version')
  if version_count is None:
    version_count = 0
    _report(False, "Version")
  else:
    _report(True, f"Version: {version_count}")

  references_count = _connection_count(sequel_html, 'References')
  if references_count is None:
    references_count = 0
    _report(False, "References")
  else:
    _report(True, f"References Count: {references_count}")

  # Column names and order are the CSV schema; 'sequels' and 'sequel' both
  # carry the same value, as they always have.
  return {
    'name': name,
    'profile': profile,
    'budget': movie_budget,
    'plot': plot_synopsis_content,
    'summary': summary,
    'box_office': dom_gross,
    'int_office': int_gross,
    'runtime': runtime,
    'cast_members': cast_members,
    'cast_anchors': cast_anchors,
    'sequels': sequels,
    'pro_comp': production_companies,
    'directors': directors,
    'director_anchors': director_anchors,
    'director_credits': director_credits,
    'cinematographer': cinematographer,
    'cin_anch': cinematographer_anchor,
    'musician': musician,
    'musician_anchor': musician_anchor,
    'prod_designer': prod_designer,
    'prod_designer_anchor': prod_designer_anchor,
    'costume_designer': costume_designer,
    'costume_designer_anchor': costume_designer_anchor,
    'sequel': sequels,
    'screen_writers': screen_writers,
    'screen_anchors': screen_anchors,
    'screen_credits': screen_credits,
    'release': release_date,
    'genre': movie_genre,
    'spec_eff': special_effects,
    'distributor': distributor,
    'mpaa': mpaa,
    'sound_mix': sound_mix,
    'producers': producers,
    'prod_anchors': prod_anchors,
    'prod_credits': prod_credits,
    'stars': stars,
    'star_anchs': star_anchs,
    'cast_count': full_cast_dict,
    'comp_count': full_comp_dict,
    'version_count': version_count,
    'references_count': references_count,
    'languages': languages
  }


def scrapeMovies(links, path, thread):
  """
    Objective
    ----------
    Crawl data in a timely manner from IMDb. Every listing page in `links` is
    fetched, each movie card on it is scraped (title page, plot summary, full
    credits, connections and company credits) and the row is written to
    "{path}full_{thread}.csv" as soon as it is complete, so an interrupted
    run keeps everything scraped so far.

    Parameters
    ----------
    links : list
      specific URLs of web pages to crawl.
    path : string
      where to save the file.
    thread : string
      unique thread number.

    Returns
    ----------
    pandas.DataFrame with one row per scraped movie (empty if none was found).
    Fields that could not be scraped hold NaN (0 for the two connection counts).
  """
  out_file = f"{path}full_{thread}.csv"
  # Create / truncate the output file before the first request is made.
  pd.DataFrame().to_csv(out_file)
  records = []
  # Progress counter; it advances once per listing page and once per movie.
  count = 0
  for link in links:
    count += 1
    page_html = _fetch_html(link)

    # select all movie containers from a single page
    mv_containers = page_html.find_all('div', class_ = 'lister-item mode-advanced')

    for container in mv_containers:
      count += 1
      record = _scrape_movie(container, count)
      clear_output(wait = True)
      records.append(record)
      # The first row replaces the placeholder file and writes the header;
      # later rows are appended instead of rewriting the whole file.
      first_row = len(records) == 1
      pd.DataFrame([record]).to_csv(out_file, mode='w' if first_row else 'a',
                                    header=first_row, index=False)
  return pd.DataFrame(records)

In [4]:
# Load the listing-page URLs collected earlier. The DataFrame gets its own name
# so that `links` only ever refers to the final list of URLs.
links_raw = pd.read_csv("../../data/links.csv")
# De-duplicate while keeping first-seen order (dict keys preserve insertion order).
links = list(dict.fromkeys(links_raw['link'].values))
# Add the first results page of the search at the front of the list.
links.insert(0, 'https://www.imdb.com/search/title/?title_type=feature&sort=boxoffice_gross_us,desc&count=250&ref_=adv_prv')

In [5]:
def thread(thread):
  """
    Objective
    ----------
    Return parameters for scrapeMovies function: the slice of the global
    `links` list assigned to one of six workers, the output directory and the
    worker name itself.

    Parameters
    ----------
    thread : string
      unique thread number, one of 'one', 'two', 'three', 'four', 'five', 'six'.

    Returns
    ----------
    (result, path, thread) : the worker's share of `links`, the data
    directory "../../data/" and the `thread` argument unchanged.

    Raises
    ----------
    ValueError
      if `thread` is not one of the six known names.
  """
  path = "../../data/"
  names = ('one', 'two', 'three', 'four', 'five', 'six')
  if thread not in names:
    raise ValueError(f"unknown thread {thread!r}; expected one of {names}")
  position = names.index(thread)
  thread_length = len(links) // len(names)
  start = position * thread_length
  # The last worker also takes the remainder, so links beyond
  # 6 * thread_length are not silently dropped.
  stop = None if position == len(names) - 1 else start + thread_length
  result = links[start:stop]
  return result, path, thread

In [None]:
# Bind the results to fresh names: reusing `links` / `thread` would overwrite
# the full link list and the thread() helper, so this cell could not be re-run
# and no other thread could be started from the same kernel.
thread_links, path, thread_name = thread('one')
scrapeMovies(thread_links, path, thread_name)

[32mBeyond the Reach(12702)
[31mPlot Synopsis
[32mSummary
[32mLanguages
[32mBox Office Gross: $45895
[32mWorldWide Box Office Gross: $1100432
[32mRuntime : 91 min
[32mSoundmix
[32mStars
[32mCast
Anchors: ['/name/nm1515266/']
[32mCinematographer
[32mMusician
[32mProduction Designer
[32mSet Decorator
[32mCostume Designer
[31mSequel
[32mProduction COs
[32mScreen Writers
[32mProducers
[31mBudget
[32mRelease:  17 April 2015
[32mGenre: Thriller
[32mSpecial Effects COs
[32mMPAA: R
[32mDistributor
[32mCast Count
[32mCompany Count
[31mVersion
[31mReferences
