In [0]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
from google.colab import files

In [0]:
#Function to scrape urls
def get_urls(pages): #pages as int
  """Gather the movie detail-page links found on the first `pages` listing pages.

  Listing pages are requested in order, starting at page 1. A page whose
  request does not come back OK is skipped. Every <a class="title"> element
  that carries an href contributes one absolute URL to the result.

  pages: number of listing pages to walk (int); 0 or less yields an empty list.
  Returns: list of URL strings, in the order they were encountered.
  """
  listing_prefix = "https://flixable.com/genre/movies/?min-rating=0&min-year=0&max-year=3000&order=date&page="
  site_root = "http://www.flixable.com"
  found = []
  for page_number in range(1, pages + 1):
    listing = requests.get(listing_prefix + str(page_number))
    if not listing.ok:
      continue #Skip listing pages that failed to load
    anchors = BeautifulSoup(listing.text).find_all("a", {"class": "title"}, href=True)
    #hrefs are site-relative, so prefix them with the site root
    found.extend(site_root + anchor["href"] for anchor in anchors)
  return found

Insert your OMDb API key (from omdbapi.com) in the following cell:

In [0]:
#Function to get extra info
def get_extra_info(title, year): #title as string, year as string
  """Look up a movie on omdbapi.com by title and year.

  The API key is read from the OMDB_API_KEY environment variable, so it never
  has to be written into the notebook. Set it before calling, e.g.
  os.environ["OMDB_API_KEY"] = "<your key>".

  title: movie title (string).
  year: release year (string).
  Returns: the decoded JSON response when the request succeeds, otherwise None.
  Raises: RuntimeError when OMDB_API_KEY is not set.
  """
  import os #Local import: keeps this cell independent of the notebook's import cell

  apikey = os.environ.get("OMDB_API_KEY", "")
  if not apikey:
    raise RuntimeError("OMDb API key missing: set the OMDB_API_KEY environment variable")

  #Let requests build the query string so characters such as "&", "#" or "?"
  #inside a title are URL-encoded instead of corrupting the query
  imdb_response = requests.get("http://www.omdbapi.com/",
                               params={"t": title, "y": year, "apikey": apikey})
  if imdb_response.ok:
    return imdb_response.json()
  return None

In [0]:
#Function to scrape
#Column order of the DataFrame returned by get_info
_INFO_COLUMNS = ["Title", "Year", "Plot", "MPAA_Rating", "Genres", "Production_Country",
                 "IMDB Ratings", "Added_to_Netflix", "Released_Date", "Runtime", "Director",
                 "Writer", "Actors", "Language", "Awards", "Rotten_Tomatoes_Rating",
                 "Metacritic_Rating", "Production", "Type"]

#Output column -> key holding that value in the OMDb response
_OMDB_KEYS = {"Plot": "Plot",
              "Genres": "Genre",
              "Production_Country": "Country",
              "Released_Date": "Released",
              "Runtime": "Runtime",
              "Director": "Director",
              "Writer": "Writer",
              "Actors": "Actors",
              "Language": "Language",
              "Awards": "Awards",
              "Production": "Production",
              "Type": "Type"}

#Output column -> "Source" label of that rating inside the OMDb "Ratings" list
_OMDB_RATING_SOURCES = {"IMDB Ratings": "Internet Movie Database",
                        "Rotten_Tomatoes_Rating": "Rotten Tomatoes",
                        "Metacritic_Rating": "Metacritic"}


def _first_text(soup, tag, css_class):
  """Return the text of the first <tag class=css_class> element, or None if the page has none."""
  element = soup.find(tag, {"class": css_class})
  return element.get_text() if element is not None else None


def _omdb_fields(extra_info):
  """Map an OMDb response onto the output columns, using NaN for anything that is missing."""
  #get_extra_info returns None when the lookup request failed
  info = extra_info if isinstance(extra_info, dict) else {}
  fields = {column: info.get(key, np.nan) for column, key in _OMDB_KEYS.items()}

  #Ratings are matched by their source name, not by list position: OMDb simply
  #omits a source it has no score for, which shifts the positions of the others
  by_source = {}
  ratings = info.get("Ratings")
  if isinstance(ratings, list):
    for rating in ratings:
      if isinstance(rating, dict) and "Source" in rating and "Value" in rating:
        by_source.setdefault(rating["Source"], rating["Value"])
  for column, source in _OMDB_RATING_SOURCES.items():
    fields[column] = by_source.get(source, np.nan)
  return fields


def get_info(urls): #urls as list of links
  """Scrape every movie page in `urls` and enrich it with data from omdbapi.com.

  Title, year, MPAA rating and the "added to Netflix" date are read from the
  movie page itself; all other columns come from get_extra_info. A value that
  cannot be found is stored as NaN. A page whose request does not come back OK
  produces no row at all.

  urls: list of movie page links, e.g. the result of get_urls.
  Returns: pandas DataFrame with one row per successfully fetched page and the
  columns listed in _INFO_COLUMNS (also present when there are no rows).
  """
  records = []
  total = len(urls)
  #enumerate keeps the progress counter correct even if a link appears twice
  for position, link in enumerate(urls, start=1):
    print("Movie: {}/{}".format(position, total))
    response = requests.get(link)
    if not response.ok:
      continue
    soup = BeautifulSoup(response.text)

    #Values scraped from the page; each is None when its element is absent
    title = _first_text(soup, "h1", "mb-3")
    if title is not None:
      title = title.strip()
    year = _first_text(soup, "span", "font-weight-bold mr-2")
    mpaa_rating = _first_text(soup, "span", "border border-dark rounded font-weight-bold px-1 mr-2")

    #Added to Netflix: the date is whatever follows the first ":" in the block
    added_to_netflix = None
    added_block = _first_text(soup, "div", "mb-4")
    if added_block is not None:
      pieces = added_block.split(":")
      if len(pieces) > 1:
        added_to_netflix = pieces[1].replace("\n", "")

    #Extra info using omdbapi.com (needs both title and year to search)
    extra_info = None
    if title is not None and year is not None:
      extra_info = get_extra_info(title, year)

    record = {"Title": title,
              "Year": year,
              "MPAA_Rating": mpaa_rating,
              "Added_to_Netflix": added_to_netflix}
    #Missing page values become NaN, like missing OMDb values
    record = {column: (np.nan if value is None else value) for column, value in record.items()}
    record.update(_omdb_fields(extra_info))
    records.append(record)

  #Create Dataframe
  return pd.DataFrame(records, columns=_INFO_COLUMNS)

Usage: set `pages` to the number of listing pages to scrape; the scraped data is stored in `df`.

In [159]:
%%time
#Run the full scrape: collect the movie links first, then visit each one.
#The %%time cell magic reports how long the whole cell took.
pages = 2 #Number of pages to scrape
urls = get_urls(pages) #Movie page links gathered from listing pages 1..pages
df = get_info(urls) #One row per movie page whose request succeeded

Movie: 1/80
Movie: 2/80
Movie: 3/80
Movie: 4/80
Movie: 5/80
Movie: 6/80
Movie: 7/80
Movie: 8/80
Movie: 9/80
Movie: 10/80
Movie: 11/80
Movie: 12/80
Movie: 13/80
Movie: 14/80
Movie: 15/80
Movie: 16/80
Movie: 17/80
Movie: 18/80
Movie: 19/80
Movie: 20/80
Movie: 21/80
Movie: 22/80
Movie: 23/80
Movie: 24/80
Movie: 25/80
Movie: 26/80
Movie: 27/80
Movie: 28/80
Movie: 29/80
Movie: 30/80
Movie: 31/80
Movie: 32/80
Movie: 33/80
Movie: 34/80
Movie: 35/80
Movie: 36/80
Movie: 37/80
Movie: 38/80
Movie: 39/80
Movie: 40/80
Movie: 41/80
Movie: 42/80
Movie: 43/80
Movie: 44/80
Movie: 45/80
Movie: 46/80
Movie: 47/80
Movie: 48/80
Movie: 49/80
Movie: 50/80
Movie: 51/80
Movie: 52/80
Movie: 53/80
Movie: 54/80
Movie: 55/80
Movie: 56/80
Movie: 57/80
Movie: 58/80
Movie: 59/80
Movie: 60/80
Movie: 61/80
Movie: 62/80
Movie: 63/80
Movie: 64/80
Movie: 65/80
Movie: 66/80
Movie: 67/80
Movie: 68/80
Movie: 69/80
Movie: 70/80
Movie: 71/80
Movie: 72/80
Movie: 73/80
Movie: 74/80
Movie: 75/80
Movie: 76/80
Movie: 77/80
Movie: 7

Overview of df

In [160]:
#Show the first 10 rows of the scraped table as this cell's output
df.head(10)

Unnamed: 0,Title,Year,Plot,MPAA_Rating,Genres,Production_Country,IMDB Ratings,Added_to_Netflix,Released_Date,Runtime,Director,Writer,Actors,Language,Awards,Rotten_Tomatoes_Rating,Metacritic_Rating,Production,Type
0,Fortune Feimster: Sweet & Salty,2020,,TV-MA,Comedy,USA,,"January 21, 2020",21 Jan 2020,61 min,Krysia Plonka,,Fortune Feimster,English,,,,,movie
1,KD (A) Karuppudurai,2019,,TV-14,,,,"January 21, 2020",,,,,,,,,,,
2,Motichoor Chaknachoor,2019,A hilarious story of a 36-year-old jobless man...,TV-14,"Comedy, Romance",India,5.8/10,"January 20, 2020",15 Nov 2019,150 min,Debamitra Biswal,Meghvrat Singh Gurjar,"Vibha Chhibber, Bhumika Dube, Devansh Kumar, U...",,,,,,movie
3,WHAT DID JACK DO?,2020,,TV-14,,,,"January 20, 2020",,,,,,,,,,,
4,The Bling Ring,2013,"Inspired by actual events, a group of fame-obs...",R,"Biography, Crime, Drama","USA, UK, France, Germany, Japan",5.6/10,"January 18, 2020",21 Jun 2013,90 min,Sofia Coppola,"Sofia Coppola, Nancy Jo Sales (based on the Va...","Katie Chang, Israel Broussard, Emma Watson, Cl...",English,4 wins & 7 nominations.,59%,66/100,A24 Films,movie
5,A Fall from Grace,2020,"Disheartened since her ex-husband's affair, Gr...",TV-MA,Thriller,,,"January 17, 2020",17 Jan 2020,120 min,Tyler Perry,Tyler Perry,"Tyler Perry, Adrian Pasdar, Cicely Tyson, Mehc...",,,,,,movie
6,Deadcon,2019,The horrors and isolation of being a social me...,TV-MA,Horror,USA,2.8/10,"January 16, 2020",15 Jun 2019,78 min,Caryn Waechter,Scotty Landes,"Emma Barrett, Kherrington Briggs, Mai Brunelle...",English,,,,,movie
7,Get Him to the Greek,2010,A record company intern is hired to accompany ...,R,"Comedy, Music",USA,6.4/10,"January 16, 2020",04 Jun 2010,109 min,Nicholas Stoller,"Nicholas Stoller, Jason Segel (characters)","Russell Brand, Rose Byrne, Tyler McKinney, Zoe...",English,14 nominations.,72%,65/100,Universal Pictures,movie
8,Hop,2011,"E.B., the Easter Bunny's teenage son, heads to...",PG,"Animation, Adventure, Comedy, Family, Fantasy","USA, India, Malaysia, Taiwan, Canada, UK",5.4/10,"January 16, 2020",01 Apr 2011,95 min,Tim Hill,"Cinco Paul (screenplay), Ken Daurio (screenpla...","James Marsden, Russell Brand, Kaley Cuoco, Han...","English, Chinese, Spanish, French, Ukrainian",2 nominations.,25%,41/100,Universal Pictures,movie
9,Jezebel,2019,"In the last days of her mother's life, 19 year...",TV-MA,Drama,USA,7.5/10,"January 16, 2020",09 Mar 2019,,Numa Perrier,Numa Perrier,"Tiffany Tenille, Numa Perrier, Brett Gelman, S...",English,,100%,,House of Numa,movie


Export df to csv

In [0]:
#Write the scraped table to movie_data.csv as UTF-8, leaving out the index column
df.to_csv("movie_data.csv", encoding='utf-8', index=False)
#`files` comes from google.colab; this hands the file to the browser for download,
#so it presumably only works when the notebook runs inside Colab
files.download("movie_data.csv")