## For movies released in the year 2020
#### Using beautiful soup to fetch data from article of wikipedia

In [1]:
import pandas as pd
import numpy as np
import requests
import bs4 as bs
import urllib.request

In [2]:
link = "https://en.wikipedia.org/wiki/List_of_American_films_of_2020"

In [3]:
source = urllib.request.urlopen(link).read()
soup = bs.BeautifulSoup(source,'lxml')

In [4]:
tables = soup.find_all('table',class_='wikitable sortable')

In [5]:
print(f" total tables: {len(tables)} and their types: {type(tables[0])}")

 total tables: 4 and their types: <class 'bs4.element.Tag'>


In [6]:
df1 = pd.read_html(str(tables[0]))[0]
df2 = pd.read_html(str(tables[1]))[0]
df3 = pd.read_html(str(tables[2]))[0]
df4 = pd.read_html(str(tables[3]).replace("'1\"\'",'"1"'))[0] # avoided "ValueError: invalid literal for int() with base 10: '1"'

In [7]:
df = df1.append(df2.append(df3.append(df4,ignore_index=True),ignore_index=True),ignore_index=True)

In [8]:
df.tail()

Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,Ref.
267,DECEMBER,25,We Can Be Heroes,Netflix / Troublemaker Studios,Robert Rodriguez (director/screenplay); Priyan...,[240]
268,DECEMBER,25,News of the World,Universal Pictures / Perfect World Pictures,Paul Greengrass (director/screenplay); Luke Da...,[241]
269,DECEMBER,25,One Night in Miami...,Amazon Studios,Regina King (director); Kemp Powers (screenpla...,[242]
270,DECEMBER,25,Promising Young Woman,Focus Features / FilmNation Entertainment,Emerald Fennell (director/screenplay); Carey M...,[243]
271,DECEMBER,30,Pieces of a Woman,Netflix / Bron Studios,Kornél Mundruczó (director); Kata Wéber (scree...,[244]


In [9]:
df.shape

(272, 6)

In [10]:
df.columns

Index(['Opening', 'Opening.1', 'Title', 'Production company', 'Cast and crew',
       'Ref.'],
      dtype='object')

In [11]:
df.drop(["Opening","Opening.1","Production company","Ref."], axis=1, inplace =True)

#### Making call to TMDB API for genres

In [12]:
from tmdbv3api import TMDb
import json
import requests
tmdb = TMDb()
tmdb.api_key = "fc671a9683da1e18060546d03e4e88bd"

In [16]:
from tmdbv3api import Movie
tmdb_movie = Movie() 
def genre(x):
    genres = []
    result = tmdb_movie.search(x)
    if not result:
      return np.NaN
    else:
      movie_id = result[0].id
      response = requests.get('https://api.themoviedb.org/3/movie/{}?api_key={}'.format(movie_id,tmdb.api_key))
      data_json = response.json()
      if data_json['genres']:
          genre_str = " " 
          for i in range(0,len(data_json['genres'])):
              genres.append(data_json['genres'][i]['name'])
          return genre_str.join(genres)
      else:
          return np.NaN

In [17]:
df["genres"] = df["Title"].map(lambda x: genre(x))

In [18]:
df.head()

Unnamed: 0,Title,Cast and crew,genres
0,The Grudge,Nicolas Pesce (director/screenplay); Andrea Ri...,Horror Mystery Thriller
1,Underwater,"William Eubank (director); Brian Duffield, Ada...",Action Horror Science Fiction Thriller
2,Like a Boss,"Miguel Arteta (director); Sam Pitman, Adam Col...",Comedy
3,Three Christs,Jon Avnet (director/screenplay); Eric Nazarian...,Drama
4,Inherit the Viper,Anthony Jerjen (director); Andrew Crabtree (sc...,Drama Thriller Crime


In [19]:
x = df["Cast and crew"][0]
print(x)

Nicolas Pesce (director/screenplay); Andrea Riseborough, Demián Bichir, John Cho, Betty Gilpin, Lin Shaye, Jacki Weaver


In [20]:
def director(x):
    if " (director)" in x:
        return x.split(" (director)")[0]
    elif " (directors)" in x:
        return x.split(" (directors)")[0]
    else:
        return x.split(" (director/screenplay)")[0]

In [21]:
df["director_name"] = df["Cast and crew"].map(lambda x: director(x))

In [22]:
df.head()

Unnamed: 0,Title,Cast and crew,genres,director_name
0,The Grudge,Nicolas Pesce (director/screenplay); Andrea Ri...,Horror Mystery Thriller,Nicolas Pesce
1,Underwater,"William Eubank (director); Brian Duffield, Ada...",Action Horror Science Fiction Thriller,William Eubank
2,Like a Boss,"Miguel Arteta (director); Sam Pitman, Adam Col...",Comedy,Miguel Arteta
3,Three Christs,Jon Avnet (director/screenplay); Eric Nazarian...,Drama,Jon Avnet
4,Inherit the Viper,Anthony Jerjen (director); Andrew Crabtree (sc...,Drama Thriller Crime,Anthony Jerjen


In [23]:
x = df["Cast and crew"][0]
print(x)

Nicolas Pesce (director/screenplay); Andrea Riseborough, Demián Bichir, John Cho, Betty Gilpin, Lin Shaye, Jacki Weaver


In [24]:
def actors(x,num):
    casts = (x.split("screenplay); ")[-1]).split(", ")
    if casts == []:
        return np.NaN
    if num == 1:
        return (casts[0])
    elif num ==2:
        if len(casts)<=1:
            return np.NaN
        return (casts[1])
    elif num ==3:
        if  len(casts)<=2:
            return np.NaN
        else:
            return (casts[2])


In [25]:
df["actor_1_name"] = df["Cast and crew"].map(lambda x: actors(x,1))
df["actor_2_name"] = df["Cast and crew"].map(lambda x: actors(x,2))
df["actor_3_name"] = df["Cast and crew"].map(lambda x: actors(x,3))

In [26]:
df.drop("Cast and crew", axis=1, inplace =True)

In [27]:
df.tail(10)

Unnamed: 0,Title,genres,director_name,actor_1_name,actor_2_name,actor_3_name
262,Hunter Hunter,Horror Thriller,Shawn Linden,Devon Sawa,Camille Sullivan,Nick Stahl
263,"Ariana Grande: Excuse Me, I Love You",Music Documentary,Paul Dugdale,Paul Dugdale (director); Ariana Grande,,
264,Sylvie's Love,Drama Romance,Eugene Ashe,Tessa Thompson,Nnamdi Asomugha,Ryan Michelle Bathe
265,Wonder Woman 1984,Fantasy Action Adventure,Patty Jenkins,Gal Gadot,Chris Pine,Kristen Wiig
266,Soul,Family Animation Comedy Drama Music Fantasy,Pete Docter,Jamie Foxx,Tina Fey,Graham Norton
267,We Can Be Heroes,Action Fantasy Family Comedy,Robert Rodriguez,Priyanka Chopra Jonas,Pedro Pascal,YaYa Gosselin
268,News of the World,Action Adventure Drama Western,Paul Greengrass,Tom Hanks,Helena Zengel,
269,One Night in Miami...,Drama,Regina King,Kingsley Ben-Adir,Eli Goree,Aldis Hodge
270,Promising Young Woman,Thriller Crime Drama,Emerald Fennell,Carey Mulligan,Bo Burnham,Alison Brie
271,Pieces of a Woman,Drama,Kornél Mundruczó,Vanessa Kirby,Shia LaBeouf,Molly Parker


In [28]:
reqAttr = pd.read_csv("Dataset/reqAttr.csv")
reqAttr.head()

Unnamed: 0,movie_title,genres,director_name,actor_1_name,actor_2_name,actor_3_name
0,avatar,Action Adventure Fantasy Sci-Fi,James Cameron,CCH Pounder,Joel David Moore,Wes Studi
1,pirates of the caribbean: at world's end,Action Adventure Fantasy,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport
2,spectre,Action Adventure Thriller,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman
3,the dark knight rises,Action Thriller,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt
4,star wars: episode vii - the force awakens ...,Documentary,Doug Walker,Doug Walker,Rob Walker,unknown


In [29]:
df = df.rename(columns= {"Title":"movie_title" })

In [30]:
df["movie_title"] = df["movie_title"].apply(lambda x: x.lower())

In [31]:
df.head()

Unnamed: 0,movie_title,genres,director_name,actor_1_name,actor_2_name,actor_3_name
0,the grudge,Horror Mystery Thriller,Nicolas Pesce,Andrea Riseborough,Demián Bichir,John Cho
1,underwater,Action Horror Science Fiction Thriller,William Eubank,Kristen Stewart,Vincent Cassel,Jessica Henwick
2,like a boss,Comedy,Miguel Arteta,Tiffany Haddish,Rose Byrne,Salma Hayek
3,three christs,Drama,Jon Avnet,Richard Gere,Peter Dinklage,Walton Goggins
4,inherit the viper,Drama Thriller Crime,Anthony Jerjen,Josh Hartnett,Margarita Levieva,Chandler Riggs


In [32]:
reqAttr = reqAttr.append(df, ignore_index=True)

In [34]:
reqAttr.shape

(6698, 6)

In [33]:
for i in reqAttr.columns:
    print(f"{i} =|=====|= {reqAttr[i].isnull().sum()}")

movie_title =|=====|= 0
genres =|=====|= 1
director_name =|=====|= 0
actor_1_name =|=====|= 0
actor_2_name =|=====|= 4
actor_3_name =|=====|= 27


In [35]:
reqAttr = reqAttr.dropna(how='any')

In [36]:
for i in reqAttr.columns:
    print(f"{i} =|=====|= {reqAttr[i].isnull().sum()}")

movie_title =|=====|= 0
genres =|=====|= 0
director_name =|=====|= 0
actor_1_name =|=====|= 0
actor_2_name =|=====|= 0
actor_3_name =|=====|= 0


In [None]:
reqAttr.to_csv("Dataset/reqAttr.csv", index=False)

In [2]:
df = pd.read_csv("Dataset/reqAttr.csv")
df.head()

Unnamed: 0,movie_title,genres,director_name,actor_1_name,actor_2_name,actor_3_name,combined
0,avatar,Action Adventure Fantasy Sci-Fi,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi James Cameron ...
1,pirates of the caribbean: at world's end,Action Adventure Fantasy,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy Gore Verbinski Johnny...
2,spectre,Action Adventure Thriller,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller Sam Mendes Christoph...
3,the dark knight rises,Action Thriller,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller Christopher Nolan Tom Hardy Ch...
4,star wars: episode vii - the force awakens ...,Documentary,Doug Walker,Doug Walker,Rob Walker,unknown,Documentary Doug Walker Doug Walker Rob Walker...


In [11]:
print("avatar" == df["movie_title"][0])
print("avatar" in df["movie_title"])
print(len(df))

True
False
6670


In [58]:
df.shape

(6670, 6)

## Add another column conating director name, 3 actor name and genre name  

In [59]:
df['combined'] = df["genres"]+" "+df["director_name"]+" "+df["actor_1_name"]+" "+df["actor_2_name"]+" "+df["actor_3_name"]

In [60]:
df.head()

Unnamed: 0,movie_title,genres,director_name,actor_1_name,actor_2_name,actor_3_name,combined
0,avatar,Action Adventure Fantasy Sci-Fi,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi James Cameron ...
1,pirates of the caribbean: at world's end,Action Adventure Fantasy,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy Gore Verbinski Johnny...
2,spectre,Action Adventure Thriller,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller Sam Mendes Christoph...
3,the dark knight rises,Action Thriller,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller Christopher Nolan Tom Hardy Ch...
4,star wars: episode vii - the force awakens ...,Documentary,Doug Walker,Doug Walker,Rob Walker,unknown,Documentary Doug Walker Doug Walker Rob Walker...


In [61]:
df.to_csv("Dataset/reqAttr.csv", index=False)