In [2]:
import pandas as pd
from tmdbv3api import TMDb
from tmdbv3api import Movie
import numpy as np
import requests

Extract films released from 2018 to 2023 from wikipedia

In [3]:

years = [2018, 2019, 2020, 2021, 2022, 2023]
df = pd.DataFrame()
for year in years:

    url = f'https://en.wikipedia.org/wiki/List_of_American_films_of_{year}'
    tables = pd.read_html(url)

    # we are interested in 3 to 6 tables they contain the information we need
    for i in range(3, 6):
        tables[i] = tables[i][['Title', 'Cast and crew']]
        df = pd.concat([df, tables[i]], ignore_index=True)

In [4]:
df

Unnamed: 0,Title,Cast and crew
0,A Quiet Place,John Krasinski (director/screenplay); Scott Be...
1,Blockers,"Kay Cannon (director); Eben Russell, Jon Hurwi..."
2,You Were Never Really Here,Lynne Ramsay (director/screenplay); Joaquin Ph...
3,Chappaquiddick,"John Curran (director); Taylor Allen, Andrew L..."
4,Pandas,"Drew Fellman, David Douglas (directors); Drew ..."
...,...,...
1406,Fair Play,Chloe Domont (director/screenplay); Phoebe Dyn...
1407,Heist 88,Menhaj Huda (director); Dwayne Johnson-Cochran...
1408,The Re-Education of Molly Singer,"Andy Palmer (director); Todd Friedman, Kevin H..."
1409,Deliver Us,"Lee Roy Kunz, Cru Ennis (directors); Lee Roy K..."


In [5]:
df.isna().sum()

Title            6
Cast and crew    6
dtype: int64

In [6]:
df.dropna(how='any', inplace=True)

In [7]:
API_KEY = ''
tmdb = TMDb()
tmdb.api_key = API_KEY
movie = Movie()

def get_genres(name):
    try:
        search = movie.search(name)
        film_id = search[0].id
        url = f"https://api.themoviedb.org/3/movie/{film_id}?api_key={API_KEY}"
        res = requests.get(url)
        data = res.json()

        if data.get('genres', None):
            genres = []
            for i in data['genres']:
                genres.append(i['name'])
            return " ".join(genres)
        
        return np.NaN
    except:
        return np.NaN

In [8]:
df['genres'] = df['Title'].apply(lambda x: get_genres(str(x)))

In [9]:
df.head()

Unnamed: 0,Title,Cast and crew,genres
0,A Quiet Place,John Krasinski (director/screenplay); Scott Be...,Horror Drama Science Fiction
1,Blockers,"Kay Cannon (director); Eben Russell, Jon Hurwi...",Comedy
2,You Were Never Really Here,Lynne Ramsay (director/screenplay); Joaquin Ph...,Crime Drama Thriller
3,Chappaquiddick,"John Curran (director); Taylor Allen, Andrew L...",History Drama Thriller
4,Pandas,"Drew Fellman, David Douglas (directors); Drew ...",Documentary


In [10]:
df['Cast and crew'].iloc[0]

'John Krasinski (director/screenplay); Scott Beck, Bryan Woods (screenplay); John Krasinski, Emily Blunt, Millicent Simmonds, Noah Jupe'

In [11]:
def get_director(x):
    if '(director/screenplay)' in x:
        return x.split(' (director/screenplay)')[0]
    elif '(director)' in x:
        return x.split(' (director)')[0]
    else:
        return x.split(' (directors)')[0]

In [12]:
df['director_name'] = df['Cast and crew'].apply(get_director)

In [13]:
def get_actor_by_num(x, num):
    actors = x.split('; ')[-1].split(', ')
    if len(actors) < num:
        return np.NaN
    return actors[num - 1]

In [14]:
df['actor_1_name'] = df['Cast and crew'].apply(lambda x: get_actor_by_num(x, 1))
df['actor_2_name'] = df['Cast and crew'].apply(lambda x: get_actor_by_num(x, 2))
df['actor_3_name'] = df['Cast and crew'].apply(lambda x: get_actor_by_num(x, 3))

In [17]:
df['Cast and crew'].iloc[4]

'Drew Fellman, David Douglas (directors); Drew Fellman (screenplay); Kristen Bell'

In [18]:
df = df.rename(columns={'Title':'movie_title'})

In [19]:
df

Unnamed: 0,movie_title,Cast and crew,genres,director_name,actor_1_name,actor_2_name,actor_3_name
0,A Quiet Place,John Krasinski (director/screenplay); Scott Be...,Horror Drama Science Fiction,John Krasinski,John Krasinski,Emily Blunt,Millicent Simmonds
1,Blockers,"Kay Cannon (director); Eben Russell, Jon Hurwi...",Comedy,Kay Cannon,Leslie Mann,Ike Barinholtz,John Cena
2,You Were Never Really Here,Lynne Ramsay (director/screenplay); Joaquin Ph...,Crime Drama Thriller,Lynne Ramsay,Joaquin Phoenix,Ekaterina Samsonov,Alex Manette
3,Chappaquiddick,"John Curran (director); Taylor Allen, Andrew L...",History Drama Thriller,John Curran,Jason Clarke,Kate Mara,Ed Helms
4,Pandas,"Drew Fellman, David Douglas (directors); Drew ...",Documentary,"Drew Fellman, David Douglas",Kristen Bell,,
...,...,...,...,...,...,...,...
1405,Saw X,"Kevin Greutert (director); Josh Stolberg, Pete...",Crime Horror Thriller,Kevin Greutert,Tobin Bell,Shawnee Smith,Synnøve Macody Lund
1406,Fair Play,Chloe Domont (director/screenplay); Phoebe Dyn...,Drama Thriller,Chloe Domont,Phoebe Dynevor,Alden Ehrenreich,Eddie Marsan
1407,Heist 88,Menhaj Huda (director); Dwayne Johnson-Cochran...,Crime Drama,Menhaj Huda,Courtney B. Vance,Keesha Sharp,Keith David
1408,The Re-Education of Molly Singer,"Andy Palmer (director); Todd Friedman, Kevin H...",Comedy,Andy Palmer,Britt Robertson,Ty Simpkins,Nico Santos


In [23]:
df.columns

Index(['movie_title', 'Cast and crew', 'genres', 'director_name',
       'actor_1_name', 'actor_2_name', 'actor_3_name'],
      dtype='object')

In [24]:
df = df[['movie_title', 'genres', 'director_name','actor_1_name', 'actor_2_name', 'actor_3_name']]

In [25]:
old = pd.read_csv('../data/new_data.csv')

In [26]:
old

Unnamed: 0,movie_title,director_name,actor_1_name,actor_2_name,actor_3_name,genres
0,Avatar,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi
1,Pirates of the Caribbean: At World's End,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy
2,Spectre,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller
3,The Dark Knight Rises,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller
4,Star Wars: Episode VII - The Force Awakens ...,Doug Walker,Doug Walker,Rob Walker,,Documentary
...,...,...,...,...,...,...
5569,thick lashes of lauri mäntyvaara,Hannaleena Hauru,Inka Haapamäki,Rosa Honkonen,Tiitus Rantala,Romance Comedy
5570,cop and a half: new recruit,Jonathan A. Rosenbaum,Lou Diamond Phillips,Wallace Shawn,Gina Holden,Crime Comedy Action Family
5571,in a heartbeat,Beth David Esteban Bravo,,,,Family Animation Romance Comedy
5572,mom,Ravi Udyawar,Sridevi Kapoor,Sajal Ali,Akshaye Khanna,Crime Drama Thriller


In [27]:
new = pd.concat([old, df], ignore_index=True)

In [28]:
new.shape

(6979, 6)

In [29]:
new.to_csv('../data/final.csv', index=False)