In [1]:
import os

import numpy as np
import pandas as pd
import requests
from tmdbv3api import Movie
from tmdbv3api import TMDb

In [2]:
# Load the movie metadata CSV into a DataFrame.
df = pd.read_csv('../data/final.csv')

In [3]:
# Display the freshly loaded frame for a quick visual check.
df

Unnamed: 0,movie_title,director_name,actor_1_name,actor_2_name,actor_3_name,genres
0,Avatar,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi
1,Pirates of the Caribbean: At World's End,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy
2,Spectre,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller
3,The Dark Knight Rises,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller
4,Star Wars: Episode VII - The Force Awakens ...,Doug Walker,Doug Walker,Rob Walker,,Documentary
...,...,...,...,...,...,...
6970,Saw X,Kevin Greutert,Tobin Bell,Shawnee Smith,Synnøve Macody Lund,Crime Horror Thriller
6971,Fair Play,Chloe Domont,Phoebe Dynevor,Alden Ehrenreich,Eddie Marsan,Drama Thriller
6972,Heist 88,Menhaj Huda,Courtney B. Vance,Keesha Sharp,Keith David,Crime Drama
6973,The Re-Education of Molly Singer,Andy Palmer,Britt Robertson,Ty Simpkins,Nico Santos,Comedy


In [4]:
# Trim leading/trailing whitespace from every title.
df = df.assign(movie_title=df['movie_title'].str.strip())

In [5]:
# Count rows that exactly repeat an earlier row (all columns compared).
df.duplicated().sum()

124

In [6]:
# Keep a single row per title; when a title repeats, the last occurrence wins.
df = df.drop_duplicates(subset='movie_title', keep='last')

In [7]:
# Verify the de-duplication: count rows whose movie_title repeats an earlier
# row. The drop above keyed on movie_title, so the check uses the same key
# instead of comparing whole rows.
df.duplicated(subset='movie_title').sum()

0

In [8]:
# Use the short genre label 'Sci-Fi' everywhere (plain-text replacement).
df['genres'] = df['genres'].str.replace('Science Fiction', 'Sci-Fi', regex=False)

In [9]:
# Count missing values per column.
df.isna().sum()

movie_title        0
director_name    102
actor_1_name      29
actor_2_name      99
actor_3_name     215
genres             8
dtype: int64

#### genres

In [10]:
# Prefer a key supplied through the TMDB_API_KEY environment variable so the
# credential does not have to live in the notebook.
# FIXME: the literal fallback only keeps existing runs working; this key has
# been committed in plain text and should be rotated and removed from source.
API_KEY = os.environ.get('TMDB_API_KEY') or '4c225c866aa84f4ef450043654ec9e56'
tmdb = TMDb()
tmdb.api_key = API_KEY
movie = Movie()


def get_genres(x):
    """Look up a movie's genres on TMDB by title.

    Searches for ``x``, takes the first search hit, fetches that movie's
    detail record and joins its genre names with single spaces.

    Parameters
    ----------
    x : str
        Movie title to search for.

    Returns
    -------
    str or float
        Space-separated genre names, or ``np.nan`` when the search has no
        hits, the response lists no genres, or the lookup fails.
    """
    try:
        search = movie.search(x)
        film_id = search[0].id  # only the first hit is considered

        url = f'https://api.themoviedb.org/3/movie/{film_id}?api_key={API_KEY}'
        # A timeout keeps one stalled request from hanging the whole apply().
        res = requests.get(url, timeout=10)
        data = res.json()

        names = [genre['name'] for genre in (data.get('genres') or [])]
    except Exception:
        # Best-effort lookup: a failed search/request leaves the value missing
        # instead of aborting the fill (KeyboardInterrupt still propagates).
        return np.nan

    if not names:
        return np.nan
    # The dataset uses 'Sci-Fi' rather than 'Science Fiction', and genres are
    # later split on spaces, so apply the same normalisation here.
    return " ".join(names).replace('Science Fiction', 'Sci-Fi')

In [11]:
# Fill only the rows whose genres are missing, looking each title up via get_genres.
genres_missing = df['genres'].isna()
df.loc[genres_missing, 'genres'] = df.loc[genres_missing, 'movie_title'].apply(get_genres)

In [12]:
# Re-count missing values per column after the genre back-fill.
df.isna().sum()

movie_title        0
director_name    102
actor_1_name      29
actor_2_name      99
actor_3_name     215
genres             0
dtype: int64

#### director_name

In [13]:
def get_directors(x):
    """Look up a movie's director on TMDB by title.

    Searches for ``x``, takes the first search hit, fetches that movie's
    credits and returns the name of the first crew member whose job is
    ``"Director"``.

    Parameters
    ----------
    x : str
        Movie title to search for.

    Returns
    -------
    str or float
        The director's name, or ``np.nan`` when no director is listed or the
        lookup fails.
    """
    try:
        search = movie.search(x)
        film_id = search[0].id  # only the first hit is considered
        url = f'https://api.themoviedb.org/3/movie/{film_id}/credits?api_key={API_KEY}'
        # A timeout keeps one stalled request from hanging the whole apply().
        res = requests.get(url, timeout=10)
        data = res.json()

        for person in data['crew']:
            if person.get('job') == "Director":
                return person['name']
    except Exception:
        # Best-effort lookup: any failure leaves the value missing. Unlike a
        # bare ``except:``, this still lets KeyboardInterrupt stop a long run.
        pass
    return np.nan

In [14]:
# Fill only the rows whose director is missing, looking each title up via get_directors.
director_missing = df['director_name'].isna()
df.loc[director_missing, 'director_name'] = df.loc[director_missing, 'movie_title'].apply(get_directors)

In [15]:
# Re-count missing values per column after the director back-fill.
df.isna().sum()

movie_title        0
director_name      9
actor_1_name      29
actor_2_name      99
actor_3_name     215
genres             0
dtype: int64

In [16]:
def get_actors_by_num(x, num):
    """Look up the ``num``-th cast member of a movie on TMDB by title.

    Searches for ``x``, takes the first search hit, fetches that movie's
    credits and returns the name at 1-based position ``num`` of the cast list.

    Parameters
    ----------
    x : str
        Movie title to search for.
    num : int
        1-based position in the cast list (1 is the first entry).

    Returns
    -------
    str or float
        The cast member's name, or ``np.nan`` when ``num`` is below 1, the
        cast list is shorter than ``num``, or the lookup fails.
    """
    # Positions below 1 would otherwise index from the end of the cast list
    # (e.g. num=0 -> cast[-1]) and silently return the wrong person.
    if num < 1:
        return np.nan
    try:
        search = movie.search(x)
        film_id = search[0].id  # only the first hit is considered
        url = f'https://api.themoviedb.org/3/movie/{film_id}/credits?api_key={API_KEY}'
        # A timeout keeps one stalled request from hanging the whole apply().
        res = requests.get(url, timeout=10)
        data = res.json()

        cast = data['cast']
        if len(cast) < num:
            return np.nan
        return cast[num - 1]['name']
    except Exception:
        # Best-effort lookup: any failure leaves the value missing. Unlike a
        # bare ``except:``, this still lets KeyboardInterrupt stop a long run.
        return np.nan

In [17]:
# Back-fill each actor column from TMDB; the column's number doubles as the
# 1-based position requested from the cast list.
for cast_position, actor_column in enumerate(('actor_1_name', 'actor_2_name', 'actor_3_name'), start=1):
    actor_missing = df[actor_column].isna()
    df.loc[actor_missing, actor_column] = df.loc[actor_missing, 'movie_title'].apply(
        get_actors_by_num, args=(cast_position,)
    )

In [18]:
# Re-count missing values per column after the actor back-fill.
df.isna().sum()

movie_title       0
director_name     9
actor_1_name     14
actor_2_name     49
actor_3_name     71
genres            0
dtype: int64

In [19]:
# Discard every row that still has at least one missing value.
df = df.dropna(how='any')

In [20]:
# Confirm that no missing values remain after dropping incomplete rows.
df.isna().sum()

movie_title      0
director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
genres           0
dtype: int64

In [21]:
# Remove commas from director names (plain-text replacement); the combined
# text field built later uses ', ' as its separator.
df['director_name'] = df['director_name'].str.replace(',', '', regex=False)

In [22]:
# Renumber rows 0..n-1 now that rows have been removed; the old index is discarded.
df = df.reset_index(drop=True)

In [23]:
# Display the cleaned frame.
df

Unnamed: 0,movie_title,director_name,actor_1_name,actor_2_name,actor_3_name,genres
0,Avatar,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi
1,Pirates of the Caribbean: At World's End,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy
2,Spectre,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller
3,The Dark Knight Rises,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller
4,Star Wars: Episode VII - The Force Awakens,Doug Walker,Doug Walker,Rob Walker,Carrie Fisher,Documentary
...,...,...,...,...,...,...
6713,Saw X,Kevin Greutert,Tobin Bell,Shawnee Smith,Synnøve Macody Lund,Crime Horror Thriller
6714,Fair Play,Chloe Domont,Phoebe Dynevor,Alden Ehrenreich,Eddie Marsan,Drama Thriller
6715,Heist 88,Menhaj Huda,Courtney B. Vance,Keesha Sharp,Keith David,Crime Drama
6716,The Re-Education of Molly Singer,Andy Palmer,Britt Robertson,Ty Simpkins,Nico Santos,Comedy


In [24]:
# Lower-case every title.
# NOTE(review): de-duplication ran before this step, so titles that differ
# only by letter case would now collide — confirm whether that can occur.
df = df.assign(movie_title=df['movie_title'].str.lower())

In [25]:
# Build one comma-separated text field per movie: the three actors, the
# director, then the genres (space-separated genres become comma-separated).
# NOTE(review): splitting genres on spaces would also split any multi-word
# genre name — confirm none are present.
genres_as_list = df['genres'].str.replace(' ', ', ')
df['comb'] = df['actor_1_name'].str.cat(
    [df['actor_2_name'], df['actor_3_name'], df['director_name'], genres_as_list],
    sep=', ',
)

In [26]:
# Display the frame with the new 'comb' column.
df

Unnamed: 0,movie_title,director_name,actor_1_name,actor_2_name,actor_3_name,genres,comb
0,avatar,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi,"CCH Pounder, Joel David Moore, Wes Studi, Jame..."
1,pirates of the caribbean: at world's end,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy,"Johnny Depp, Orlando Bloom, Jack Davenport, Go..."
2,spectre,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller,"Christoph Waltz, Rory Kinnear, Stephanie Sigma..."
3,the dark knight rises,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller,"Tom Hardy, Christian Bale, Joseph Gordon-Levit..."
4,star wars: episode vii - the force awakens,Doug Walker,Doug Walker,Rob Walker,Carrie Fisher,Documentary,"Doug Walker, Rob Walker, Carrie Fisher, Doug W..."
...,...,...,...,...,...,...,...
6713,saw x,Kevin Greutert,Tobin Bell,Shawnee Smith,Synnøve Macody Lund,Crime Horror Thriller,"Tobin Bell, Shawnee Smith, Synnøve Macody Lund..."
6714,fair play,Chloe Domont,Phoebe Dynevor,Alden Ehrenreich,Eddie Marsan,Drama Thriller,"Phoebe Dynevor, Alden Ehrenreich, Eddie Marsan..."
6715,heist 88,Menhaj Huda,Courtney B. Vance,Keesha Sharp,Keith David,Crime Drama,"Courtney B. Vance, Keesha Sharp, Keith David, ..."
6716,the re-education of molly singer,Andy Palmer,Britt Robertson,Ty Simpkins,Nico Santos,Comedy,"Britt Robertson, Ty Simpkins, Nico Santos, And..."


In [27]:
# Save the processed frame to CSV; index=False keeps the row index out of the file.
df.to_csv('../data/final_data.csv', index=False)