In [27]:
import numpy as np
import urllib3
import json
import tmdbsimple as tmdb
import time
from itertools import combinations

In [28]:
# API key of TMDB
api_key = '5de45bb6ad138ff637656291961cbeeb'

# Filling the attributes
tmdb.API_KEY = api_key
search = tmdb.Search()

In [29]:
# Returns the information of movies
def get_movie_info(movie):
    
    # Getting the details of the movie in JSON format
    details = search.movie(query = movie)
    
    # Obtaining the movie's id
    movie_id = details['results'][0]['id']
    
    # Returning the required details of the movie
    movie = tmdb.Movies(movie_id)
    return movie.info()

In [30]:
# Extracting movies' data
all_movies = tmdb.Movies()

In [31]:
# Obtaining the list of all genres
genres = tmdb.Genres()
genres_list = genres.list()['genres']

# Creating a dictionary of id values and movies genres
dict_genre_ids = {}

for x in genres_list:
    dict_genre_ids[x['id']] = x['name']

# Dictionary of id values and movie genres
dict_genre_ids

{28: 'Action',
 12: 'Adventure',
 16: 'Animation',
 35: 'Comedy',
 80: 'Crime',
 99: 'Documentary',
 18: 'Drama',
 10751: 'Family',
 14: 'Fantasy',
 36: 'History',
 27: 'Horror',
 10402: 'Music',
 9648: 'Mystery',
 10749: 'Romance',
 878: 'Science Fiction',
 10770: 'TV Movie',
 53: 'Thriller',
 10752: 'War',
 37: 'Western'}

In [32]:
# Extracting the top 1000 movies
top_movies = []
genres_of_movies = []
movies_overviews = []

# Number of pages the data is going to be extracted from
max_pages = 51

# Extracting movies from 50 pages
for i in range(1, max_pages):
    
    # Top movies from 'i'th page
    top_movies_page = all_movies.popular(page = i)
    top_movies_page_results = top_movies_page['results']
    
    # Extracting information of all the movies in this page
    for j in top_movies_page_results:
        
        # Finding the overview of the movie
        overview = j['overview']
        
        # If there is no overview for a movie, move on to the next one
        if overview == '':
            continue
        
        top_movies.append(j['original_title'])
        movies_overviews.append(overview)
        
        # Getting the genre ids of the movie
        genres_list = j['genre_ids']
        
        # To store the genres of the movie
        g_list = []
        
        # Storing the genre names in a list
        for k in genres_list:
            g_list.append(dict_genre_ids[k])
        
        # Appending the genre names in the main list
        genres_of_movies.append(g_list)
    
    # Giving the API request some rest
    if i % 10 == 0:
        time.sleep(1)

In [33]:
# This is the dataset
# Finding all the movies with their genres
for i, j in zip(top_movies, genres_of_movies):
    print(i, j)

Zack Snyder's Justice League ['Action', 'Adventure', 'Fantasy', 'Science Fiction']
Godzilla vs. Kong ['Action', 'Science Fiction']
승리호 ['Drama', 'Fantasy', 'Science Fiction']
Raya and the Last Dragon ['Animation', 'Adventure', 'Fantasy', 'Family', 'Action']
Sentinelle ['Thriller', 'Action', 'Drama']
Tom & Jerry ['Action', 'Comedy', 'Family']
Wonder Woman 1984 ['Fantasy', 'Action', 'Adventure']
Monster Hunter ['Fantasy', 'Action', 'Adventure']
Bajocero ['Action', 'Crime', 'Thriller']
Cherry ['Crime', 'Drama']
Outside the Wire ['Thriller', 'Action', 'Science Fiction']
Black Water: Abyss ['Horror', 'Thriller', 'Adventure', 'Mystery']
Red Dot ['Drama', 'Thriller']
Godzilla: King of the Monsters ['Science Fiction', 'Action']
The Little Things ['Thriller', 'Crime']
Breach ['Science Fiction', 'Action']
King Kong vs. Godzilla ['Science Fiction', 'Action', 'Adventure', 'Fantasy']
Miraculous World: New York, United HeroeZ ['Animation', 'Family']
Mortal Kombat Legends: Scorpion's Revenge ['Fantas

In [34]:
# Modyfing the dictionary into a list
# It contains all the genres in some indices
list_of_genres = []

for i in dict_genre_ids:
    list_of_genres.append(dict_genre_ids[i])

In [35]:
# To filter the overviews of movies
content = []

# Filtering the text in the movie overview
for i in movies_overviews:
    
    overview = i.replace(',', '')
    overview = overview.replace('.', '')
    
    # Updating the filtered overview in another list
    content.append(overview)

# Re-assigning the reference
movies_overviews = content

In [36]:
len(top_movies), len(movies_overviews), len(genres_of_movies)

(992, 992, 992)

In [37]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

from sklearn.multiclass import OneVsRestClassifier

In [38]:
# Converting the genres into binary vectors
multi_label_binarizer = MultiLabelBinarizer()
y = multi_label_binarizer.fit_transform(genres_of_movies)

In [39]:
# Genres of movies in the form of binary vectors
print(y)
# Set of all the genres
print(multi_label_binarizer.classes_)

[[1 1 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [1 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
['Action' 'Adventure' 'Animation' 'Comedy' 'Crime' 'Documentary' 'Drama'
 'Family' 'Fantasy' 'History' 'Horror' 'Music' 'Mystery' 'Romance'
 'Science Fiction' 'TV Movie' 'Thriller' 'War' 'Western']


In [40]:
tfidf_vectorizer = TfidfVectorizer(analyzer = 'word')
X = tfidf_vectorizer.fit_transform(movies_overviews)
X

<992x8129 sparse matrix of type '<class 'numpy.float64'>'
	with 37913 stored elements in Compressed Sparse Row format>

In [41]:
# tfidf_vectorizer.vocabulary_

In [42]:
X.shape, y.shape

((992, 8129), (992, 19))

In [43]:
# Splitting the training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [44]:
parameters = {'kernel':['linear'], 'C':[0.01, 0.1, 1.0]}
gridCV = GridSearchCV(SVC(class_weight='balanced'), parameters, scoring=make_scorer(f1_score, average='micro'))
classif = OneVsRestClassifier(gridCV)

classif.fit(X_train, y_train)

OneVsRestClassifier(estimator=GridSearchCV(estimator=SVC(class_weight='balanced'),
                                           param_grid={'C': [0.01, 0.1, 1.0],
                                                       'kernel': ['linear']},
                                           scoring=make_scorer(f1_score, average=micro)))

In [45]:
res = classif.predict(X_test)

In [47]:
print(classification_report(y_test, res, target_names = list_of_genres))

                 precision    recall  f1-score   support

         Action       0.79      0.68      0.73       153
      Adventure       0.62      0.50      0.55       100
      Animation       0.78      0.34      0.48        73
         Comedy       0.52      0.30      0.38        79
          Crime       0.50      0.03      0.06        32
    Documentary       0.00      0.00      0.00         9
          Drama       0.53      0.23      0.32        90
         Family       0.77      0.29      0.42        58
        Fantasy       0.64      0.25      0.36        64
        History       0.00      0.00      0.00        10
         Horror       0.36      0.12      0.19        40
          Music       0.00      0.00      0.00         6
        Mystery       0.00      0.00      0.00        19
        Romance       0.33      0.04      0.06        28
Science Fiction       0.65      0.44      0.52        78
       TV Movie       0.00      0.00      0.00         5
       Thriller       0.57    