# Using NLP (keyword matching) to predict genre from the overview

As you can see from the graph below, we have a significant number of missing genre values. So, we will use keyword matching to predict the genre. This is first out of 3 steps - (1) classical NLP, (2) LLM like BERT, (3) using a preexisting LLM API openai or an opensource LLM from HuggingFace 

In [1]:
import pandas as pd
import os
from IPython.display import display
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
TMDB_filename = os.path.join(os.getcwd(), "TMDB_tv_dataset_v3.csv")
df = pd.read_csv(TMDB_filename)

In [3]:
print(df.isnull().sum())

id                           0
name                         5
number_of_seasons            0
number_of_episodes           0
original_language            0
vote_count                   0
vote_average                 0
overview                 75306
adult                        0
backdrop_path            90859
first_air_date           31736
last_air_date            29904
homepage                117641
in_production                0
original_name                5
popularity                   0
poster_path              59902
type                         0
status                       0
tagline                 163309
genres                   68926
created_by              132143
languages                58589
networks                 71050
origin_country           31030
spoken_languages         59359
production_companies    109297
production_countries     91128
episode_run_time             0
cleaned_overview         75386
dtype: int64


### Loading a spaCy model 

spaCY provides natural language processing (NLP) functionalities, such as tokenization, stopword removal, and lemmatization

In [4]:
'''
import spacy

# Print the number of missing genres with an available overview
missing_genres = df[df['genres'].isnull() & df['overview'].notnull()]
print(f"Missing genre values with an overview: {len(missing_genres)}")

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")
'''

'\nimport spacy\n\n# Print the number of missing genres with an available overview\nmissing_genres = df[df[\'genres\'].isnull() & df[\'overview\'].notnull()]\nprint(f"Missing genre values with an overview: {len(missing_genres)}")\n\n# Load the spaCy model\nnlp = spacy.load("en_core_web_sm")\n'

### Preprocessing 

In [5]:
'''
# Preprocess function: tokenize, remove stopwords, and lemmatize
def preprocess_text(text):
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(tokens)

# Apply preprocessing to all rows (not just missing genres)
df['cleaned_overview'] = df['overview'].apply(preprocess_text)
'''

'\n# Preprocess function: tokenize, remove stopwords, and lemmatize\ndef preprocess_text(text):\n    doc = nlp(text)\n    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]\n    return " ".join(tokens)\n\n# Apply preprocessing to all rows (not just missing genres)\ndf[\'cleaned_overview\'] = df[\'overview\'].apply(preprocess_text)\n'

### keyword matching 

In [6]:
# Define a dictionary with genres and corresponding keywords
# Define a dictionary with genres and corresponding keywords
genre_keywords = {
    'Action & Adventure': ['fight', 'battle', 'war', 'hero', 'explosion', 'adventure', 'quest', 'journey'],
    'Animation': ['animate', 'cartoon', 'draw', 'anime', 'cgi', 'animate', 'pixar', 'disney'],
    'Comedy': ['funny', 'humor', 'comedy', 'joke', 'laugh', 'parody', 'satire', 'hilarious'],
    'Crime': ['detective', 'murder', 'police', 'criminal', 'robbery', 'heist', 'gang', 'crime'],
    'Documentary': ['documentary', 'true story', 'biography', 'real event', 'non-fiction'],
    'Drama': ['drama', 'emotional', 'serious', 'family', 'relationship', 'conflict'],
    'Family': ['family', 'kid', 'parent', 'child', 'home', 'love', 'domestic'],
    'History': ['historical', 'past', 'medieval', 'war', 'ancient', 'empire', 'revolution'],
    'Kids': ['child', 'kid', 'young', 'fun', 'adventure', 'family'],
    'Music': ['music', 'band', 'singer', 'song', 'concert', 'performance'],
    'Musical': ['musical', 'song', 'dance', 'performance', 'broadway', 'theatre', 'show'],
    'Mystery': ['mystery', 'detective', 'crime', 'whodunit', 'puzzle', 'clue', 'suspense'],
    'News': ['news', 'breaking', 'headline', 'report', 'journalist', 'anchor', 'current event'],
    'Reality': ['reality', 'real life', 'contest', 'competition', 'unscripted', 'reality show'],
    'Romance': ['love', 'romantic', 'relationship', 'heart', 'affair', 'couple'],
    'Sci-Fi & Fantasy': ['space', 'alien', 'robot', 'futuristic', 'time travel', 'fantasy', 'magic', 'dragon'],
    'Soap': ['soap opera', 'drama', 'romance', 'affair', 'family', 'betrayal', 'melodrama'],
    'Talk': ['talk show', 'interview', 'discussion', 'host', 'celebrity', 'guest', 'panel'],
    'Unknown': ['unknown', 'undefined', 'not classified', 'unidentified', 'other'],
    'War & Politics': ['war', 'battle', 'soldier', 'politics', 'government', 'military', 'conflict', 'revolution'],
    'Western': ['cowboy', 'western', 'sheriff', 'outlaw', 'desert', 'gunfight', 'frontier', 'wild west']
}

#having multiple genres - what to do? 

# Function to assign genre based on keywords
def assign_genre(text):
    if isinstance(text, str):  
        for genre, keywords in genre_keywords.items():
            for keyword in keywords:
                if keyword in text:
                    return genre
    return 'Unknown'  # If no match is found

### Applying the Function & Calculating Accuracy 

In [7]:
# Predict genres for all rows (even the ones that already have genres)
df['predicted_genres'] = df['cleaned_overview'].apply(assign_genre)

# Check how many predicted genres match the actual genres
df['match'] = df['genres'] == df['predicted_genres']

# Calculate the accuracy for rows where the actual genre exists
accuracy = df[df['genres'].notnull()]['match'].mean()
print(f"Prediction accuracy: {accuracy * 100:.2f}%")

Prediction accuracy: 7.62%


In [8]:
df.columns

Index(['id', 'name', 'number_of_seasons', 'number_of_episodes',
       'original_language', 'vote_count', 'vote_average', 'overview', 'adult',
       'backdrop_path', 'first_air_date', 'last_air_date', 'homepage',
       'in_production', 'original_name', 'popularity', 'poster_path', 'type',
       'status', 'tagline', 'genres', 'created_by', 'languages', 'networks',
       'origin_country', 'spoken_languages', 'production_companies',
       'production_countries', 'episode_run_time', 'cleaned_overview',
       'predicted_genres', 'match'],
      dtype='object')

In [9]:
#Next Step: Finetuning 
#Adjust Keywords as needed based on accuracy 

### Fill missing genres and save 

In [10]:
# Only fill missing genre values with the predicted genre
df['genres'] = df['genres'].fillna(df['predicted_genres'])

# Save the updated dataframe to a new CSV file
df.to_csv("TMDB_tv_dataset_v3.csv", index=False)

# Check the final few rows to see if genres have been filled
print(df[['overview', 'genres', 'predicted_genres']].tail())


                                                 overview   genres  \
168634                                                NaN  Unknown   
168635                                                NaN  Unknown   
168636  Murder, art and a journalist's relentless ques...    Crime   
168637  A college student gets into trouble when she m...  Unknown   
168638                 Short-lives series on Youtube Red.  Unknown   

          predicted_genres  
168634             Unknown  
168635             Unknown  
168636  Action & Adventure  
168637             Unknown  
168638             Unknown  
