# Using NLP (keyword matching) to predict genre from the overview

As you can see from the graph below, we have a significant number of missing genre values, so we will use keyword matching to predict the genre. This is the first of three steps: (1) classical NLP, (2) a language model such as BERT, and (3) a pre-existing LLM API (e.g. OpenAI) or an open-source LLM from Hugging Face.

In [None]:
import pandas as pd
import os
from IPython.display import display
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
# Build the dataset path from the current working directory, then load the
# CSV into the DataFrame that the remaining cells operate on.
TMDB_filename = os.path.join(
    os.getcwd(),
    "TMDB_tv_dataset_v3.csv",
)
df = pd.read_csv(filepath_or_buffer=TMDB_filename)

In [None]:
# Report the number of missing values in each column.
missing_per_column = df.isna().sum()
print(missing_per_column)

### Loading a spaCy model 

spaCy provides natural language processing (NLP) functionality such as tokenization, stopword removal, and lemmatization.

In [None]:
import spacy

# Rows that have an overview to work from but no genre label yet.
has_overview = df['overview'].notnull()
lacks_genre = df['genres'].isnull()
missing_genres = df[lacks_genre & has_overview]
print(f"Missing genre values with an overview: {len(missing_genres)}")

# Small English spaCy pipeline, bound to `nlp` for the preprocessing step.
nlp = spacy.load("en_core_web_sm")


### Preprocessing 

In [None]:
# NOTE(review): the preprocessing below is disabled. It sits inside a bare
# string literal, so this cell defines neither `preprocess_text` nor
# df['cleaned_overview'], although later cells read df['cleaned_overview'].
# When enabled, it runs each overview through the spaCy pipeline `nlp`, keeps
# the lemma of every token that is neither a stop word nor punctuation, and
# joins the lemmas with single spaces.
# Presumably NaN overviews would need handling before calling nlp() -- TODO confirm.
'''
# Preprocess function: tokenize, remove stopwords, and lemmatize
def preprocess_text(text):
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(tokens)

# Apply preprocessing to all rows (not just missing genres)
df['cleaned_overview'] = df['overview'].apply(preprocess_text)
'''

### Keyword matching

In [None]:
# Genre -> keywords whose presence in an overview suggests that genre.
# Insertion order matters: predicted genres are reported in this order.
genre_keywords = {
    "Action & Adventure": [
        "fight", "battle", "war", "hero",
        "explosion", "adventure", "quest", "journey",
    ],
    "Animation": [
        "animated", "cartoon", "drawing", "anime",
        "cgi", "animation", "pixar", "disney",
    ],
    "Comedy": [
        "funny", "humor", "comedy", "joke",
        "laugh", "parody", "satire", "hilarious",
    ],
    "Crime": [
        "detective", "murder", "police", "criminal",
        "robbery", "heist", "gang", "crime",
    ],
    "Documentary": [
        "documentary", "true story", "biography", "real events", "non-fiction",
    ],
    "Drama": [
        "drama", "emotional", "serious", "family", "relationship", "conflict",
    ],
    "Family": [
        "family", "kids", "parent", "children", "home", "love", "domestic",
    ],
    "History": [
        "historical", "past", "medieval", "war",
        "ancient", "empire", "revolution",
    ],
    "Kids": [
        "children", "kids", "young", "fun", "adventure", "family",
    ],
    "Music": [
        "music", "band", "singer", "song", "concert", "performance",
    ],
    "Musical": [
        "musical", "song", "dance", "performance",
        "broadway", "theatre", "show",
    ],
    "Mystery": [
        "mystery", "detective", "crime", "whodunit",
        "puzzle", "clue", "suspense",
    ],
    "News": [
        "news", "breaking", "headline", "report",
        "journalist", "anchor", "current events",
    ],
    "Reality": [
        "reality", "real life", "contest", "competition",
        "unscripted", "reality show",
    ],
    "Romance": [
        "love", "romantic", "relationship", "heart", "affair", "couple",
    ],
    "Sci-Fi & Fantasy": [
        "space", "alien", "robot", "futuristic",
        "time travel", "fantasy", "magic", "dragon",
    ],
    "Soap": [
        "soap opera", "drama", "romance", "affairs",
        "family", "betrayal", "melodrama",
    ],
    "Talk": [
        "talk show", "interview", "discussion", "host",
        "celebrity", "guest", "panel",
    ],
    "Unknown": [
        "unknown", "undefined", "not classified", "unidentified", "other",
    ],
    "War & Politics": [
        "war", "battle", "soldier", "politics",
        "government", "military", "conflict", "revolution",
    ],
    "Western": [
        "cowboy", "western", "sheriff", "outlaw",
        "desert", "gunfight", "frontier", "wild west",
    ],
}

# Open question: an overview can match several genres -- what should happen then?
# The single-label variant below is disabled (it sits inside a bare string
# literal). It walks genre_keywords in insertion order and returns the first
# genre that has any keyword occurring as a substring of the text, or
# 'Unknown' when nothing matches.
'''
# Function to assign genre based on keywords
def assign_genre(text):
    for genre, keywords in genre_keywords.items():
        for keyword in keywords:
            if keyword in text:
                return genre
    return 'Unknown'  # If no match is found
'''

In [None]:
# Compiled keyword regexes, keyed by the tuple of keywords they were built
# from, so each genre's pattern is compiled once rather than once per row.
_KEYWORD_PATTERN_CACHE = {}


def _keyword_pattern(keywords):
    """Return a cached regex matching any of *keywords*, or None if there are none."""
    key = tuple(keywords)
    if not key:
        # An empty alternation would match every text; treat it as "no match".
        return None
    if key not in _KEYWORD_PATTERN_CACHE:
        alternatives = '|'.join(re.escape(keyword) for keyword in key)
        # (?<!\w) requires the keyword to start a word, so 'war' no longer
        # fires inside "award" or 'host' inside "ghost". No trailing boundary
        # is required, so inflected forms such as "wars" still match.
        _KEYWORD_PATTERN_CACHE[key] = re.compile(
            rf'(?<!\w)(?:{alternatives})', re.IGNORECASE
        )
    return _KEYWORD_PATTERN_CACHE[key]


def assign_genres(text, keyword_map=None):
    """Return every genre whose keywords occur in *text*.

    Matching is case-insensitive and a keyword must begin at the start of a
    word; it may be followed by further letters (e.g. a plural ending).

    Args:
        text: Overview text. Anything that is not a string (NaN, None, ...)
            is treated as having no overview.
        keyword_map: Optional mapping of genre name to an iterable of
            keywords. Defaults to the module-level ``genre_keywords``.

    Returns:
        A list of matching genre names in the mapping's iteration order, or
        ``['Unknown']`` when the text is missing or nothing matches.
    """
    if not isinstance(text, str):
        return ['Unknown']
    if keyword_map is None:
        keyword_map = genre_keywords

    matched_genres = []
    for genre, keywords in keyword_map.items():
        pattern = _keyword_pattern(keywords)
        if pattern is not None and pattern.search(text):
            matched_genres.append(genre)
    return matched_genres or ['Unknown']

# Predict from the lemmatised text when it exists. The preprocessing cell is
# currently disabled, so fall back to the raw overview instead of raising
# KeyError on the missing 'cleaned_overview' column.
text_column = 'cleaned_overview' if 'cleaned_overview' in df.columns else 'overview'
df['predicted_genres'] = df[text_column].apply(assign_genres)

# Fill missing genre values with the predicted genres, joined by commas.
# NOTE: this overwrites df['genres'] in place, so after this line the column
# no longer distinguishes original labels from predicted ones.
df['genres'] = df['genres'].fillna(df['predicted_genres'].apply(', '.join))

# Save the updated DataFrame to a new CSV file
df.to_csv('updated_dataset_with_multigenre.csv', index=False)

# Check the final few rows to see how genres have been filled
print(df[['overview', 'genres', 'predicted_genres']].tail())


### Applying the Function & Calculating Accuracy 

In [None]:
# Preprocessing (df['cleaned_overview'] = df['overview'].apply(preprocess_text))
# is disabled here; the lemmatised column is used only if it already exists.


def _genre_set(value):
    """Normalise a genre label (comma-separated string or list) to a set of names."""
    if isinstance(value, str):
        parts = value.split(',')
    elif isinstance(value, (list, tuple, set)):
        parts = value
    else:
        return set()  # NaN / None: no labels
    return {str(part).strip() for part in parts if str(part).strip()}


# Predict genres for all rows (even the ones that already have genres)
text_column = 'cleaned_overview' if 'cleaned_overview' in df.columns else 'overview'
df['predicted_genres'] = df[text_column].apply(assign_genres)

# df['genres'] may already have been filled with predictions by an earlier
# cell, so take the ground truth from the untouched file instead.
# Assumes df still carries the row index it had when loaded -- TODO confirm.
original_genres = pd.read_csv(TMDB_filename, usecols=['genres'])['genres']
original_genres = original_genres.reindex(df.index)
has_label = original_genres.notnull()

# Compare as sets: the labels are comma-separated strings while predictions
# are lists, so a direct == between the two columns is never True.
df['match'] = [
    _genre_set(actual) == _genre_set(predicted)
    for actual, predicted in zip(original_genres, df['predicted_genres'])
]

# Calculate the accuracy only over rows that had a genre in the source data
accuracy = df.loc[has_label, 'match'].mean()
print(f"Prediction accuracy: {accuracy * 100:.2f}%")


In [None]:
# Next step: fine-tuning
# Adjust the keyword lists as needed, based on the measured accuracy

### Fill missing genres and save 

In [None]:
# Only fill missing genre values with the predicted genres. Predictions are
# lists, so join them into the same comma-separated string form used by the
# existing labels; otherwise the CSV would contain Python list reprs.
predicted_as_text = df['predicted_genres'].apply(
    lambda genres: ', '.join(genres) if isinstance(genres, (list, tuple)) else genres
)
df['genres'] = df['genres'].fillna(predicted_as_text)

# Save the updated dataframe to a new CSV file
df.to_csv('updated_dataset.csv', index=False)

# Check the final few rows to see if genres have been filled
print(df[['overview', 'genres', 'predicted_genres']].tail())
