## Filling the missing NA of primary genres

In [1]:
import pandas as pd

shows = pd.read_csv('shows_cleaned.csv')

In [2]:
shows['genre_list'].value_counts().reset_index()

Unnamed: 0,genre_list,count
0,[],67024
1,['Documentary'],17490
2,['Drama'],16114
3,['Comedy'],10194
4,['Reality'],7953
...,...,...
2210,"['Animation', 'Animation', 'Mystery', 'Crime']",1
2211,"['Animation', 'Action & Adventure', 'Comedy', ...",1
2212,"['Family', 'Kids', 'Animation', 'Mystery']",1
2213,"['Kids', 'News', 'Talk']",1


In [3]:
import ast
unique_genres = set()

for genres in shows['genre_list'].dropna():
    if isinstance(genres, str):
        genres = ast.literal_eval(genres)
    unique_genres.update(genres)

unique_genres_list = sorted(unique_genres)

unique_genres_list

['Action & Adventure',
 'Animation',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'History',
 'Kids',
 'Music',
 'Musical',
 'Mystery',
 'News',
 'Reality',
 'Romance',
 'Sci-Fi & Fantasy',
 'Soap',
 'Talk',
 'War & Politics',
 'Western']

In [4]:
from transformers import pipeline

# load zero-shot classification model
pipe = pipeline("zero-shot-classification",
                model='facebook/bart-large-mnli',
                device=0)



Device set to use cuda:0


In [5]:
sequence = shows.loc[0, 'overview']

In [6]:
pipe(sequence, unique_genres_list, multi_label=True)

{'sequence': "Seven noble families fight for control of the mythical land of Westeros. Friction between the houses leads to full-scale war. All while a very ancient evil awakens in the farthest north. Amidst the war, a neglected military order of misfits, the Night's Watch, is all that stands between the realms of men and icy horrors beyond.",
 'labels': ['Family',
  'Sci-Fi & Fantasy',
  'War & Politics',
  'Drama',
  'Action & Adventure',
  'News',
  'Talk',
  'Mystery',
  'History',
  'Documentary',
  'Crime',
  'Reality',
  'Music',
  'Romance',
  'Animation',
  'Kids',
  'Western',
  'Musical',
  'Comedy',
  'Soap'],
 'scores': [0.9313384890556335,
  0.8475427031517029,
  0.7952824234962463,
  0.7031816244125366,
  0.678015410900116,
  0.35945776104927063,
  0.3442034125328064,
  0.33819881081581116,
  0.2616749703884125,
  0.21297438442707062,
  0.16166891157627106,
  0.1436222642660141,
  0.04792564734816551,
  0.042587701231241226,
  0.03838364779949188,
  0.03138304501771927,


In [7]:
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [8]:
def generate_predictions(description, genre_list, threshold=0.75):

    device = "cuda" if torch.cuda.is_available() else "cpu"
    pipe.model.to(device)
    
    with torch.inference_mode(): 
        result = pipe(description, genre_list, multi_label=True)


    predicted_genre = [label for label, score in zip(result['labels'], result['scores']) if score >= threshold]

    if len(predicted_genre) > 3:
        predicted_genre = predicted_genre[:3]

    if not predicted_genre:
        predicted_genre = [result['labels'][0]]

    primary_genre = predicted_genre[0] if predicted_genre else None

    return primary_genre, predicted_genre

In [9]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Generate Predictions
actual_genres = []
predicted_genres = []

for i in tqdm(range(10)):  # Process first 1000 samples
    sequence = shows['overview'].iloc[i]
    primary_genre, predicted_genre = generate_predictions(sequence, unique_genres_list)
    
    actual_genres.append(shows['genre_list'].iloc[i])  # Actual genres
    predicted_genres.append(predicted_genre)  # Predicted genres


 90%|█████████ | 9/10 [00:03<00:00,  2.84it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 10/10 [00:03<00:00,  2.88it/s]


In [10]:
print("Sample Predicted Genres:", predicted_genres[:10])  
print("Sample Actual Genres:", actual_genres[:10])

Sample Predicted Genres: [['Family', 'Sci-Fi & Fantasy', 'War & Politics'], ['Mystery', 'Crime', 'Action & Adventure'], ['Mystery', 'Action & Adventure', 'Sci-Fi & Fantasy'], ['Family'], ['Crime'], ['Drama'], ['Kids'], ['Family'], ['Action & Adventure'], ['Sci-Fi & Fantasy']]
Sample Actual Genres: ["['Sci-Fi & Fantasy', 'Drama', 'Action & Adventure']", "['Crime', 'Drama']", "['Drama', 'Sci-Fi & Fantasy', 'Mystery']", "['Action & Adventure', 'Drama', 'Sci-Fi & Fantasy']", "['Crime', 'Sci-Fi & Fantasy']", "['Crime', 'Drama', 'Mystery']", "['Action & Adventure', 'Mystery', 'Drama']", "['Drama', 'Crime']", "['Drama']", "['Sci-Fi & Fantasy', 'Mystery', 'Drama']"]


In [11]:
generate_predictions(sequence, unique_genres_list)

('Sci-Fi & Fantasy', ['Sci-Fi & Fantasy'])

In [12]:
sequence

'Wanda Maximoff and Vision—two super-powered beings living idealized suburban lives—begin to suspect that everything is not as it seems.'

In [13]:
# shows['genre_list'] = shows['genre_list'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# missing_genres = (shows["primary_genre"] == "Unknown") & (shows['genre_list'].apply(lambda x: len(x) == 0))

# shows.loc[missing_genres, ["primary_genre", 'genre_list']] = shows.loc[missing_genres, 'overview'].apply(
#     lambda x: pd.Series(generate_predictions(x, unique_genres_list))
# )

# shows[['overview', 'primary_genre', 'genre_list']].head()

In [14]:
import numpy as np
import torch
from tqdm import tqdm

batch_size = 512

# Get indices of missing genres
missing_indices = shows.index[
    (shows["primary_genre"] == "Unknown") & 
    (shows['genre_list'].astype(str)=="[]")
]

# Convert to list for easier batching
missing_overviews = shows.loc[missing_indices, "overview"].tolist()

len(missing_indices)

67024

In [17]:

# Store predictions
predicted_genres_list = []
primary_genres_list = []

# Process in batches
with torch.inference_mode(), torch.amp.autocast("cuda"):
    for i in tqdm(range(0, len(missing_overviews), batch_size), desc="Processing Batches"):
        batch = missing_overviews[i : i + batch_size]

        # Get predictions in batch
        batch_results = [generate_predictions(desc, unique_genres_list) for desc in batch]

        # Separate predictions
        batch_primary_genres, batch_predicted_genres = zip(*batch_results)

        predicted_genres_list.extend(batch_predicted_genres)
        primary_genres_list.extend(batch_primary_genres)

# Convert results to a DataFrame
results_df = pd.DataFrame({
    "genre_list": predicted_genres_list,
    "primary_genre": primary_genres_list
}, index=missing_indices)

# Update original DataFrame
shows.update(results_df)

print("✅ Processing complete! Updated DataFrame successfully.")

Processing Batches: 100%|██████████| 131/131 [7:26:59<00:00, 204.73s/it]  

✅ Processing complete! Updated DataFrame successfully.





In [19]:
shows.to_csv("shows_with_genre_list.csv", index=False)