In [None]:
import pandas as pd

# Load the dataset to examine its structure
file_path = '/content/drive/My Drive/cleaned_movie_data.csv'
movie_data = pd.read_csv(file_path)

# Display the first few rows of the dataset
movie_data.head()


Unnamed: 0,id,title,genres,plot_summary,high_Level_Genres
0,23890098,Taxi Blues,Drama,shlykov a hardworking taxi driver and lyosha a...,Drama
1,23890098,Taxi Blues,World cinema,shlykov a hardworking taxi driver and lyosha a...,International
2,31186339,The Hunger Games,Action/Adventure,the nation of panem consists of a wealthy capi...,Adventure
3,31186339,The Hunger Games,Science Fiction,the nation of panem consists of a wealthy capi...,Science Fiction
4,31186339,The Hunger Games,Action,the nation of panem consists of a wealthy capi...,Action


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Select relevant columns for modeling
X = movie_data['plot_summary']
y = movie_data['high_Level_Genres']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Convert the plot summaries into TF-IDF features
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train a baseline Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)

# Predict on the test set
y_pred = nb_classifier.predict(X_test_tfidf)

# Evaluate the model's performance
classification_report_baseline = classification_report(y_test, y_pred, output_dict=False)
print(classification_report_baseline)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                        precision    recall  f1-score   support

                Action       0.27      0.23      0.25      1694
             Adventure       0.26      0.16      0.20      1429
              Animated       0.20      0.06      0.10       619
                Comedy       0.25      0.30      0.28      3022
                  Cult       0.00      0.00      0.00       466
                 Drama       0.25      0.49      0.33      3827
                Family       0.31      0.15      0.21      1298
Historical/Documentary       0.51      0.18      0.26       956
                Horror       0.32      0.18      0.23       895
         International       0.24      0.14      0.17      2691
               Musical       0.40      0.00      0.01       554
               Romance       0.33      0.27      0.30      2263
       Science Fiction       0.23      0.23      0.23       984
            Short Film       0.28      0.14      0.19       883
              Thriller       0.38      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW, get_scheduler
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import EarlyStoppingCallback
import torch
import pandas as pd
import numpy as np

# Reload the dataset
file_path = '/content/drive/My Drive/cleaned_movie_data.csv'
movie_data = pd.read_csv(file_path)

# Concatenate metadata (language, release date) with plot_summary
#movie_data['enhanced_summary'] = movie_data['plot_summary'] + " [LANGUAGE] " + movie_data['genres']

#Enhanced Summary
movie_data['enhanced_summary'] = movie_data['plot_summary'] + " [LANGUAGE] " + movie_data['genres']

# Tokenizer for DistilBERT
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Encode labels using LabelEncoder
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(movie_data['high_Level_Genres'])

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)

# Custom Dataset Class
class EnhancedMovieDataset(Dataset):
    def __init__(self, summaries, labels, tokenizer, max_len=256):
        self.summaries = summaries
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.summaries)

    def __getitem__(self, idx):
        text = self.summaries.iloc[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Prepare dataset
dataset = EnhancedMovieDataset(movie_data['enhanced_summary'], y, tokenizer)

# Split into training and validation datasets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Prepare DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Define the DistilBERT model
num_labels = len(label_encoder.classes_)
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=3e-5)
num_training_steps = len(train_loader) * 3  # 3 epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Weighted loss function
criterion = torch.nn.CrossEntropyLoss(weight=class_weights_tensor.to(device))

# Training Loop
epochs = 3
train_loss_values = []

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = criterion(outputs.logits, labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        lr_scheduler.step()

    avg_loss = total_loss / len(train_loader)
    train_loss_values.append(avg_loss)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss}")

print("Training completed!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Loss: 1.1820739758865484
Epoch 2/3, Loss: 0.9994859552005769
Epoch 3/3, Loss: 0.8939110372498267
Training completed!


In [None]:
from sklearn.metrics import classification_report, accuracy_score
import torch

# Evaluate the model on the validation set
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).cpu().numpy()

        # Collect predictions and labels
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

# Calculate evaluation metrics
accuracy = accuracy_score(all_labels, all_preds)
report = classification_report(all_labels, all_preds, target_names=label_encoder.classes_)

accuracy, report


(0.611556202961884,
 '                        precision    recall  f1-score   support\n\n                Action       0.56      0.61      0.59      1674\n             Adventure       0.46      0.54      0.50      1349\n              Animated       0.37      0.62      0.46       635\n                Comedy       0.85      0.57      0.69      3027\n                  Cult       0.33      0.59      0.42       513\n                 Drama       0.92      0.56      0.70      3819\n                Family       0.58      0.58      0.58      1305\nHistorical/Documentary       0.54      0.78      0.64       970\n                Horror       0.46      0.67      0.55       873\n         International       0.75      0.60      0.66      2744\n               Musical       0.29      0.63      0.40       555\n               Romance       0.65      0.66      0.65      2298\n       Science Fiction       0.39      0.55      0.46       999\n            Short Film       0.63      0.66      0.65       857\n 

In [None]:
print(report)

                        precision    recall  f1-score   support

                Action       0.56      0.61      0.59      1674
             Adventure       0.46      0.54      0.50      1349
              Animated       0.37      0.62      0.46       635
                Comedy       0.85      0.57      0.69      3027
                  Cult       0.33      0.59      0.42       513
                 Drama       0.92      0.56      0.70      3819
                Family       0.58      0.58      0.58      1305
Historical/Documentary       0.54      0.78      0.64       970
                Horror       0.46      0.67      0.55       873
         International       0.75      0.60      0.66      2744
               Musical       0.29      0.63      0.40       555
               Romance       0.65      0.66      0.65      2298
       Science Fiction       0.39      0.55      0.46       999
            Short Film       0.63      0.66      0.65       857
              Thriller       0.71      

In [None]:
from transformers import DistilBertTokenizer
import os
import shutil

# Define the directory to save the model
save_directory = '/content/drive/My Drive/fine_tuned_distilbert_model_final'

# Save the model and tokenizer
os.makedirs(save_directory, exist_ok=True)
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

# Compress the model directory into a zip file for download
shutil.make_archive(save_directory, 'zip', save_directory)

# Provide the path to download
save_zip_path = save_directory + '.zip'
save_zip_path


'/content/drive/My Drive/fine_tuned_distilbert_model_final.zip'

In [None]:
import torch

# Define device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Specify the path to your saved model
model_path = '/content/drive/My Drive/fine_tuned_distilbert_model_final'

# Load the model and tokenizer
model = DistilBertForSequenceClassification.from_pretrained(model_path)
tokenizer = DistilBertTokenizer.from_pretrained(model_path)
model.to(device)


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
import torch

# Define device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the fine-tuned model and tokenizer
model_path = '/content/drive/My Drive/fine_tuned_distilbert_model_final'
model = DistilBertForSequenceClassification.from_pretrained(model_path)
tokenizer = DistilBertTokenizer.from_pretrained(model_path)
model.to(device)

# Reload dataset
movie_data = pd.read_csv('/content/drive/My Drive/cleaned_movie_data.csv')
movie_data['enhanced_summary'] = movie_data['plot_summary'] + " [LANGUAGE] " + movie_data['genres']

# Reinitialize the LabelEncoder
label_encoder = LabelEncoder()
label_encoder.fit(movie_data['high_Level_Genres'])

# Define the input sentence for inference
input_text = "A hacker discovers a simulated reality controlled by machines."

# Preprocess the input using the tokenizer
encoded_input = tokenizer(
    input_text,
    truncation=True,
    padding='max_length',
    max_length=128,
    return_tensors='pt'
)

# Move input to the same device as the model
input_ids = encoded_input['input_ids'].to(device)
attention_mask = encoded_input['attention_mask'].to(device)

# Put the model in evaluation mode and make a prediction
model.eval()
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    predicted_class_id = torch.argmax(logits, dim=1).item()

# Map the predicted class ID back to the genre label
predicted_genre = label_encoder.inverse_transform([predicted_class_id])[0]
print(f"Predicted Genre: {predicted_genre}")


Predicted Genre: Science Fiction


In [None]:
print(label_encoder.classes_)

['Action' 'Adventure' 'Animated' 'Comedy' 'Cult' 'Drama' 'Family'
 'Historical/Documentary' 'Horror' 'International' 'Musical' 'Romance'
 'Science Fiction' 'Short Film' 'Thriller']


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Reload dataset
movie_data = pd.read_csv('/content/drive/My Drive/cleaned_movie_data.csv')
movie_data['enhanced_summary'] = movie_data['plot_summary'] + " [LANGUAGE] " + movie_data['genres']

# Input text for recommendations
input_text = "A group of explorers travels through a wormhole in space to ensure humanity's survival."

# Preprocess input and predict genres
encoded_input = tokenizer(input_text, truncation=True, padding='max_length', max_length=128, return_tensors='pt')
input_ids = encoded_input['input_ids'].to(device)
attention_mask = encoded_input['attention_mask'].to(device)

# Predict genres
model.eval()
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    top_2_class_ids = torch.argsort(logits, dim=1, descending=True)[0][:2].cpu().numpy()

top_2_genres = label_encoder.inverse_transform(top_2_class_ids)

# Filter movies
filtered_movies = movie_data[movie_data['high_Level_Genres'].isin(top_2_genres)]


# Compute TF-IDF similarity
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(filtered_movies['plot_summary'])
input_tfidf = tfidf_vectorizer.transform([input_text])
similarity_scores = cosine_similarity(input_tfidf, tfidf_matrix).flatten()
filtered_movies = filtered_movies.copy()
filtered_movies.loc[:, 'similarity'] = similarity_scores


# Get top 5 recommendations
top_5_movies = filtered_movies.sort_values(by='similarity', ascending=False).head(5)[['title', 'high_Level_Genres', 'similarity']]
print(top_2_genres)
print("\n")
print(top_5_movies)


['Science Fiction' 'Adventure']


                 title high_Level_Genres  similarity
2805         La Vallée         Adventure    0.188979
87969           Humans         Adventure    0.109379
82980  The Terminators   Science Fiction    0.105498
5867      Space Chimps         Adventure    0.100753
74684         The Cave   Science Fiction    0.097909


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Input text for recommendations
input_text = "A hacker discovers a simulated reality controlled by machines"

# Preprocess input and predict genres
encoded_input = tokenizer(input_text, truncation=True, padding='max_length', max_length=128, return_tensors='pt')
input_ids = encoded_input['input_ids'].to(device)
attention_mask = encoded_input['attention_mask'].to(device)

# Predict genres
model.eval()
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    top_2_class_ids = torch.argsort(logits, dim=1, descending=True)[0][:2].cpu().numpy()

top_2_genres = label_encoder.inverse_transform(top_2_class_ids)

# Filter movies
filtered_movies = movie_data[movie_data['high_Level_Genres'].isin(top_2_genres)]


# Compute TF-IDF similarity
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(filtered_movies['plot_summary'])
input_tfidf = tfidf_vectorizer.transform([input_text])
similarity_scores = cosine_similarity(input_tfidf, tfidf_matrix).flatten()
filtered_movies = filtered_movies.copy()
filtered_movies.loc[:, 'similarity'] = similarity_scores


# Get top 5 recommendations
top_5_movies = filtered_movies.sort_values(by='similarity', ascending=False).head(5)[['title', 'high_Level_Genres', 'similarity']]
print(top_2_genres)
print("\n")
print(top_5_movies)


['Science Fiction' 'Adventure']


                      title high_Level_Genres  similarity
105720  Younger and Younger   Science Fiction    0.173384
68161            The Matrix   Science Fiction    0.123298
68163            The Matrix         Adventure    0.123298
68162            The Matrix         Adventure    0.123298
86435         The Animatrix   Science Fiction    0.104622
