In [2]:
import pandas as pd

In [3]:
import torch
print("GPU Available:", torch.cuda.is_available())
print("GPU Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")

GPU Available: True
GPU Name: Tesla T4


In [5]:
# Load the dataset
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
file_path = "/content/drive/My Drive/DSC 514 NLP/movie_spoiler_sample.csv"
df = pd.read_csv(file_path)
df.head()

Mounted at /content/drive


Unnamed: 0,movie_id,plot_summary,duration,genre,rating_x,release_date,plot_synopsis,review_date,user_id,is_spoiler,review_text,rating_y,review_summary
0,tt0318649,Master explorer and former US Navy Seal Dirk P...,2h 4min,"['Action', 'Adventure', 'Comedy']",6.0,2005-04-08,The film begins with a prologue set in Richmon...,27 May 2005,ur3270789,False,The film starts in the Richmond battle (1865) ...,7,Splendid adventure film with mesmerizing deser...
1,tt0803096,When the world of the Orcs of Draenor is being...,2h 3min,"['Action', 'Adventure', 'Fantasy']",6.9,2016-06-10,"For ages in the region of Middle Earth, humans...",20 June 2016,ur47788388,True,I saw the movie with six friends and we all lo...,10,Epic movie for fans and non fans
2,tt0425112,"Top London cop, PC Nicholas Angel is good. Too...",2h 1min,"['Action', 'Comedy']",7.9,2007-04-20,Nicholas Angel (Simon Pegg) is undoubtedly Lon...,11 June 2007,ur14440242,True,"I enjoyed this movie very much, but it being l...",8,once again a funny British film
3,tt0327056,In the summer of 1975 in a neighborhood in Bos...,2h 18min,"['Crime', 'Drama', 'Mystery']",8.0,2003-10-15,"Three young boys, all the best of friends, are...",5 November 2015,ur63623011,False,What you get here is no more than Clint eastwo...,1,For he made a 'plot twist' out of it
4,tt0259711,"Incarcerated and charged with murder, David Aa...",2h 16min,"['Fantasy', 'Mystery', 'Romance']",6.9,2001-12-14,"David Aames (Tom Cruise) drives to work, he fi...",12 August 2016,ur2781970,False,"I think the first time I saw this movie, I did...",10,As brilliant as I recalled it!


In [6]:
# Select relevant columns
df_selected = df[["review_text", "is_spoiler"]]


df_selected.loc[:, "is_spoiler"] = df_selected["is_spoiler"].astype(int)


# Check the cleaned dataset
print(df_selected.head())

                                         review_text  is_spoiler
0  The film starts in the Richmond battle (1865) ...           0
1  I saw the movie with six friends and we all lo...           1
2  I enjoyed this movie very much, but it being l...           1
3  What you get here is no more than Clint eastwo...           0
4  I think the first time I saw this movie, I did...           0


  df_selected.loc[:, "is_spoiler"] = df_selected["is_spoiler"].astype(int)


In [7]:
from transformers import RobertaTokenizer

# Load RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Tokenize all reviews
def tokenize_text(text):
    return tokenizer(text, padding="max_length", truncation=True, max_length=512, return_tensors="pt")

# Apply tokenization
tokenized_data = df_selected["review_text"].apply(tokenize_text)

# Extract input tensors
input_ids = torch.cat([x["input_ids"] for x in tokenized_data], dim=0)
attention_masks = torch.cat([x["attention_mask"] for x in tokenized_data], dim=0)

# Convert labels to tensor (ensure they are integers, not booleans)
labels = torch.tensor(df_selected["is_spoiler"].astype(int).tolist(), dtype=torch.long)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [8]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split

# Split into training and validation sets (80% train, 20% validation)
train_inputs, val_inputs, train_labels, val_labels = train_test_split(input_ids, labels, test_size=0.2, random_state=42)
train_masks, val_masks, _, _ = train_test_split(attention_masks, labels, test_size=0.2, random_state=42)

# Set batch size
batch_size = 16  # Adjust based on GPU memory

# Create PyTorch DataLoader
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

In [9]:
from transformers import RobertaForSequenceClassification

# Load pre-trained RoBERTa with 2 output classes (spoiler or not)
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [10]:
from transformers import AdamW

# Define optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Define loss function
loss_fn = torch.nn.CrossEntropyLoss()

# Learning rate scheduler
from transformers import get_scheduler

num_training_steps = len(train_dataloader) * 3  # 3 epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)




In [None]:
def train(model, train_dataloader, optimizer, loss_fn, device):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        input_ids, attention_mask, labels = [x.to(device) for x in batch]  # Move batch to GPU

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, labels)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()

    return total_loss / len(train_dataloader)

# Run training
epochs = 3
for epoch in range(epochs):
    avg_loss = train(model, train_dataloader, optimizer, loss_fn, device)
    print(f"Epoch {epoch+1}, Loss: {avg_loss}")

Epoch 1, Loss: 0.560576845929254


In [None]:
from sklearn.metrics import accuracy_score

def evaluate(model, val_dataloader, device):
    model.eval()
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = [x.to(device) for x in batch]

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            # Convert logits to predicted labels
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            labels = labels.cpu().numpy()

            predictions.extend(preds)
            true_labels.extend(labels)

    return accuracy_score(true_labels, predictions)

# Run evaluation
accuracy = evaluate(model, val_dataloader, device)
print(f"Validation Accuracy: {accuracy:.2f}")

In [None]:
def predict_spoiler(review):
    model.eval()
    tokens = tokenizer(review, padding="max_length", truncation=True, max_length=512, return_tensors="pt").to(device)

    with torch.no_grad():
        logits = model(**tokens).logits

    prediction = torch.argmax(logits, dim=1).item()
    return "Spoiler" if prediction == 1 else "Not a Spoiler"

# Test with sample reviews
print(predict_spoiler("Bruce Willis was dead the whole time!"))
print(predict_spoiler("This movie had amazing cinematography!"))
