In [5]:
from google.colab import drive

!pip install torch
import random
import json
import torch
from statistics import mean
from pathlib import Path

from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    Trainer,
    TrainingArguments
)
from sentence_transformers import SentenceTransformer
import joblib


drive.mount('/content/drive')
!ls /content/drive
!ls /content/drive/Shareddrives


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [6]:

data_path = Path("/content/drive/Shareddrives/ECS271/unique_normal_funny_dataset.jsonl")

# Load the data manually from the JSONL file
data = []
with open(data_path, "r", encoding="utf-8") as f:
    for line in f:
        data.append(json.loads(line))

# Convert the list of dictionaries into a Hugging Face Dataset
dataset = Dataset.from_list(data)

# Print the dataset structure to verify
print(dataset)

Dataset({
    features: ['input', 'target'],
    num_rows: 5000
})


In [7]:
#filtering out the unfunny instances
import joblib
from sentence_transformers import SentenceTransformer

nn_model = joblib.load("/content/drive/Shareddrives/ECS271/NeuralNet_model.pkl")
lr_model = joblib.load("/content/drive/Shareddrives/ECS271/LogisticRegression_model.pkl")


bert_model = SentenceTransformer("/content/drive/Shareddrives/ECS271/bert_joke_model")


filtered_data = []

for entry in data:
    funny_sentence = entry["target"]
    features = bert_model.encode([funny_sentence])  # List of 1

    nn_pred = nn_model.predict(features)[0]
    lr_pred = lr_model.predict(features)[0]

    # Keep only if both predict it’s funny
    if nn_pred == 1 and lr_pred == 1:
        filtered_data.append(entry)

output_path = "/content/drive/Shareddrives/ECS271/filtered_funny_dataset.jsonl"
with open(output_path, "w", encoding="utf-8") as f:
    for item in filtered_data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [8]:
from datasets import load_dataset, Dataset
from pathlib import Path
import json

data_path = Path("/content/drive/Shareddrives/ECS271/filtered_funny_dataset.jsonl") #using FILTERED DATA SET NOW TO TRAIN

# Load the data manually from the JSONL file
data = []
with open(data_path, "r", encoding="utf-8") as f:
    for line in f:
        data.append(json.loads(line))

# Convert the list of dictionaries into a Hugging Face Dataset
dataset = Dataset.from_list(data)

# Print the dataset structure to verify
print(dataset)

Dataset({
    features: ['input', 'target'],
    num_rows: 4359
})


In [9]:
#preprocessing filtered unfunny - funny dataset before training t5 model
from transformers import T5Tokenizer
from datasets import DatasetDict # Import DatasetDict

tokenizer = T5Tokenizer.from_pretrained('t5-large')

def preprocess(example):
    input_enc = tokenizer(example['input'], truncation=True, padding='max_length', max_length=64)
    target_enc = tokenizer(example['target'], truncation=True, padding='max_length', max_length=64)
    return {
        'input_ids': input_enc.input_ids,
        'attention_mask': input_enc.attention_mask,
        'labels': target_enc.input_ids,
    }

# Apply the preprocess function to the dataset
tokenized_dataset = dataset.map(preprocess, batched=False)

# Split the tokenized dataset into training and validation sets
# Use train_test_split to create a DatasetDict
# rename_columns is used here to name the splits 'train' and 'validation'
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42, shuffle=True)

# Print the structure of the split dataset to verify
print(tokenized_dataset)

Map:   0%|          | 0/4359 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input', 'target', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3487
    })
    test: Dataset({
        features: ['input', 'target', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 872
    })
})


In [10]:
#training specifications
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments
# Ensure DatasetDict is imported if not already
from datasets import DatasetDict # This import might already be present

model = T5ForConditionalGeneration.from_pretrained('t5-large')

training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='steps', # Now we have an eval dataset
    eval_steps=500,
    save_steps=1000,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    save_total_limit=2,
    report_to="none" # Add this line to disable all integrations, including wandb
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],  # Use the 'train' split
    eval_dataset=tokenized_dataset['test'],  # Use the 'test' split (default name for the split)
    tokenizer=tokenizer,
)

model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  trainer = Trainer(


In [2]:
#training t5-model
trainer.train()
model_dir = "/content/drive/Shareddrives/ECS271/large-funny-t5-model"

model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

NameError: name 'trainer' is not defined

In [12]:
model_dir = "/content/drive/Shareddrives/ECS271/large-funny-t5-model"

In [27]:
#seeing how our model performs
from transformers import T5ForConditionalGeneration, T5Tokenizer


model = T5ForConditionalGeneration.from_pretrained(model_dir)
tokenizer = T5Tokenizer.from_pretrained(model_dir)
model.eval()

sentences = [
    "Make this sentence funny: I spilled coffee all over my shirt right before a meeting.",
    "Make this sentence funny: My cat ignores me unless I have food.",
    "Make this sentence funny: I tried cooking dinner and nearly set off the fire alarm.",
    "Make this sentence funny: I forgot my keys inside the house and had to wait outside.",
    "Make this sentence funny: The Wi-Fi went out just when I needed it the most."
]

for sentence in sentences:
    inputs = tokenizer(sentence, return_tensors="pt")

    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=60,
        do_sample=True,
        temperature=0.8,
        top_p=0.9,
        top_k=50,
        num_return_sequences=1
    )

    funnier_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Input: {sentence}")
    print(f"Funnier: {funnier_sentence}\n")


Input: Make this sentence funny: I spilled coffee all over my shirt right before a meeting.
Funnier: I spilled coffee all over my shirt right before a meeting, because apparently, my mind is too focused on a meeting to actually do anything!

Input: Make this sentence funny: My cat ignores me unless I have food.
Funnier: My cat ignores me unless I have food, because even cats don't like having to fight over who gets to eat the first bite!

Input: Make this sentence funny: I tried cooking dinner and nearly set off the fire alarm.
Funnier: I tried cooking dinner and nearly set off the fire alarm, because even my dinner had a tendency to set off alarms too.

Input: Make this sentence funny: I forgot my keys inside the house and had to wait outside.
Funnier: I forgot my keys inside the house and had to wait outside because they kept squeezing my pockets too hard!

Input: Make this sentence funny: The Wi-Fi went out just when I needed it the most.
Funnier: The Wi-Fi went out just when I need

In [30]:
sentences = [
    "Make this sentence funny: I'm going to be late to the meeting because my car broke down.",
]
two_hundred_outputs = []

In [33]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for i in range(200):
    for sentence in sentences:
        inputs = tokenizer(sentence, return_tensors="pt").to(device)  # Move inputs to GPU

        outputs = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_length=60,
            do_sample=True,
            temperature=0.8,
            top_p=0.9,
            top_k=50,
            num_return_sequences=1
        )

        funnier_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"Input: {sentence}")
        print(f"Funnier: {funnier_sentence}\n")
        two_hundred_outputs.append(funnier_sentence)

Input: Make this sentence funny: I'm going to be late to the meeting because my car broke down.
Funnier: I'm going to be late to the meeting because my car broke down – and I'm not just a zombie, I'm a "Pink Lady."

Input: Make this sentence funny: I'm going to be late to the meeting because my car broke down.
Funnier: I'm going to be late to the meeting because my car broke down. Guess it's my car, not my schedule!

Input: Make this sentence funny: I'm going to be late to the meeting because my car broke down.
Funnier: I'm going to be late to the meeting because my car broke down.

Input: Make this sentence funny: I'm going to be late to the meeting because my car broke down.
Funnier: I'm going to be late to the meeting because my car broke down and I just forgot to bring my broom!

Input: Make this sentence funny: I'm going to be late to the meeting because my car broke down.
Funnier: I'm going to be late to the meeting because my car broke down. I'm taking my work with me!

Input: M

In [34]:
print(two_hundred_outputs)

["I'm going to be late to the meeting because my car broke down. Luckily, my mate was still at home and he knew I had a better excuse for being late!", "I'm going to be late to the meeting because my car broke down, and I just realized I'm not supposed to show up at all.", "I'm going to be late to the meeting because my car broke down. It's probably going to be the best excuse ever for my car to start a fire without me!", 'I\'m going to be late to the meeting because my car broke down – and I\'m not just a zombie, I\'m a "Pink Lady."', "I'm going to be late to the meeting because my car broke down. Guess it's my car, not my schedule!", "I'm going to be late to the meeting because my car broke down.", "I'm going to be late to the meeting because my car broke down and I just forgot to bring my broom!", "I'm going to be late to the meeting because my car broke down. I'm taking my work with me!", "I'm going to be late to the meeting because my car broke down. Turns out, my car decided to g

In [None]:
import joblib
from sentence_transformers import SentenceTransformer
from transformers import T5ForConditionalGeneration, T5Tokenizer
from datasets import load_dataset
from pathlib import Path
import json


# Load dataset
data_path = Path("/content/drive/Shareddrives/ECS271/unique_normal_funny_dataset.jsonl")

# Load the data manually from the JSONL file
data = []
with open(data_path, "r", encoding="utf-8") as f:
    for line in f:
        data.append(json.loads(line))

# Convert the list of dictionaries into a Hugging Face Dataset
dataset = Dataset.from_list(data)

inputs = [item['input'] for item in dataset]

# Load models
nn_model = joblib.load("/content/drive/Shareddrives/ECS271/NeuralNet_model.pkl")
lr_model = joblib.load("/content/drive/Shareddrives/ECS271/LogisticRegression_model.pkl")
bert_model = SentenceTransformer("/content/drive/Shareddrives/ECS271/bert_joke_model")

t5_model = T5ForConditionalGeneration.from_pretrained(model_dir)
tokenizer = T5Tokenizer.from_pretrained(model_dir)
t5_model.eval()

# Function to generate funnier sentence
def generate_funny_sentence(sentence):
    inputs = tokenizer(sentence, return_tensors="pt")
    outputs = t5_model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=60,
        do_sample=True,
        temperature=0.8,
        top_p=0.9,
        top_k=50,
        num_return_sequences=1
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

filtered_data = []

for entry in dataset:
    original_input = entry["input"]

    # Generate funnier sentence from T5
    funnier_sentence = generate_funny_sentence(original_input)

    # Get embedding for the generated sentence
    features = bert_model.encode([funnier_sentence])
    print(f"finished instance {entry}")
    # Predict funniness with both models
    nn_pred = nn_model.predict(features)[0]
    lr_pred = lr_model.predict(features)[0]

    # Keep only if both models agree it’s funny
    if nn_pred == 1 and lr_pred == 1:
        filtered_data.append({
            "input": original_input,
            "funnier_sentence": funnier_sentence,
            "nn_pred": nn_pred,
            "lr_pred": lr_pred
        })

print(f"Kept {len(filtered_data)} funny sentences out of {len(dataset)}")


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
finished instance {'input': 'Make this sentence funny: The box was so tightly packed that it was difficult to remove the items without tearing the packaging.', 'target': 'The box was so tightly packed that trying to remove the items without tearing the packaging felt like playing Operation with a grizzly bear as your assistant.'}
finished instance {'input': 'Make this sentence funny: Colorful receipts make it easier to organize my expenses by category.', 'target': 'Colorful receipts make it easier to organize my expenses by category, which is great because my spending at clown school is through the roof!'}
finished instance {'input': 'Make this sentence funny: I just put the new sheets on the bed.', 'target': 'I just put the new sheets on the bed, because apparently, the bed can’t make itself... Lazy thing!'}
finished instance {'input': 'Make this sentence funny: I organize all my bills and receipts neatly in a dedicated 

In [14]:

# Load model and tokenizer
t5_model = T5ForConditionalGeneration.from_pretrained(model_dir)
tokenizer = T5Tokenizer.from_pretrained(model_dir)
t5_model.eval()

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
t5_model.to(device)

# Your generation function
def generate_funny_sentence(sentence):
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True).to(device)
    outputs = t5_model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=60,
        do_sample=True,
        temperature=0.8,
        top_p=0.9,
        top_k=50,
        num_return_sequences=1
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Perplexity function
def calculate_perplexity(sentence):
    encodings = tokenizer(sentence, return_tensors="pt", truncation=True).to(device)
    with torch.no_grad():
        outputs = t5_model(**encodings, labels=encodings.input_ids)
        loss = outputs.loss
        return torch.exp(loss).item()

# Use first 300 entries for reproducibility
sample_data = [dataset[i] for i in range(300)]
perplexities = []

for i, entry in enumerate(sample_data):
    original = entry["input"]
    funnier_sentence = generate_funny_sentence(original)

    orig_ppl = calculate_perplexity(original)
    gen_ppl = calculate_perplexity(funnier_sentence)

    perplexities.append({
        "original": original,
        "generated": funnier_sentence,
        "original_ppl": orig_ppl,
        "generated_ppl": gen_ppl,
        "delta": gen_ppl - orig_ppl
    })

    if i % 25 == 0:
        print(f"[{i}/300] Original PPL: {orig_ppl:.2f}, Generated PPL: {gen_ppl:.2f}")

# Averages
avg_orig_ppl = mean([x["original_ppl"] for x in perplexities])
avg_gen_ppl = mean([x["generated_ppl"] for x in perplexities])
avg_delta = mean([x["delta"] for x in perplexities])

print("\n--- Perplexity Evaluation ---")
print(f"Average Original Perplexity:  {avg_orig_ppl:.2f}")
print(f"Average Generated Perplexity: {avg_gen_ppl:.2f}")
print(f"Average Delta (Gen - Orig):   {avg_delta:.2f}")



ValueError: could not determine the shape of object type 'torch.storage.UntypedStorage'