In [None]:
### What you have:
# 1. A dataset of movie reviews from imdb structured as
# "review text", "good"
# "review text", "bad"
# "review text", "bad",
# ...
#"review text", "good"
#
# the second column indicates of the review was positive or negative
# the data is stored in the file "movie_reviews.csv"
#
# 2. The GPT2 transformer model

### Task 1: Finetune the model to complete movie review text. Examples:
#           "The movie was fantastic because",
#           "I didn't like the film because",
#           "One of the best performances was"
#
# Use the template below
# you may need to take care of additional library import/installation depending on your system

### Task 2: The second column in the "movie_reviews.csv" file indicates if the review was positive or negative. Without adding any more layers to GPT2, finetune the model to tell you wheather a review is positive or negative.

### The jupyter notebook with the template and the dataset movie_reviews are included with this email.

### Note: you can always downsample the dataset if the training is taking an accessively long time

In [None]:
# may be needed if you are running this in a google colab
!pip install -U accelerate
!pip install -U transformers

In [9]:
# Import necessary libraries
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2ForSequenceClassification

                                     TASK - 1 - TEXT GENERATION

In [7]:
# Step 2: Load fine tuned GPT-2 Model and Tokenizer for generation
tokenizer_gen = GPT2Tokenizer.from_pretrained('/Users/aravindryali/Desktop/Studies/AITask/outputs_gen/fine_tuned_gpt2')
model_gen = GPT2LMHeadModel.from_pretrained('/Users/aravindryali/Desktop/Studies/AITask/outputs_gen/fine_tuned_gpt2')
tokenizer_gen.pad_token = tokenizer_gen.eos_token  # Set pad_token to eos_token
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_gen.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [26]:
# Step 4: Run inference (on a few examples):

# Task 1:
import torch, re

def clean_generated_text(text):
    # Remove HTML-like tags
    clean_text = re.sub(r'<.*?>', '', text)
    return clean_text

def generate_text(prompt, model, tokenizer, max_length=100, device='cpu'):
    # Ensure the model and inputs are on the same device
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)

    # Set attention mask
    attention_mask = torch.ones_like(input_ids).to(device)

    # Generate text using the model
    with torch.no_grad():
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,# Ensure padding is handled correctly
            no_repeat_ngram_size=2,               # Optional: prevent repeating n-grams
            repetition_penalty=2.0,               # Optional: avoid repetitive phrases
            top_k=50,                             # Optional: limit sampling to top-k tokens
            top_p=0.95,                           # Optional: nucleus sampling
            temperature=0.7,                      # Optional: control randomness
            do_sample=True                        # Optional: enable sampling
        )

    # Decode the generated tokens back into text
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    clean_text = clean_generated_text(generated_text)
    return clean_text

# Example prompts
prompts = [
    "The movie was fantastic because",
    "I didn't like the film because",
    "One of the best performances was"
]

for prompt in prompts:
    print(f"Prompt: {prompt}")
    print(f"Generated Text: {generate_text(prompt, model_gen, tokenizer_gen, device=device)}")
    print("\n")


Prompt: The movie was fantastic because
Generated Text: The movie was fantastic because of the acting. The story line is very realistic and believable, with some pretty funny moments such as when a woman (and her cat) try to escape from an abusive husband by calling him "Mr." He does not seem too evil or dangerous at times but doesn't appear much like his brother-in law in this film.There are also several interesting character relationships between these characters that I really enjoyed watching - especially John Belushi's girlfriend/friend


Prompt: I didn't like the film because
Generated Text: I didn't like the film because I was not expecting much. The only reason it's a good movie is that there are some scenes of nudity and violence which makes things even more interesting.The acting in this one really works for me, especially considering what they did with Michael Caine as his partner-in law to get him killed off (as he already knew how). There isn



Prompt: One of the best perf

                                    TASK - 2 - REVIEW CLASSIFICATION

In [10]:
# Load fine tuned GPT-2 Model and Tokenizer for review classification
tokenizer_cls = GPT2Tokenizer.from_pretrained('/Users/aravindryali/Desktop/Studies/AITask/cls_output/gpt2-movie-reviews-classifier')
model_cls = GPT2ForSequenceClassification.from_pretrained('/Users/aravindryali/Desktop/Studies/AITask/cls_output/gpt2-movie-reviews-classifier')
tokenizer_gen.pad_token = tokenizer_gen.eos_token
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_gen.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [12]:
# Predict review if its is good or bad
def predict_review(review, model_cls, tokenizer_cls):
    inputs = tokenizer_cls(review, return_tensors='pt', truncation=True, padding='max_length', max_length=512)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    
    with torch.no_grad():
        outputs = model_cls(**inputs)
        
    logits = outputs.logits
    predicted_class_id = torch.argmax(logits, dim=-1).item()
    
    if predicted_class_id == 0:
        return "Negative"
    else:
        return "Positive"
    
# Example reviews for predicting
reviews = [
    "The movie was fantastic and the screenplay is amazing",
    "I didn't enjoy the film. It was very boring",
    "One of the best experiences ever. worth watching."
]

# Predict and print the results
for review in reviews:
    prediction = predict_review(review, model_cls, tokenizer_cls)
    print(f"Review: {review}\nPrediction: {prediction}\n")

Review: The movie was fantastic and the screenplay is amazing
Prediction: Positive

Review: I didn't enjoy the film. It was very boring
Prediction: Positive

Review: One of the best experiences ever. worth watching.
Prediction: Positive

