---
## 1. ENVIRONMENT SETUP & LIBRARY IMPORTS

In [1]:
print("▶ Step 1/7: Updating libraries...")
!pip install --upgrade -q transformers datasets accelerate scikit-learn

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from datasets import Dataset, DatasetDict
import torch
import transformers
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    TrainerCallback
)

print(f"✔ Transformers version: {transformers.__version__}")
if not torch.cuda.is_available(): print("WARNING: GPU is not active")
else: print(f"✔ GPU found: {torch.cuda.get_device_name(0)}")
print("-" * 50)

▶ Step 1/7: Updating libraries...


2025-08-27 20:34:31.809842: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756326871.838327     417 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756326871.846714     417 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


✔ Transformers version: 4.56.0.dev0
✔ GPU found: Tesla T4
--------------------------------------------------


---
## 2. DATA LOADING AND CLEANING

In [2]:

print("▶ Step 2/7: Loading IMDb 50K dataset...")
# !!! IMPORTANT: CHANGE THIS FILE PATH TO MATCH YOUR DATASET'S PATH !!!
file_path = '/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv' 
df = pd.read_csv(file_path)

print("✔ Dataset loaded successfully.")
print("-" * 50)

▶ Step 2/7: Loading IMDb 50K dataset...
✔ Dataset loaded successfully.
--------------------------------------------------


---
## 3. DATA PREPARATION FOR THE MODEL

In [3]:
print("▶ Step 3/7: Preparing data for model training...")

# Converting the 'sentiment' column to a numeric 'label' (positive=1, negative=0)
df['label'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

# Standardizing the column names
df = df[['review', 'label']]
df.rename(columns={'review': 'text'}, inplace=True)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])
dataset_dict = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'test': Dataset.from_pandas(test_df)
})
print("✔ Training and test sets created.")
print(dataset_dict)
print("-" * 50)

▶ Step 3/7: Preparing data for model training...
✔ Training and test sets created.
DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 40000
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 10000
    })
})
--------------------------------------------------


---
## 4. TOKENIZATION

In [4]:
print("▶ Step 4/7: Tokenizing the texts...")
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)
print("✔ Tokenization complete.")
print("-" * 50)

▶ Step 4/7: Tokenizing the texts...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

✔ Tokenization complete.
--------------------------------------------------


---
## 5. PREPARING THE BASE MODEL, METRICS, AND CALLBACK

In [5]:
print("▶ Step 5/7: Preparing the base model, metrics function, and live monitoring tool...")
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predictions)}

class CustomProgressCallback(TrainerCallback):
    """
    A more robust Callback that uses logging events to show progress.
    """
    def on_log(self, args, state, control, logs=None, **kwargs):
        """
        This function is called automatically whenever the Trainer creates a log.
        This is the most reliable place to get the 'loss' value.
        """
        if state.is_local_process_zero and logs is not None:
            if 'loss' in logs:
                print(f"Step: {state.global_step}/{state.max_steps} | Loss: {logs['loss']:.4f}")

    def on_epoch_end(self, args, state, control, **kwargs):
        """Called at the end of each epoch."""
        if state.is_local_process_zero:
            print(f"\n--- EPOCH {int(state.epoch)} COMPLETED ---\n")
print("✔ Model, metrics, and monitoring tool are ready.")
print("-" * 50)

▶ Step 5/7: Preparing the base model, metrics function, and live monitoring tool...


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✔ Model, metrics, and monitoring tool are ready.
--------------------------------------------------


---
## 6. DEFINING TRAINING ARGUMENTS

In [7]:
print("▶ Step 6/7: Defining training settings (hyperparameters)...")
training_args = TrainingArguments(
    output_dir="./imdb_critic_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=16, # You can try 32 with a T4 x2, but 16 is safer
    learning_rate=3e-5,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    disable_tqdm=True,
    report_to="none",
)
print("✔ Training settings are complete.")
print("-" * 50)

▶ Step 6/7: Defining training settings (hyperparameters)...
✔ Training settings are complete.
--------------------------------------------------


---
## 7. TRAINING THE MODEL

In [8]:
print("▶▶▶ Step 7/7: STARTING THE FINE-TUNING PROCESS! ◀◀◀")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[CustomProgressCallback()],
)
trainer.train()

print("\n🎉 TRAINING COMPLETED SUCCESSFULLY! 🎉")
final_model_path = "./imdb_critic_model_best"
trainer.save_model(final_model_path)
print(f"The best model was saved to '{final_model_path}'")

▶▶▶ Step 7/7: STARTING THE FINE-TUNING PROCESS! ◀◀◀


  trainer = Trainer(


Step: 500/3750 | Loss: 0.2853
{'loss': 0.2853, 'grad_norm': 444893.6875, 'learning_rate': 2.6008e-05, 'epoch': 0.4}
Step: 1000/3750 | Loss: 0.2144
{'loss': 0.2144, 'grad_norm': 379865.21875, 'learning_rate': 2.2008e-05, 'epoch': 0.8}

--- EPOCH 1 COMPLETED ---

{'eval_loss': 0.1719169169664383, 'eval_accuracy': 0.9369, 'eval_runtime': 101.0376, 'eval_samples_per_second': 98.973, 'eval_steps_per_second': 6.186, 'epoch': 1.0}




Step: 1500/3750 | Loss: 0.1595
{'loss': 0.1595, 'grad_norm': 509357.125, 'learning_rate': 1.8007999999999998e-05, 'epoch': 1.2}
Step: 2000/3750 | Loss: 0.1279
{'loss': 0.1279, 'grad_norm': 422271.9375, 'learning_rate': 1.4008e-05, 'epoch': 1.6}
Step: 2500/3750 | Loss: 0.1302
{'loss': 0.1302, 'grad_norm': 224262.6875, 'learning_rate': 1.0008e-05, 'epoch': 2.0}

--- EPOCH 2 COMPLETED ---

{'eval_loss': 0.21813704073429108, 'eval_accuracy': 0.9288, 'eval_runtime': 101.1402, 'eval_samples_per_second': 98.873, 'eval_steps_per_second': 6.18, 'epoch': 2.0}




Step: 3000/3750 | Loss: 0.0939
{'loss': 0.0939, 'grad_norm': 2452.546630859375, 'learning_rate': 6.008000000000001e-06, 'epoch': 2.4}
Step: 3500/3750 | Loss: 0.1613
{'loss': 0.1613, 'grad_norm': 207802.09375, 'learning_rate': 2.008e-06, 'epoch': 2.8}

--- EPOCH 3 COMPLETED ---

{'eval_loss': 0.6120579838752747, 'eval_accuracy': 0.9416, 'eval_runtime': 101.0598, 'eval_samples_per_second': 98.951, 'eval_steps_per_second': 6.184, 'epoch': 3.0}
{'train_runtime': 3813.9987, 'train_samples_per_second': 31.463, 'train_steps_per_second': 0.983, 'train_loss': 0.16715535176595053, 'epoch': 3.0}

🎉 TRAINING COMPLETED SUCCESSFULLY! 🎉
The best model was saved to './imdb_critic_model_best'


---
## 8. TESTING THE TRAINED MODEL

In [9]:
from transformers import pipeline

final_model_path = "./imdb_critic_model_best" 

# The device=0 parameter ensures this runs on the GPU.
imdb_critic_pipeline = pipeline("sentiment-analysis", model=final_model_path, device=0)

# Custom film reviews for testing
test_reviews = [
    "The plot was predictable and the characters were completely uninteresting.", # Clearly Negative
    "I was on the edge of my seat the entire time! What a masterpiece of cinema.", # Clearly Positive
    "It was an okay movie, not great but not terrible either.", # Neutral/Ambiguous
    "Visually stunning, but the story was surprisingly weak." # Mixed Sentiment
]

print("--- Predictions of the Expert Model on Custom Test Reviews ---")
for review, prediction in zip(test_reviews, imdb_critic_pipeline(test_reviews)):
    # The model's labels are LABEL_1 (positive) and LABEL_0 (negative)
    label = "Positive" if prediction['label'] == 'LABEL_1' else "Negative"
    score = prediction['score']
    print(f"\nReview: '{review}'\nPrediction: {label} (Score: {score:.4f})")

Device set to use cuda:0


--- Predictions of the Expert Model on Custom Test Reviews ---

Review: 'The plot was predictable and the characters were completely uninteresting.'
Prediction: Negative (Score: 1.0000)

Review: 'I was on the edge of my seat the entire time! What a masterpiece of cinema.'
Prediction: Positive (Score: 1.0000)

Review: 'It was an okay movie, not great but not terrible either.'
Prediction: Negative (Score: 0.9935)

Review: 'Visually stunning, but the story was surprisingly weak.'
Prediction: Negative (Score: 1.0000)


---
## 9. SYNTHESIZING FINDINGS FOR THE LLM

In [10]:
# Making Predictions on the Full Dataset with the Expert Model
print("▶ Analyzing all 50,000 IMDb reviews using the expert model...")

all_reviews_text = df['text'].tolist()

# FIX: Adding truncation=True to the pipeline call to prevent errors with reviews longer than 512 tokens.
all_predictions = imdb_critic_pipeline(all_reviews_text, truncation=True)

# Add the predictions and scores to the original DataFrame
df['sentiment_label'] = [pred['label'] for pred in all_predictions]
df['sentiment_score'] = [pred['score'] for pred in all_predictions]

# Convert LABEL_1 to Positive, LABEL_0 to Negative
df['sentiment_label'] = df['sentiment_label'].apply(lambda x: "Positive" if x == 'LABEL_1' else "Negative")
print("✔ All reviews have been labeled.")
print("-" * 50)


# Summarizing the Findings as Text
print("▶ Creating summary report for the LLM...")

# Calculate the overall sentiment distribution
sentiment_dist = df['sentiment_label'].value_counts(normalize=True) * 100
positive_percentage = sentiment_dist.get('Positive', 0)
negative_percentage = sentiment_dist.get('Negative', 0)

# Find the most confident positive and negative reviews from our model
top_positive_review = df[df['sentiment_label'] == 'Positive'].nlargest(1, 'sentiment_score').iloc[0]
top_negative_review = df[df['sentiment_label'] == 'Negative'].nlargest(1, 'sentiment_score').iloc[0]

# Combine everything into a single text block
summary_for_llm = f"""
Here is a data-driven analysis of 50,000 IMDb movie reviews:

**Overall Sentiment Distribution:**
- Approximately {positive_percentage:.1f}% of the reviews were classified as positive by our expert model.
- Approximately {negative_percentage:.1f}% of the reviews were classified as negative by our expert model.

**An Excerpt from the Most Confident Positive Review:**
- Review: "{top_positive_review['text'][:400]}..." 
  (Confidence score for being Positive: {top_positive_review['sentiment_score']:.4f})

**An Excerpt from the Most Confident Negative Review:**
- Review: "{top_negative_review['text'][:400]}..."
  (Confidence score for being Negative: {top_negative_review['sentiment_score']:.4f})
"""

print("✔ Summary report is ready!")
print(summary_for_llm)

▶ Analyzing all 50,000 IMDb reviews using the expert model...
✔ All reviews have been labeled.
--------------------------------------------------
▶ Creating summary report for the LLM...
✔ Summary report is ready!

Here is a data-driven analysis of 50,000 IMDb movie reviews:

**Overall Sentiment Distribution:**
- Approximately 50.3% of the reviews were classified as positive by our expert model.
- Approximately 49.7% of the reviews were classified as negative by our expert model.

**An Excerpt from the Most Confident Positive Review:**
- Review: "A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the referen..." 
  (Confidence score for being Positive: 1

---
## 10. LOADING THE LARGE LANGUAGE MODEL (LLAMA 3)

In [11]:
# Install required libraries from the latest source to ensure compatibility
!pip install -q git+https://github.com/huggingface/accelerate.git
!pip install -q git+https://github.com/huggingface/transformers.git
!pip install -q bitsandbytes

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# Get the Hugging Face token from Kaggle Secrets
from kaggle_secrets import UserSecretsClient
try:
    user_secrets = UserSecretsClient()
    hf_token = user_secrets.get_secret("HUGGING_FACE_HUB_TOKEN")
except:
    print("WARNING: 'HUGGING_FACE_HUB_TOKEN' not found in Kaggle Secrets. The model download may fail.")
    hf_token = None

# Model ID
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# 4-bit Quantization Settings (to fit the model into GPU memory)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

print(f"▶ Loading the {model_id} model with 4-bit quantization...")
print("This process may take a few minutes depending on the model size.")

# Load the Model and Tokenizer
tokenizer_llm = AutoTokenizer.from_pretrained(model_id, token=hf_token)
model_llm = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=hf_token,
    quantization_config=quantization_config,
    device_map="auto", # Automatically distributes the model across available GPUs
)

print("\n✔ LLM model loaded successfully!")

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
▶ Loading the meta-llama/Meta-Llama-3-8B-Instruct model with 4-bit quantization...
This process may take a few minutes depending on the model size.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]


✔ LLM model loaded successfully!


---
## 11. GENERATING THE FINAL CRITIQUE

In [12]:
print("▶ Preparing the prompt for the Master Writer (Llama 3)...")

# Prepare the special prompt format for Llama 3
messages = [
    {
        "role": "system",
        "content": "You are a world-class movie critic known for your deep and insightful analysis. Your task is to write a comprehensive, multi-paragraph critique based ONLY on the data-driven summary of 50,000 IMDb movie reviews provided by your research analyst.",
    },
    {
        "role": "user", 
        "content": summary_for_llm # <-- The summary we created earlier!
    },
]

# Convert the prompt into the format the model understands
prompt = tokenizer_llm.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer_llm(prompt, return_tensors="pt").to("cuda")

print("\n▶▶▶ The Master Writer (Llama 3) is generating the critique...")
print("This process may take a few minutes.")

# Generate text by running the model
outputs = model_llm.generate(
    **inputs,
    max_new_tokens=1024, # Maximum number of new words/tokens to generate
    do_sample=True,      # Use sampling to increase creativity
    temperature=0.7,     # Lower temperature for more focused and less random text (0.1 - 1.0)
    top_p=0.9,           # Select from the most probable words
)

# Decode the generated text (clean it up)
response_text = tokenizer_llm.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)

print("\n--- FINAL REPORT FROM THE AI CRITIC ---")
print(response_text)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


▶ Preparing the prompt for the Master Writer (Llama 3)...

▶▶▶ The Master Writer (Llama 3) is generating the critique...
This process may take a few minutes.

--- FINAL REPORT FROM THE AI CRITIC ---
Title: A Tale of Two Cinemas: A Critique of the Duality of Movie Reviews

In the vast expanse of the cinematic universe, it's rare to find a film that elicits a universal response from its audience. The 50,000 IMDb reviews analyzed by our research analyst reveal a fascinating dichotomy: half of the reviews are positive, while the other half are negative. This stark contrast begs the question: what drives this divide? Is it a reflection of the filmmakers' artistic vision, the audience's subjective experience, or something more nuanced?

Upon closer examination, the data suggests that the most confident positive reviews, exemplified by the excerpt provided, praise the film's technical aspects, such as cinematography and editing. The use of "old-time-BBC fashion" is described as "comforting" a

---
## 12. THE "CRITIC AGENT" FUNCTION

In [13]:
def ask_critic_agent(question, summary_text):
    """
    Sends the user's question and the summary text to Llama 3
    and gets a response from the "critic agent" persona.
    """
    
    # Ensure the summary text is not empty
    if not summary_text:
        print("ERROR: 'summary_text' is empty. Please ensure the summary was generated correctly.")
        return

    # Prepare the prompt format for Llama 3
    messages = [
        {
            "role": "system",
            "content": "You are a world-class movie critic. Your task is to answer questions and provide analysis based ONLY on the data-driven summary of 50,000 IMDb movie reviews that will be provided. Do not use any external knowledge.",
        },
        {
            "role": "user", 
            # We combine the summary data with the new question here
            "content": f"Here is the data summary:\n{summary_text}\n\nNow, based ONLY on that summary, answer the following question: {question}"
        },
    ]

    prompt = tokenizer_llm.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer_llm(prompt, return_tensors="pt").to("cuda")

    print(f"▶ The Critic Agent is thinking about the question: '{question[:50]}...'")
    outputs = model_llm.generate(
        **inputs,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )
    
    # Decode the generated text
    response_text = tokenizer_llm.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
    
    print("\n--- The Critic Agent's Answer ---")
    print(response_text)

# --- EXAMPLE OF HOW TO USE THE FUNCTION ---
# This assumes 'summary_for_llm' was created in a previous cell.

# my_question = "What is the main theme of the most

---
## 13. INTERACTING WITH THE CRITIC AGENT

In [14]:
question = "Based on the overall 50/50 sentiment, would you describe the audience's reaction as 'controversial'?"
ask_critic_agent(question,summary_for_llm)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


▶ The Critic Agent is thinking about the question: 'Based on the overall 50/50 sentiment, would you de...'

--- The Critic Agent's Answer ---
Based solely on the data summary, I would not describe the audience's reaction as "controversial". The data shows that the sentiment is evenly split, with 50.3% of reviews being positive and 49.7% being negative. This suggests that the audience is generally divided in their opinions, with no clear majority leaning towards either side. In the absence of any significant skew towards one extreme, I would not characterize the reaction as "controversial", which typically implies a strong and divisive opinion.
