# Technical Assignment Week 3

# Aygün Varol

## aygun.varol@tuni.fi

## Step 1: Download the IMDB Dataset (1 point) 

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


In [3]:
# Import the Pandas library
import pandas as pd

# Define the path to the dataset
data_path = '/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv'

# Load the dataset into a DataFrame
df = pd.read_csv(data_path)

# Verify the dataset by displaying the first few rows and some info
print("First 5 rows of the dataset:")
print(df.head())

print("\nDataset Information:")
print(df.info())

print("\nDataset Shape:")
print(df.shape)

First 5 rows of the dataset:
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB
None

Dataset Shape:
(50000, 2)


## Step 2: Data Preprocessing (1 point)

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Assuming the dataset has been loaded into a DataFrame `df` as shown in Step 1.
# For example:
# df = pd.read_csv('/kaggle/input/imdb-dataset/IMDB Dataset.csv')

# 1. Clean and preprocess the dataset:

# a. Encode the sentiment column: positive -> 1, negative -> 0
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# b. Retain only the review and sentiment columns.
#    Optionally, rename the sentiment column to "label" for clarity.
df = df[['review', 'sentiment']]
df.rename(columns={'sentiment': 'label'}, inplace=True)

# Display the first few rows to verify the changes
print("Preprocessed Data:")
print(df.head())

# 2. Split the data into training, validation, and testing sets.

# First, split the data into training (80%) and testing (20%) sets.
# Using stratify ensures that the distribution of labels is maintained in both splits.
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

# Next, split the training data into training and validation sets.
# Here, we take 12.5% of the training data as validation which is equivalent to 10% of the total dataset 
# (0.125 * 80% = 10%).
train_data, val_data = train_test_split(train_data, test_size=0.125, random_state=42, stratify=train_data['label'])

# Verify the splits by printing out the shapes
print("\nData Split Shapes:")
print(f"Training set shape: {train_data.shape}")
print(f"Validation set shape: {val_data.shape}")
print(f"Test set shape: {test_data.shape}")

Preprocessed Data:
                                              review  label
0  One of the other reviewers has mentioned that ...      1
1  A wonderful little production. <br /><br />The...      1
2  I thought this was a wonderful way to spend ti...      1
3  Basically there's a family where a little boy ...      0
4  Petter Mattei's "Love in the Time of Money" is...      1

Data Split Shapes:
Training set shape: (35000, 2)
Validation set shape: (5000, 2)
Test set shape: (10000, 2)


## Step 3: Model Selection and Tokenization (1 point)

In [6]:
# Import necessary libraries
from transformers import AutoTokenizer
from datasets import Dataset

# 1. Select a pre-trained Hugging Face transformer model
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Assuming you have the following Pandas DataFrames from previous steps:
#   - train_data
#   - val_data
#   - test_data
# Each DataFrame contains two columns: "review" and "label".

# Optionally, convert your Pandas DataFrames to Hugging Face Dataset objects:
train_dataset = Dataset.from_pandas(train_data)
val_dataset   = Dataset.from_pandas(val_data)
test_dataset  = Dataset.from_pandas(test_data)

# 2. Tokenize the dataset with truncation, padding, and max_length set to 256.
def tokenize_function(example):
    return tokenizer(
        example["review"],
        truncation=True,          # Enable truncation to avoid sequences longer than max_length
        padding="max_length",       # Pad sequences to the maximum length
        max_length=256              # Set maximum sequence length
    )

# Apply the tokenization to each dataset split.
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset   = val_dataset.map(tokenize_function, batched=True)
test_dataset  = test_dataset.map(tokenize_function, batched=True)

# Optionally, remove the original text column if you don't need it for training.
train_dataset = train_dataset.remove_columns(["review"])
val_dataset   = val_dataset.remove_columns(["review"])
test_dataset  = test_dataset.remove_columns(["review"])

# If you're planning to use the Hugging Face Trainer for fine-tuning, it's common practice to rename the 'label' column to 'labels'
train_dataset = train_dataset.rename_column("label", "labels")
val_dataset   = val_dataset.rename_column("label", "labels")
test_dataset  = test_dataset.rename_column("label", "labels")

# Finally, set the format for PyTorch (or TensorFlow, if needed)
train_dataset.set_format("torch")
val_dataset.set_format("torch")
test_dataset.set_format("torch")

# Verify tokenization by checking an example from the training dataset.
print("Tokenized example from training dataset:")
print(train_dataset[0])

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Tokenized example from training dataset:
{'labels': tensor(0), '__index_level_0__': tensor(4427), 'input_ids': tensor([  101,  2002,  2428,  2439,  1996,  5436,  2007,  2023,  2028,   999,
         3904,  1997,  2010,  8200, 11749,  2015,  2182,  2012,  2035,  1010,
         2019,  4895, 18447, 18702,  3436,  5436,  1998,  3294,  6659,  3772,
         2191,  2023,  2010,  5409,  2143,  1006,  1999,  2026,  5448,  1007,
         1012,  2130,  2010, 11749, 13638,  2003,  2908,  1010,  3347,  2028,
         3496,  1999,  2019,  4082,  3004,  1012,  2821,  2092,  1010,  2012,
         2560,  2010,  2279,  2143,  1005, 10103,  4164,  1005,  3662,  2008,
         2002,  2071,  2145,  5213,  2043,  2002,  2359,  2000,  1012,  1012,
         1012,   102,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,   

## Step 4: Fine-Tune the Model (2 points)

In [13]:
# Disable wandb integration
import os
os.environ["WANDB_DISABLED"] = "true"

# Import necessary libraries
import numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# 1. Load a pre-trained model for sequence classification.
#    Since the IMDB dataset has two labels (positive and negative), we set num_labels=2.
model_name = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# 2. Define a function to compute evaluation metrics.
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    # Use weighted average to account for any label imbalance.
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

# 3. Define training parameters with TrainingArguments.
training_args = TrainingArguments(
    output_dir="./results",
    run_name="imdb-finetuning-run",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# 4. Create a Trainer instance.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,         # Preprocessed and tokenized training dataset
    eval_dataset=val_dataset,             # Preprocessed and tokenized validation dataset
    compute_metrics=compute_metrics,      # Metrics function to compute accuracy, precision, recall, and F1-score
)

# 5. Fine-tune the model.
trainer.train()

# Optionally, evaluate the model after training.
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1971,0.213713,0.9176,0.917622,0.9176,0.917599


Evaluation Results: {'eval_loss': 0.21371318399906158, 'eval_accuracy': 0.9176, 'eval_precision': 0.9176216495063105, 'eval_recall': 0.9176, 'eval_f1': 0.9175989320821598, 'eval_runtime': 17.682, 'eval_samples_per_second': 282.774, 'eval_steps_per_second': 17.702, 'epoch': 1.0}


## Step 5: Save and Upload the Model to Hugging Face (2 points)

In [14]:
# 1. Save the fine-tuned model and tokenizer locally.
save_directory = "./finetuned-distilbert-imdb"

model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

print(f"Model and tokenizer saved locally in {save_directory}")

Model and tokenizer saved locally in ./finetuned-distilbert-imdb


In [19]:
# 2. Log in to Hugging Face using notebook_login.
# This will prompt you to enter your Hugging Face access token.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [22]:
# 3. Upload the model and tokenizer to Hugging Face.
# Choose a repository name. The repository name should be unique and is typically in the format "username/repo_name".
# Replace 'your-username' and 'finetuned-distilbert-imdb' with your actual username and desired repo name.
repo_name = "Aygun/finetuned-distilbert-imdb"

# Push the model and tokenizer to the hub.
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

print(f"Model and tokenizer have been pushed to the Hugging Face Hub at: https://huggingface.co/{repo_name}")

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Model and tokenizer have been pushed to the Hugging Face Hub at: https://huggingface.co/Aygun/finetuned-distilbert-imdb


# Part 2: API Development and Testing (5 points) 

## Step 6: Set Up the Backend API (1 point) 

In [27]:
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import random

# Initialize the FastAPI app
app = FastAPI()

# Define the request schema using Pydantic
class SentimentRequest(BaseModel):
    text: str
    model: str  # Expected values: "custom" or "llama"

# Define the response schema using Pydantic
class SentimentResponse(BaseModel):
    sentiment: str        # "positive" or "negative"
    confidence: float     # Confidence score (e.g., 0.95)

def analyze_sentiment(text: str, model: str) -> (str, float):
    """
    Dummy sentiment analysis function.
    Replace this logic with calls to your actual sentiment analysis models.
    """
    # Validate model parameter
    if model not in ["custom", "llama"]:
        raise ValueError("Invalid model specified. Please use 'custom' or 'llama'.")

    # Simple heuristic for demonstration:
    # If the text contains the word "good", we assume it's positive.
    # Otherwise, we assume it's negative.
    if "good" in text.lower():
        sentiment = "positive"
        confidence = round(random.uniform(0.80, 1.0), 2)
    else:
        sentiment = "negative"
        confidence = round(random.uniform(0.60, 0.79), 2)
    
    return sentiment, confidence

@app.post("/analyze/", response_model=SentimentResponse)
def analyze(request: SentimentRequest):
    """
    POST endpoint to analyze sentiment.
    - **text**: The input text to analyze.
    - **model**: The model to use ("custom" or "llama").
    Returns a JSON response with the sentiment and confidence score.
    """
    try:
        sentiment, confidence = analyze_sentiment(request.text, request.model)
        return SentimentResponse(sentiment=sentiment, confidence=confidence)
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))

# To run the app, use the command: uvicorn step9:app --reload
# uvicorn step6:app --reload

## Step 7: Load Models (1 point)

In [29]:
#  Load the Fine-Tuned Model from Hugging Face
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Hugging Face Hub
model_repo = "Aygun/finetuned-distilbert-imdb"

# Load the fine-tuned model and tokenizer from Hugging Face
model = AutoModelForSequenceClassification.from_pretrained(model_repo)
tokenizer = AutoTokenizer.from_pretrained(model_repo)

print("Fine-tuned model and tokenizer loaded from Hugging Face.")

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Fine-tuned model and tokenizer loaded from Hugging Face.


In [41]:
! pip install groq

Collecting groq
  Downloading groq-0.17.0-py3-none-any.whl.metadata (14 kB)
Downloading groq-0.17.0-py3-none-any.whl (109 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.8/109.8 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.17.0


In [59]:
#set GROQ_API_KEY in the secrets
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("GROQ_API_KEY")

In [63]:
import os
from groq import Groq

from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
api_key = user_secrets.get_secret("GROQ_API_KEY")

# Create the Groq client
client = Groq(api_key=api_key)

# Set system prompt
system_prompt = {
    "role": "system",
    "content": "You are a helpful assistant. You reply with very short answers."
}

# Initialize chat history
chat_history = [system_prompt]

while True:
    user_input = input("You: ")
    
    if user_input.lower() in ["exit", "quit"]:
        print("Exiting chat.")
        break
    
    chat_history.append({"role": "user", "content": user_input})

    response = client.chat.completions.create(
        model="llama3-70b-8192",
        messages=chat_history,
        max_tokens=100,
        temperature=1.2
    )

    assistant_reply = response.choices[0].message.content
    chat_history.append({"role": "assistant", "content": assistant_reply})

    print("Assistant:", assistant_reply)

You:  Hello how are you?


Assistant: I'm good, thanks!


You:  exit


Exiting chat.


## Step 8: Test the API Locally (1 point)

In [2]:
! pip install reactpy

Collecting reactpy
  Downloading reactpy-1.1.0-py3-none-any.whl.metadata (4.1 kB)
Collecting asgiref>=3 (from reactpy)
  Downloading asgiref-3.8.1-py3-none-any.whl.metadata (9.3 kB)
Downloading reactpy-1.1.0-py3-none-any.whl (110 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.3/110.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading asgiref-3.8.1-py3-none-any.whl (23 kB)
Installing collected packages: asgiref, reactpy
Successfully installed asgiref-3.8.1 reactpy-1.1.0


In [None]:
import requests

# URL of the API endpoint
url = "http://127.0.0.1:8000/analyze/"

# Example payload for the custom model
payload_custom = {
    "text": "This movie was fantastic!",
    "model": "custom"
}

# Example payload for the llama model
payload_llama = {
    "text": "This movie was terrible.",
    "model": "llama"
}

headers = {"Content-Type": "application/json"}

# Test with custom model
response_custom = requests.post(url, json=payload_custom, headers=headers)
print("Custom Model Response:")
print(response_custom.json())

# Test with llama model
response_llama = requests.post(url, json=payload_llama, headers=headers)
print("Llama Model Response:")
print(response_llama.json())

## Step 9: Define the Llama 3 Prompt (1 point) 

### "Please analyze the sentiment of the following text and classify it as either 'positive' or 'negative'. Text: '{input_text}'. Provide your answer in the format: Sentiment: [positive/negative], Confidence: [percentage]."

## Step 10: Test with Both Models (1 point)

In [None]:
import requests

# URL of the locally running FastAPI service
url = "http://127.0.0.1:8000/analyze/"

# Payload for the custom (fine-tuned) model
payload_custom = {
    "text": "This movie was fantastic!",
    "model": "custom"  # This will trigger your fine-tuned Hugging Face model
}

# Payload for the Llama 3 model via Groq Cloud
payload_llama = {
    "text": "This movie was terrible.",
    "model": "llama"   # This will trigger your Groq Cloud integration with Llama 3
}

headers = {"Content-Type": "application/json"}

# Test the custom model endpoint
response_custom = requests.post(url, json=payload_custom, headers=headers)
print("Custom Model Response:")
print(response_custom.json())

# Test the Llama 3 model endpoint
response_llama = requests.post(url, json=payload_llama, headers=headers)
print("Llama Model Response:")
print(response_llama.json())

# Part 3: UI Design and Explanation (3 points)

## Step 11: React UI Design  (1 point)

In [None]:
import reactpy
from reactpy import component, html, hooks
import asyncio
import httpx  # For making asynchronous HTTP requests (optional)

# Replace with your actual backend endpoint URL
BACKEND_API_URL = "http://localhost:8000/analyze"

@component
def SentimentAnalyzer():
    # State for user input text
    text_input, set_text_input = hooks.use_state("")
    # State for the selected model from the dropdown
    selected_model, set_selected_model = hooks.use_state("Custom Model")
    # State for the result from the backend API
    result, set_result = hooks.use_state("")

    # Handler for text input change
    def handle_text_change(event):
        set_text_input(event["target"]["value"])

    # Handler for dropdown selection change
    def handle_model_change(event):
        set_selected_model(event["target"]["value"])

    # Handler for button click that calls the backend API
    async def analyze_sentiment(event):
        if not text_input.strip():
            set_result("Please enter some text to analyze.")
            return

        # Create the payload to send to your API
        payload = {
            "text": text_input,
            "model": selected_model,
        }

        try:
            # Use an async HTTP client to post the data.
            # Make sure your backend API is running and configured to accept these requests.
            async with httpx.AsyncClient() as client:
                response = await client.post(BACKEND_API_URL, json=payload)
                response.raise_for_status()
                data = response.json()
                # Expected response structure:
                # {
                #    "sentiment": "positive" or "negative",
                #    "confidence": 0.95  # optional
                # }
                sentiment = data.get("sentiment", "unknown")
                confidence = data.get("confidence", None)
                if confidence is not None:
                    set_result(f"Sentiment: {sentiment} (Confidence: {confidence:.2f})")
                else:
                    set_result(f"Sentiment: {sentiment}")
        except Exception as e:
            set_result(f"Error analyzing sentiment: {str(e)}")

    return html.div(
        {"style": {"fontFamily": "Arial, sans-serif", "maxWidth": "600px", "margin": "auto", "padding": "20px"}},
        html.h1("Sentiment Analyzer"),
        html.div(
            {"style": {"marginBottom": "10px"}},
            html.label({"for": "text-input"}, "Enter text:"),
            html.input({
                "id": "text-input",
                "type": "text",
                "value": text_input,
                "on_change": handle_text_change,
                "style": {"width": "100%", "padding": "8px", "marginTop": "5px"}
            })
        ),
        html.div(
            {"style": {"marginBottom": "10px"}},
            html.label({"for": "model-select"}, "Select model:"),
            html.select({
                "id": "model-select",
                "value": selected_model,
                "on_change": handle_model_change,
                "style": {"width": "100%", "padding": "8px", "marginTop": "5px"}
            },
                html.option({"value": "Custom Model"}, "Custom Model"),
                html.option({"value": "Llama 3"}, "Llama 3")
            )
        ),
        html.button({
            "on_click": analyze_sentiment,
            "style": {"padding": "10px 20px", "cursor": "pointer"}
        }, "Analyze Sentiment"),
        html.div(
            {"style": {"marginTop": "20px", "padding": "10px", "backgroundColor": "#f0f0f0"}},
            result
        )
    )

# Run the ReactPy app
if __name__ == "__main__":
    reactpy.run(SentimentAnalyzer)

## Step 12: Submit GitHub Repository (1 point)

# https://github.com/AygunVarol/sentiment-analysis-project

# https://huggingface.co/Aygun/finetuned-distilbert-imdb

## Step 13: Record a YouTube Demo Video (1 point)

# https://www.youtube.com/watch?v=9HHb9GtZIx4