# Requirements

In [1]:
# Add as many imports as you need.
import pandas as pd 
import numpy as np 
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt




# Laboratory Exercise - Run Mode (8 points)

## Introduction
The primary objective of this laboratory assignment is to fine-tune a pre-trained language model for binary classification on a dataset of wine reviews. The dataset contains two attributes: **description** and **points**. The description is a brief text describing the wine, and the points are a quality score ranging from 1 to 100. A wine with at least 90 points is considered **exceptional**. Your task is to predict whether a wine is **exceptional** based on its review.

## The Wine Reviews Dataset

Load the dataset using the `datasets` library.

In [2]:
# Read the wine reviews CSV with the generic "csv" loader. Because no split
# names are given, all rows end up in a single "train" split.
csv_files = ["wine-reviews.csv"]
dataset = load_dataset("csv", data_files=csv_files)
dataset

DatasetDict({
    train: Dataset({
        features: ['description', 'points'],
        num_rows: 10000
    })
})

In [3]:
# Move the single "train" split into pandas and list the distinct scores present.
train_split = dataset["train"]
df = train_split.to_pandas()
df["points"].unique()

array([ 85,  92,  93,  89,  87,  86,  94,  90,  91,  83,  96,  95,  88,
        84,  82,  81, 100,  80,  97,  98,  99], dtype=int64)

## Target Extraction
Extract the target **exceptional** for each wine review. If some wine has at least 90 points it is considered **exceptional**.

In [4]:
# Binary target: 1 when the wine scored 90 points or more, otherwise 0.
df['exceptional'] = df['points'].ge(90).astype(int)
# Class balance check.
df['exceptional'].value_counts()

exceptional
0    5000
1    5000
Name: count, dtype: int64

## Dataset Splitting
Partition the dataset into training and testing sets with an 80:20 ratio.


In [5]:
# Hold out 20% of the reviews for testing; the fixed random_state keeps the
# partition identical across runs.
X, y = df['description'], df['exceptional']
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
train_texts, test_texts, train_labels, test_labels = split_parts

In [6]:
# Sanity check: number of reviews in the training and testing partitions.
[len(part) for part in (train_texts, test_texts)]

[8000, 2000]

## Tokenization
Tokenize the texts using the `AutoTokenizer` class.

In [7]:
# Tokenizer that matches the DistilBERT checkpoint fine-tuned below.
tokenizer_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

In [8]:
# Encode both partitions with identical settings: truncate anything longer than
# 512 tokens and pad the remaining sequences to a common length.
tokenize_kwargs = dict(truncation=True, padding=True, max_length=512)
train_enc = tokenizer(train_texts.tolist(), **tokenize_kwargs)
test_enc = tokenizer(test_texts.tolist(), **tokenize_kwargs)

In [9]:
def _to_hf_dataset(encodings, labels):
    """Bundle tokenizer output and the matching labels into a `Dataset`.

    Only the columns the model consumes are kept: `input_ids`,
    `attention_mask` and `labels`.
    """
    columns = {
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'labels': list(labels),
    }
    return Dataset.from_dict(columns)


train_dataset = _to_hf_dataset(train_enc, train_labels)
test_dataset = _to_hf_dataset(test_enc, test_labels)


## Fine-tuning a Pre-trained Language Model for Classification
Fine-tune a pre-trained language model for classification on the given dataset.

Define the model using the `AutoModelForSequenceClassification` class.

In [10]:
# DistilBERT encoder with a two-class head (0 = not exceptional, 1 = exceptional).
# The head's weights are not part of the checkpoint, so they start untrained.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Define the training parameters using the `TrainingArguments` class.

In [11]:
# Hyperparameters for fine-tuning. Evaluation runs on the eval dataset at the
# end of every epoch.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    # Mixed-precision training needs a GPU: requesting fp16 on a CPU-only
    # machine makes TrainingArguments raise. Enable it only when CUDA is
    # available so the notebook also runs on CPU.
    fp16=torch.cuda.is_available(),
)



In [12]:
# Prefer the GPU when one is visible to PyTorch; otherwise stay on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

Define the training using the `Trainer` class.

In [13]:
# Write your code here. Add as many boxes as you need.
trainer = Trainer(
    model=model, 
    args= training_args,
    train_dataset=train_dataset.shuffle().select(range(100)),
    eval_dataset=test_dataset,
    tokenizer=tokenizer
)

Fine-tune (train) the pre-trained language model.

In [14]:
# Run fine-tuning; keep the returned summary so it is shown as the cell output.
train_output = trainer.train()
train_output

Epoch,Training Loss,Validation Loss
1,No log,0.692214
2,No log,0.688935
3,No log,0.685


TrainOutput(global_step=21, training_loss=0.6793074834914434, metrics={'train_runtime': 114.0221, 'train_samples_per_second': 2.631, 'train_steps_per_second': 0.184, 'total_flos': 12263583391200.0, 'train_loss': 0.6793074834914434, 'epoch': 3.0})

Use the trained model to make predictions for the test set.

In [15]:
# Score the held-out reviews, then pick the higher-scoring class for each one.
pred = trainer.predict(test_dataset)
logits = pred.predictions
y_pred = np.argmax(logits, axis=1)

Assess the performance of the model by using different metrics provided by the `scikit-learn` library.

In [17]:
# Compare the fine-tuned model's predictions with the true test labels.
# Precision, recall and F1 use scikit-learn's defaults, i.e. they are computed
# for the positive class (label 1, "exceptional").
accuracy, precision, recall, f1 = (
    metric(test_labels, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

report_rows = (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
)
for metric_name, metric_value in report_rows:
    print(f"{metric_name}: {metric_value:.4f}")

Accuracy: 0.4960
Precision: 0.4955
Recall: 1.0000
F1 Score: 0.6627


# Laboratory Exercise - Bonus Task (+ 2 points)

Implement a simple machine learning pipeline to classify wine reviews as **exceptional** or not. Use TF-IDF vectorization to convert text into numerical features and train a logistic regression. Split the dataset into training and testing sets, fit the pipeline on the training data, and evaluate its performance using metrics such as precision, recall, and F1-score. Analyze the texts to find the most influential words or phrases associated with the **exceptional** wines. Use the coefficients from the logistic regression trained on TF-IDF features to identify the top positive and negative keywords for **exceptional** wines. Present these keywords in a simple table or visualization (e.g., bar chart).

In [18]:
# TF-IDF features capped at 1000 vocabulary terms. The vocabulary and IDF
# weights are learned from the training texts only and then reused, unchanged,
# to transform the test texts.
tvec = TfidfVectorizer(max_features=1000)
X_train, X_test = tvec.fit_transform(train_texts), tvec.transform(test_texts)

In [19]:
# Logistic-regression baseline on the TF-IDF features; max_iter is raised to
# 1000 to give the solver more iterations to converge.
log_model = LogisticRegression(max_iter=1000).fit(X_train, train_labels)
log_model

In [20]:
# Predicted class for every held-out review, based on its TF-IDF features.
tvec_pred = log_model.predict(X=X_test)

In [21]:
# Same four metrics as for the fine-tuned model, now for the TF-IDF +
# logistic-regression baseline (positive class is label 1, "exceptional").
accuracy, precision, recall, f1 = (
    metric(test_labels, tvec_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

report_rows = (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
)
for metric_name, metric_value in report_rows:
    print(f"{metric_name}: {metric_value:.4f}")

Accuracy: 0.7925
Precision: 0.7866
Recall: 0.7970
F1 Score: 0.7918
