In [23]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.model_selection import ParameterGrid
from transformers import DataCollatorWithPadding
import evaluate
import warnings
import numpy as np
warnings.filterwarnings("ignore")

In [2]:
#pip install evaluate

In [3]:
# Read the TSV file
df = pd.read_csv("a3_first_sample.tsv", sep="\t", header=None)
data = [(row[0], row[1]) for _, row in df.iterrows()]

In [4]:
converted_data = []

for entry in data:
    label, text = entry
    converted_entry = {"label": label, "text": text}
    converted_data.append(converted_entry)

In [5]:
converted_data[1]

{'label': 1, 'text': 'Everybody that does not take the vaccine will die'}

In [6]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [7]:
# Define the preprocessing function
def preprocess_function(entry):
    tokenized_entry = tokenizer(entry["text"], truncation=True)
    return tokenized_entry

In [8]:
tokenized_data = [preprocess_function(entry) for entry in converted_data]

In [9]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [10]:
accuracy = evaluate.load("accuracy")

In [20]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    return accuracy.compute(predictions=predictions, references=labels)

In [12]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}

label2id = {"NEGATIVE": 0, "POSITIVE": 1}

In [13]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.we

In [17]:
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

In [18]:
#pip install --upgrade accelerate

In [21]:
train_size = int(0.8 * len(tokenized_data))
train_data, test_data = tokenized_data[:train_size], tokenized_data[train_size:]

In [26]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()

TypeError: list indices must be integers or slices, not str