In [None]:
!pip install transformers


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:

# Read the cleaned CSV; the file is decoded as ISO-8859-1 instead of the
# pandas default encoding.
data = pd.read_csv(
    r'/content/output2_clean.csv',
    encoding='ISO-8859-1',
)

In [None]:
# Load the pretrained DistilBERT tokenizer and a sequence-classification
# model configured for 13 output labels.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=13)
# Move to the GPU when one is available; fall back to the CPU instead of
# raising on a runtime without CUDA.
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.w

In [None]:
# Split data into train and validation sets (80/20, stratified on the label).
X = list(data["content"].astype(str))
y = list(data["sentiment"].astype(int))
# Seed the split so re-running this cell reproduces the same validation set;
# an unseeded split makes metrics from different runs incomparable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Tokenize data and create torch datasets. padding=True pads every example
# to the longest sequence in its split; inputs are truncated at 512 tokens.
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Torch dataset over tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from field name (must include "input_ids") to a
            per-example sequence of values.
        labels: Optional per-example labels, indexable by position. When
            omitted, returned items carry no "labels" entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict of tensors for the example at position ``idx``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Compare against None explicitly: the truth value of a multi-element
        # numpy array or tensor is ambiguous and raises.
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the "input_ids" field."""
        return len(self.encodings["input_ids"])

In [None]:
# Wrap the tokenized splits and their label lists as torch datasets.
train_dataset = Dataset(encodings=X_train_tokenized, labels=y_train)
val_dataset = Dataset(encodings=X_val_tokenized, labels=y_val)

In [None]:
def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision, recall and F1.

    Args:
        pred: Object exposing ``predictions`` (per-class scores; the argmax
            over axis 1 is taken as the predicted class) and ``label_ids``
            (the true class ids).

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    labels = pred.label_ids
    predicted = np.argmax(pred.predictions, axis=1)

    # zero_division=0 scores an undefined per-class metric as 0 explicitly
    # instead of falling back to 0 with a warning on every evaluation.
    accuracy = accuracy_score(y_true=labels, y_pred=predicted)
    recall = recall_score(y_true=labels, y_pred=predicted, average='macro', zero_division=0)
    precision = precision_score(y_true=labels, y_pred=predicted, average='macro', zero_division=0)
    f1 = f1_score(y_true=labels, y_pred=predicted, average='macro', zero_division=0)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}



In [None]:
# Training configuration: 3 epochs, 8 examples per device per step,
# with "output" as the Trainer output directory.
args = TrainingArguments(
    per_device_train_batch_size=8,
    num_train_epochs=3,
    output_dir="output",
)

In [None]:
# NOTE(review): this loader is not passed to the Trainer built in this
# notebook, which receives train_dataset directly — confirm it is still needed.
train_loader = DataLoader(
    dataset=train_dataset,
    shuffle=True,
    batch_size=1,
)

In [None]:
# Assemble the Trainer from the model, the training arguments, both dataset
# splits and the metric callback; no custom data collator is supplied.
trainer = Trainer(
    args=args,
    model=model,
    compute_metrics=compute_metrics,
    data_collator=None,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

In [None]:
# Run training with the configuration in `args`; the value returned by
# train() is displayed as the cell's result.
trainer.train()



Step,Training Loss
500,2.0387
1000,1.9452
1500,1.896
2000,1.8902
2500,1.8876
3000,1.8946
3500,1.8661
4000,1.8594
4500,1.7306
5000,1.7311


TrainOutput(global_step=12000, training_loss=1.671778818766276, metrics={'train_runtime': 857.5967, 'train_samples_per_second': 111.941, 'train_steps_per_second': 13.993, 'total_flos': 1689290654976000.0, 'train_loss': 1.671778818766276, 'epoch': 3.0})

In [None]:
# Evaluate with the eval_dataset given to the Trainer and print the
# returned metrics dict.
result = trainer.evaluate()
print(result)

{'eval_loss': 2.0295255184173584, 'eval_accuracy': 0.350375, 'eval_precision': 0.24361834441026264, 'eval_recall': 0.203769803929695, 'eval_f1': 0.20265824546258332, 'eval_runtime': 15.1936, 'eval_samples_per_second': 526.538, 'eval_steps_per_second': 65.817, 'epoch': 3.0}


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Persist only the model's weights (its state_dict) to disk.
model_save_path = 'Dataset_epoc3_35.pt'
torch.save(obj=model.state_dict(), f=model_save_path)

In [None]:
model_save_path = '/content/Dataset_epoc3_35.pt'
# map_location='cpu' lets the checkpoint be read on a machine without the GPU
# it was saved from; load_state_dict then copies the values into the model's
# existing parameters.
model.load_state_dict(torch.load(model_save_path, map_location='cpu'))

<All keys matched successfully>

In [None]:

# Read the cleaned CSV; the file is decoded as ISO-8859-1 instead of the
# pandas default encoding.
data = pd.read_csv(
    r'/content/output2_clean.csv',
    encoding='ISO-8859-1',
)

In [None]:
# Move to the GPU when one is available; fall back to the CPU instead of
# raising on a runtime without CUDA.
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')
# Split data into train and validation sets (80/20, stratified on the label).
X = list(data["content"].astype(str))
y = list(data["sentiment"].astype(int))
# Seed the split so re-running this cell reproduces the same validation set;
# an unseeded split makes metrics from different runs incomparable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Tokenize data and create torch datasets. padding=True pads every example
# to the longest sequence in its split; inputs are truncated at 512 tokens.
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

class Dataset(torch.utils.data.Dataset):
    """Torch dataset over tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from field name (must include "input_ids") to a
            per-example sequence of values.
        labels: Optional per-example labels, indexable by position. When
            omitted, returned items carry no "labels" entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict of tensors for the example at position ``idx``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Compare against None explicitly: the truth value of a multi-element
        # numpy array or tensor is ambiguous and raises.
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the "input_ids" field."""
        return len(self.encodings["input_ids"])
    
# Wrap the tokenized splits and their label lists as torch datasets.
train_dataset = Dataset(encodings=X_train_tokenized, labels=y_train)
val_dataset = Dataset(encodings=X_val_tokenized, labels=y_val)

def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision, recall and F1.

    Defined once: the later identical redefinitions only rebound the same
    name to the same implementation.

    Args:
        pred: Object exposing ``predictions`` (per-class scores; the argmax
            over axis 1 is taken as the predicted class) and ``label_ids``
            (the true class ids).

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    labels = pred.label_ids
    predicted = np.argmax(pred.predictions, axis=1)

    # zero_division=0 scores an undefined per-class metric as 0 explicitly
    # instead of falling back to 0 with a warning on every evaluation.
    accuracy = accuracy_score(y_true=labels, y_pred=predicted)
    recall = recall_score(y_true=labels, y_pred=predicted, average='macro', zero_division=0)
    precision = precision_score(y_true=labels, y_pred=predicted, average='macro', zero_division=0)
    f1 = f1_score(y_true=labels, y_pred=predicted, average='macro', zero_division=0)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

# Training configuration: 3 epochs, 4 examples per device per step,
# with "output" as the Trainer output directory.
args = TrainingArguments(
    per_device_train_batch_size=4,
    num_train_epochs=3,
    output_dir="output",
)
# NOTE(review): this loader is not passed to the Trainer built in this
# notebook, which receives train_dataset directly — confirm it is still needed.
train_loader = DataLoader(
    dataset=train_dataset,
    shuffle=True,
    batch_size=1,
)
# Assemble the Trainer from the model, the training arguments, both dataset
# splits and the metric callback; no custom data collator is supplied.
trainer = Trainer(
    args=args,
    model=model,
    compute_metrics=compute_metrics,
    data_collator=None,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)
# Run training with the configuration in `args`; the value returned by
# train() is displayed as the cell's result.
trainer.train()




Step,Training Loss
500,1.4647
1000,1.5078
1500,1.563
2000,1.5098
2500,1.512
3000,1.4878
3500,1.5094
4000,1.5375
4500,1.547
5000,1.5678


TrainOutput(global_step=24000, training_loss=1.1400295372009277, metrics={'train_runtime': 1311.2428, 'train_samples_per_second': 73.213, 'train_steps_per_second': 18.303, 'total_flos': 1689290654976000.0, 'train_loss': 1.1400295372009277, 'epoch': 3.0})

In [None]:
# Evaluate with the eval_dataset given to the Trainer and print the
# returned metrics dict.
result = trainer.evaluate()
print(result)

{'eval_loss': 3.53031849861145, 'eval_accuracy': 0.308375, 'eval_precision': 0.19637650841771168, 'eval_recall': 0.18740240746216819, 'eval_f1': 0.18968041590990006, 'eval_runtime': 15.2723, 'eval_samples_per_second': 523.824, 'eval_steps_per_second': 65.478, 'epoch': 3.0}


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Persist only the model's weights (its state_dict) to disk.
# NOTE(review): the "56%" in the filename does not match the eval_accuracy
# printed by the preceding evaluation output (about 0.31) — confirm the name.
model_save_path = 'Dataset_3epochs_56%.pt'
torch.save(obj=model.state_dict(), f=model_save_path)