**Here we perform the training loops**

In [1]:
from data_frame_manager.data_frame_manager import DataFrameManager
import numpy as np
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
# import xgboost as xgb
import pandas as pd


DATASET_COLUMNS = ["text", "label"]#["target", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
MODEL_NAME = 'roberta'
NUM_CLASSES = 3


In [2]:
dataFrameManage = DataFrameManager()

train_df = dataFrameManage.load_dataframe(filepath="Data/twitter-datasets/preprocessed/train_preprocessed.csv", encoding=DATASET_ENCODING, preprocess=False)
test_df = dataFrameManage.load_dataframe(filepath="Data/twitter-datasets/preprocessed/test_preprocessed.csv", encoding=DATASET_ENCODING, preprocess=False)

encode_map = {"NEGATIVE" : 0, "POSITIVE" : 1}


train_labels = train_df["target"].map(encode_map).to_list()
test_labels = test_df["target"].map(encode_map).to_list()

# CountVectorizer

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# We only keep the 5000 most frequent words, both to reduce the computational cost and reduce overfitting
vectorizer = CountVectorizer(max_features=5000)

# Important: we call fit_transform on the training set, and only transform on the validation set
X_train = vectorizer.fit_transform(train_df["text"].to_list())
X_val = vectorizer.transform(test_df["text"].to_list())

Y_train = train_labels
Y_val = test_labels

In [7]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(C=1e5, max_iter=100)
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
Y_train_pred = model.predict(X_train)
Y_val_pred = model.predict(X_val)

train_accuracy = accuracy_score(Y_train_pred, Y_train)
val_accuracy = accuracy_score(Y_val_pred, Y_val)
print(f'Accuracy (training set): {train_accuracy:.05f}')
print(f'Accuracy (validation set): {val_accuracy:.05f}')

Accuracy (training set): 0.79842
Accuracy (validation set): 0.79531


In [9]:
import torch
from transformers import AutoTokenizer, AutoModel

print("Loading model")
# Load the BERTweet tokenizer
model = AutoModel.from_pretrained("vinai/bertweet-base")

tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")
print("Model loaded")


Loading model


Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Model loaded


In [10]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

task = 'sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)


In [6]:
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW

In [22]:


# Load the pre-trained model and tokenizer
task = 'sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Prepare your dataset
# ... (Load and preprocess your dataset, assign labels, etc.)

# Split the dataset into training and evaluation sets
train_texts, eval_texts, train_lbls, eval_lbls = train_df["text"][:10000].to_list(), test_df["text"][:1000].to_list(), train_labels[:10000], test_labels[:1000]

# Tokenize the texts
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
eval_encodings = tokenizer(eval_texts, truncation=True, padding=True)


# Convert the dataset to PyTorch tensors
train_dataset = torch.utils.data.TensorDataset(
    torch.tensor(train_encodings["input_ids"]),
    torch.tensor(train_encodings["attention_mask"]),
    torch.tensor(train_lbls)
)
eval_dataset = torch.utils.data.TensorDataset(
    torch.tensor(eval_encodings["input_ids"]),
    torch.tensor(eval_encodings["attention_mask"]),
    torch.tensor(eval_lbls)
)

# Define the optimizer and learning rate
optimizer = AdamW(model.parameters(), lr=1e-5)

# Fine-tuning loop
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "mps"
model.to(device)
model.train()

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)

for epoch in tqdm(range(3)):  # Adjust the number of epochs as needed
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Evaluation
model.eval()

eval_loader = torch.utils.data.DataLoader(eval_dataset, batch_size=16)

with torch.no_grad():
    correct = 0
    total = 0

    for batch in eval_loader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted_labels = torch.argmax(logits, dim=1)

        total += labels.size(0)
        correct += (predicted_labels == labels).sum().item()

accuracy = correct / total
print("Evaluation Accuracy:", accuracy)

# Save the fine-tuned model
model.save_pretrained("twitter-roberta-base-sentiment")


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 3/3 [09:35<00:00, 191.93s/it]


Evaluation Accuracy: 0.857


In [11]:
task = 'sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained("twitter-roberta-base-sentiment")
device = "mps"
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [12]:
val_df = dataFrameManage.load_dataframe(filepath="Data/twitter-datasets/preprocessed/test_data_preprocessed.csv", encoding=DATASET_ENCODING, preprocess=False, test = True)
eval_encodings = tokenizer(val_df["text"].to_list(), truncation=True, padding=True)
eval_dataset = torch.utils.data.TensorDataset(
    torch.tensor(eval_encodings["input_ids"]),
    torch.tensor(eval_encodings["attention_mask"])
)
eval_loader = torch.utils.data.DataLoader(eval_dataset, batch_size=32)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [13]:
preds = []
model.eval()
with torch.no_grad():

    for batch in eval_loader:
        input_ids, attention_mask = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted_labels = torch.argmax(logits, dim=1)
        preds.append(predicted_labels)

preds = torch.cat(preds, dim=0)

preds_array = preds.cpu().numpy()

print(preds_array)



[0 0 1 ... 0 1 0]


In [14]:
mapped_array = np.where(preds_array == 0, -1, 1)

print(mapped_array)

[-1 -1  1 ... -1  1 -1]


In [16]:
predict_df = pd.DataFrame(mapped_array, columns=["Prediction"], index=pd.Index(range(1, len(mapped_array)+1), start=1, name="Id"))
predict_df.to_csv(f"Data/twitter-datasets/predictions_twitter-roberta-base-sentiment.csv", index=True, index_label="Id")

  predict_df = pd.DataFrame(mapped_array, columns=["Prediction"], index=pd.Index(range(1, len(mapped_array)+1), start=1, name="Id"))
