In [None]:
!pip install emoji

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, ParameterGrid
import matplotlib.pyplot as plt
import seaborn as sns
import emoji
import re
import sklearn
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
import copy
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag
from torch import nn
import torch
from transformers import AutoConfig, AutoModel, AutoTokenizer, AdamW

from google.colab import drive
drive.mount('/content/drive')

# Creazione Dataset
Caricamento dei dati di train per creare il dataset "data" contenente le colonne "text" e "label"

In [None]:
# Build the training frame: the first column is the full tweet text,
# the second column is the emotion label:
#   0 = anger
#   1 = joy
#   2 = optimism
#   3 = sadness

datas = "/content/drive/MyDrive/NLP/Challenge_2024/emotion/train_text.txt"
with open(datas, 'r', encoding='utf-8') as text_file:
    # One tweet per line; surrounding whitespace (including the newline) is removed.
    tweets = [line.strip() for line in text_file.readlines()]

data = pd.DataFrame({'text': tweets})

labels_path = "/content/drive/MyDrive/NLP/Challenge_2024/emotion/train_labels.txt"
with open(labels_path, 'r', encoding='utf-8') as label_file:
    labels = label_file.readlines()

# One integer class id per line, aligned with the tweets by position.
labels2 = list(map(int, map(str.strip, labels)))

data["label"] = labels2
texts = pd.DataFrame(data, columns=["text"])
data.head()

In [None]:
# Print the column dtypes and non-null counts of the training frame.
data.info()

Analizziamo l'equilibrio dei dati

In [None]:
# Report the dataset size, the distinct labels and how many samples each label has.
print(f'samples -> {len(data)}')
labels = set(data['label'])
numLabels = len(labels)  # number of output classes, used later to size the classifier head
print(f'categories -> {labels}[{numLabels}]')
label_frequencies = data['label'].value_counts()
print(label_frequencies)
# Bar chart of samples per label (ordered by label id).
data.groupby(['label']).size().plot.bar()

# Preprocessing dei dati

Eseguiamo un Preprocessing, facendo features extraction.
Troviamo le Features:
- emoji count
- Aggettivi
- Verbi
- Sostantivi
- Avverbi
- Word Count
- Important Word Count
- Lunghezza del testo

In [None]:
# Resources required by word_tokenize and pos_tag.
for resource in ('punkt_tab', 'averaged_perceptron_tagger_eng'):
    nltk.download(resource)

# Count the characters of each tweet that are keys of emoji.EMOJI_DATA.
# This must run before the text is demojized below.
data['emoji_count'] = data['text'].apply(
    lambda tweet: sum(1 for ch in tweet if ch in emoji.EMOJI_DATA)
)

# Replace every emoji with its textual alias.
data['text'] = data['text'].apply(emoji.demojize)

# Quick sanity check that the tokenizer and the tagger are working.
text = "This is a test sentence."
tokens = word_tokenize(text)
print("Tokenized:", tokens)

tags = pos_tag(tokens)
print("POS Tags:", tags)

def extract_pos_counts(text):
    """Count the tokens of ``text`` per part-of-speech family.

    The text is tokenized with ``word_tokenize`` and tagged with ``pos_tag``;
    each tag is bucketed by its two-letter prefix.

    Returns:
        dict with keys ``'JJ'``, ``'VB'``, ``'NN'`` and ``'RB'`` (in this order)
        mapping to the number of tokens whose tag starts with that prefix.
        Tokens with any other tag are ignored.
    """
    pos_counts = dict.fromkeys(('JJ', 'VB', 'NN', 'RB'), 0)
    for _, tag in pos_tag(word_tokenize(text)):
        family = tag[:2]
        if family in pos_counts:
            pos_counts[family] += 1
    return pos_counts

# Part-of-speech count columns produced by extract_pos_counts.
POS_COLUMNS = ['JJ', 'VB', 'NN', 'RB']

data['pos_counts'] = data['text'].apply(extract_pos_counts)
# Expand the per-tweet dictionaries into one integer column per PoS family.
df_pos = data['pos_counts'].apply(pd.Series)
# Drop any PoS columns left over from a previous run of this cell so that
# re-running it does not create duplicated columns.
data = pd.concat([data.drop(columns=POS_COLUMNS, errors='ignore'), df_pos], axis=1)

print(data.columns)

# Correlation between the PoS counts and the target; computed once and reused for the plot.
correlation_matrix = data[POS_COLUMNS + ['label']].corr()
print(correlation_matrix)

# Human-readable axis labels for the heatmap.
pos_labels = {
    'JJ': 'Adjective (Aggettivi)',
    'VB': 'Verb (Verbi)',
    'NN': 'Noun (Sostantivi)',
    'RB': 'Adverb (Avverbi)',
    'label': 'Target'
}

correlation_matrix = correlation_matrix.rename(columns=pos_labels, index=pos_labels)

plt.figure(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Matrice di Correlazione tra PoS e Label")
plt.show()



In [None]:
# Length-based features computed on the (already demojized) tweet text.
data = data.assign(
    # number of whitespace-separated tokens
    word_count=data['text'].str.split().str.len(),
    # adjectives + nouns + adverbs + verbs
    important_word_count=data[['JJ', 'NN', 'RB', 'VB']].sum(axis=1),
    # number of characters
    text_length=data['text'].str.len(),
)
data

In [None]:
# Lower-cased hashtags found in each tweet (an empty list when there are none).
data['hashtags'] = data['text'].apply(lambda x: re.findall(r"#\w+", x.lower()))

# One row per (tweet, hashtag); tweets without hashtags are dropped.
hashtags_exploded = data.explode('hashtags').dropna(subset=['hashtags'])

# Label id -> name of the per-emotion count column.
count_columns = {0: 'count_anger', 1: 'count_joy', 2: 'count_opt', 3: 'count_sad'}

hashtag_counts = (
    hashtags_exploded.groupby(['hashtags', 'label'])
    .size()
    .unstack(fill_value=0)
    # Guarantee a column for every label, even if some class has no hashtag
    # at all; otherwise the sums below would raise a KeyError.
    .reindex(columns=list(count_columns), fill_value=0)
    .rename(columns=count_columns)
)

hashtag_counts['count_total'] = hashtag_counts[list(count_columns.values())].sum(axis=1)
# Share of each emotion among the tweets that use the hashtag.
for emotion in ('anger', 'joy', 'opt', 'sad'):
    hashtag_counts[f'{emotion}_ratio'] = hashtag_counts[f'count_{emotion}'] / hashtag_counts['count_total']

hashtag_counts = hashtag_counts.sort_values(by='count_total', ascending=False)

print(hashtag_counts)

# Keep only hashtags that occur more than 3 times.
significant_hashtags = hashtag_counts[hashtag_counts['count_total'] > 3]
print(significant_hashtags)

Eseguiamo anche una Features Extraction analizzando gli hashtag ed estraendo:
- has_anger_hashtag
- has_joy_hashtag
- has_opt_hashtag
- has_sad_hashtag

In [None]:
# A hashtag is considered typical of an emotion when more than this share of
# the tweets using it carry that emotion's label.
HASHTAG_RATIO_THRESHOLD = 0.6


def _emotion_hashtag_flag(hashtag_lists, ratio_column, threshold=HASHTAG_RATIO_THRESHOLD):
    """Return a 0/1 Series: 1 when a tweet contains a hashtag typical of the emotion.

    The set of typical hashtags is built once from ``significant_hashtags``
    instead of re-filtering the frame for every tweet.
    """
    typical = set(significant_hashtags.index[significant_hashtags[ratio_column] > threshold])
    return hashtag_lists.apply(lambda tags: int(any(tag in typical for tag in tags)))


# The same "> threshold" rule is used for all four emotions. The joy and sad
# flags previously used "< 0.4", which marked hashtags NOT associated with
# those emotions.
for emotion in ('anger', 'joy', 'opt', 'sad'):
    data[f'has_{emotion}_hashtag'] = _emotion_hashtag_flag(data['hashtags'], f'{emotion}_ratio')

In [None]:
# Display the training frame with all the features extracted so far.
data

Calcoliamo la matrice di correlazione delle Features numeriche

In [None]:
# Keep the target aside and build a frame without the non-numeric columns
# (raw text, PoS dictionaries, hashtag lists).
labels = data["label"]
data2 = data.drop(columns=["text", "pos_counts", "hashtags"])

In [None]:
# Correlation heatmap of every numeric feature (the target column included).
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(data2.corr(), cmap="coolwarm", annot=True, ax=ax)

# Features Selection
Eseguiamo una Features Selection utilizzando RandomForestClassifier come modello e otteniamo le seguenti Features:
- VB
- NN
- word_count
- important_word_count
- text_length

In [None]:
# Remove the target from the feature matrix. A non in-place drop with
# errors="ignore" keeps this cell re-runnable.
data2 = data2.drop(columns="label", errors="ignore")

# Keep at most 5 features ranked by random-forest importance. The forest is
# seeded so that the selected set is the same on every run.
featureSel = SelectFromModel(
    estimator=RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42),
    max_features=5,
)
data3 = featureSel.fit_transform(data2, labels)
print(f"Features tenute: {featureSel.get_feature_names_out()}")

In [None]:
# Features the rest of the notebook (validation/test extraction, Dataset,
# ClassifierDeep) is wired to.
SELECTED_FEATURES = ["VB", "NN", "word_count", "important_word_count", "text_length"]

# Pasting these names onto the selector's output array would silently
# mislabel the columns (or fail on a shape mismatch) whenever the selector
# keeps a different set, so report any disagreement instead.
kept_features = list(featureSel.get_feature_names_out())
if kept_features != SELECTED_FEATURES:
    print(f"WARNING: the selector kept {kept_features}, "
          f"but the pipeline uses {SELECTED_FEATURES}")

# Take the columns by name from the full frame so each one holds the right values.
data3 = data[["text", *SELECTED_FEATURES, "label"]].copy()
data3

# Caricamento Dataset Validation
Eseguiamo le stesse procedure di Features Extraction sul validation solo per quelle selezionate in fase di Features Selection


In [None]:
# Training inputs: text, the five selected numeric features and the target.
x_train = data3['text']
y_train = data3['label']
VB = data3['VB']
NN = data3['NN']
word_count = data3['word_count']
important_word_count = data3['important_word_count']
text_length = data3['text_length']

with open("/content/drive/MyDrive/NLP/Challenge_2024/emotion/val_text.txt", 'r', encoding='utf-8') as f:
    tweets = f.readlines()

tweets = [tweet.strip() for tweet in tweets]
val_text_df = pd.DataFrame(tweets, columns=['text'])

with open("/content/drive/MyDrive/NLP/Challenge_2024/emotion/val_labels.txt", 'r', encoding='utf-8') as f:
    labels = f.readlines()

labels2 = [int(label.strip()) for label in labels]

# Demojize BEFORE extracting features, exactly as done for the training set.
# Otherwise the model is validated on raw emoji text, and on PoS/length
# features computed differently from the ones it was trained on.
val_text_df['text'] = val_text_df['text'].apply(emoji.demojize)

val_text_df['pos_counts'] = val_text_df['text'].apply(extract_pos_counts)
df_pos = val_text_df['pos_counts'].apply(pd.Series)
val_text_df = pd.concat([val_text_df, df_pos], axis=1)
val_text_df['word_count'] = val_text_df['text'].apply(lambda x: len(x.split()))
val_text_df['important_word_count'] = val_text_df['JJ'] + val_text_df['NN'] + val_text_df['RB'] + val_text_df['VB']
val_text_df['text_length'] = val_text_df['text'].apply(len)
# JJ and RB are only needed for important_word_count; they are not model inputs.
val_text_df = val_text_df.drop(columns=["pos_counts", "JJ", "RB"])
val_text_df["label"] = labels2
val_text_df

In [None]:
# Split the validation frame into one Series per model input, plus the target.
(x_val, VB_val, NN_val, word_count_val,
 important_word_count_val, text_length_val, label_val) = (
    val_text_df[column]
    for column in ('text', 'VB', 'NN', 'word_count',
                   'important_word_count', 'text_length', 'label')
)

In [None]:
# Translation table turning carriage returns and newlines into single spaces.
_LINE_BREAKS = str.maketrans({'\r': ' ', '\n': ' '})

x_train = x_train.apply(lambda tweet: tweet.translate(_LINE_BREAKS))
x_val = x_val.apply(lambda tweet: tweet.translate(_LINE_BREAKS))

# Creazione e Addestramento del modello
Creiamo una classe Dataset e una classe ClassifierDeep per inizializzare una rete neurale e i nostri Dataset di Train e Validation.

In [None]:
# NLTK resources: the punkt tokenizer data and the stopword corpus,
# both used by the dataset classes in this notebook.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

class Dataset(torch.utils.data.Dataset):
    """Labelled dataset pairing cleaned tweet text with five numeric features.

    Each item is the tuple
    ``(text, VB, NN, word_count, important_word_count, text_length, label)``
    where ``text`` is a lower-cased, space-joined string of NLTK tokens and
    the other entries are 0-d numpy arrays.
    """

    def __init__(self, x, VB, NN, word_count, important_word_count, text_length, y, stopwords):
        """Tokenize the texts and store features and labels as tensors.

        Args:
            x: iterable of raw tweet strings.
            VB, NN, word_count, important_word_count, text_length: iterables
                of per-tweet numeric features, aligned with ``x``.
            y: iterable of integer labels, aligned with ``x``.
            stopwords: when truthy, English stopwords are removed from the text.
        """
        tokens_litt = [nltk.word_tokenize(text, language='english')
                       for text in list(x)]

        # Load the stopword list once and keep it in a set: reloading the
        # corpus list for every single token is needlessly slow.
        stop_set = set(nltk.corpus.stopwords.words("english")) if stopwords else frozenset()

        text_clean = []
        for sentence in tqdm(tokens_litt, desc='Tokenizing ... '):
            lowered = (w.lower() for w in sentence)
            text_clean.append(' '.join(w for w in lowered if w not in stop_set))

        self.texts = text_clean
        self.labels = [torch.tensor(label) for label in y]
        self.VB = [torch.tensor(value) for value in VB]
        self.NN = [torch.tensor(value) for value in NN]
        self.word_count = [torch.tensor(value) for value in word_count]
        self.important_word_count = [torch.tensor(value) for value in important_word_count]
        self.text_length = [torch.tensor(value) for value in text_length]

    def classes(self):
        """Return the list of label tensors, one per sample."""
        return self.labels

    def __len__(self):
        """Number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as (text, five features, label)."""
        batch_texts = self.texts[idx]
        batch_labels = np.array(self.labels[idx])
        batch_VB = np.array(self.VB[idx])
        batch_NN = np.array(self.NN[idx])
        batch_word_count = np.array(self.word_count[idx])
        batch_important_word_count = np.array(self.important_word_count[idx])
        batch_text_length = np.array(self.text_length[idx])

        return batch_texts, batch_VB, batch_NN, batch_word_count, batch_important_word_count, batch_text_length, batch_labels

In [None]:
# Global configuration dictionary.
# In the cells visible in this notebook only "stopwords", "language_model",
# "h_dim", "patience" and "min_delta" are read from it; epochs, learning
# rate, batch size and dropout used for training come from the grid-search
# cell instead.
hyperparameters = {
    "epochs": 5,
    "learning_rate": 1e-5,
    "batch_size": 16,
    "dropout": 0.1,
    "stopwords": False,  # passed to the dataset classes: remove English stopwords when True
    "language_model": "vinai/bertweet-base",  # name passed to AutoModel / AutoTokenizer
    "layers": 1,  # NOTE(review): not referenced by the visible code
    "h_dim": 768,  # hdim argument of ClassifierDeep
    "bilstm": True,  # NOTE(review): not referenced by the visible code
    "patience": 5,  # EarlyStopping patience
    "min_delta": 0.01  # EarlyStopping min_delta
}

In [None]:
# Wrap the training and validation inputs into torch datasets; both use the
# same stopword setting.
train_dataset = Dataset(
    x=x_train,
    VB=VB,
    NN=NN,
    word_count=word_count,
    important_word_count=important_word_count,
    text_length=text_length,
    y=y_train,
    stopwords=hyperparameters["stopwords"],
)
val_dataset = Dataset(
    x=x_val,
    VB=VB_val,
    NN=NN_val,
    word_count=word_count_val,
    important_word_count=important_word_count_val,
    text_length=text_length_val,
    y=label_val,
    stopwords=hyperparameters["stopwords"],
)

In [None]:
# Number of hand-crafted numeric features appended to the language-model embedding.
extra_features = 5


class ClassifierDeep(nn.Module):
    """Pretrained language model followed by an MLP head.

    The hidden state of the first token is concatenated with five numeric
    features and fed to a three-layer classifier.
    """

    def __init__(self, labels, hdim, dropout, model_name):
        """
        Args:
            labels: number of output classes.
            hdim: width of the hidden layers; the first linear layer expects
                ``hdim + extra_features`` inputs.
            dropout: dropout probability used in the head.
            model_name: name passed to ``AutoConfig`` / ``AutoModel``.
        """
        super().__init__()
        self.lm_model = AutoModel.from_pretrained(
            model_name,
            config=AutoConfig.from_pretrained(model_name),
        )
        head_layers = [
            nn.Linear(hdim + extra_features, hdim),
            nn.BatchNorm1d(hdim),
            nn.Dropout(dropout),
            nn.ReLU(),
            nn.Linear(hdim, hdim),
            nn.Dropout(dropout),
            nn.ReLU(),
            nn.Linear(hdim, labels),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_id_text, attention_mask, VB, NN, word_count, important_word_count, text_length):
        """Return the class scores for a batch.

        The five feature tensors each get a new dimension 1 and are
        concatenated, in argument order, after the first-token hidden state.
        """
        hidden_states = self.lm_model(input_id_text, attention_mask).last_hidden_state
        first_token = hidden_states[:, 0, :]
        extras = [
            feature.unsqueeze(1)
            for feature in (VB, NN, word_count, important_word_count, text_length)
        ]
        return self.classifier(torch.cat([first_token, *extras], dim=1))

In [None]:
class EarlyStopping:
    """Signal when the validation loss has stopped improving.

    A call counts as an improvement only when the loss is lower than the
    best value seen so far by more than ``min_delta``. After ``patience``
    consecutive calls without improvement, ``early_stop`` becomes True.
    """

    def __init__(self, patience=5, min_delta=0.0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0  # consecutive calls without improvement
        self.early_stop = False
        self.min_validation_loss = torch.inf  # best loss seen so far

    def __call__(self, validation_loss):
        """Update the internal state with the latest validation loss."""
        stalled = (validation_loss + self.min_delta) >= self.min_validation_loss
        if not stalled:
            # New best: remember it and restart the patience counter.
            self.min_validation_loss = validation_loss
            self.counter = 0
            return

        self.counter += 1
        if self.counter >= self.patience:
            self.early_stop = True
            print("Early stop!")

In [None]:
def train_loop(model, dataloader, tokenizer, loss, optimizer, device, scheduler):
    """Train ``model`` for one epoch.

    Args:
        model: network called as ``model(input_ids, attention_mask, VB, NN,
            word_count, important_word_count, text_length)``.
        dataloader: yields ``(texts, VB, NN, word_count,
            important_word_count, text_length, labels)`` batches.
        tokenizer: callable turning a list of strings into a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        loss: loss function called as ``loss(output, labels)``.
        optimizer: optimizer stepped once per batch.
        device: device the batch tensors are moved to.
        scheduler: optional learning-rate scheduler; when given it is stepped
            once per BATCH (not per epoch). May be ``None``.

    Returns:
        ``(mean loss over the batches, number of correctly classified samples)``.
        The second value is a count; the caller normalises it.
    """
    model.train()

    epoch_acc = 0
    epoch_loss = 0

    for batch_texts, batch_VB, batch_NN, batch_word_count, batch_important_word_count, batch_text_length, batch_labels in tqdm(dataloader, desc='training set'):

        optimizer.zero_grad()

        # Every text is padded/truncated to exactly 128 tokens.
        tokens = tokenizer(list(batch_texts), add_special_tokens=True, return_tensors='pt', padding='max_length', max_length=128, truncation=True)
        input_id_texts = tokens['input_ids'].squeeze(1).to(device)
        mask_texts = tokens['attention_mask'].squeeze(1).to(device)

        # The numeric features reach the loader as integer tensors; the
        # network concatenates them with float embeddings.
        batch_VB = batch_VB.float().to(device)
        batch_NN = batch_NN.float().to(device)
        batch_word_count = batch_word_count.float().to(device)
        batch_important_word_count = batch_important_word_count.float().to(device)
        batch_text_length = batch_text_length.float().to(device)
        batch_labels = batch_labels.to(device)

        output = model(input_id_texts, mask_texts, batch_VB, batch_NN, batch_word_count, batch_important_word_count, batch_text_length).squeeze(1)

        batch_loss = loss(output, batch_labels)
        batch_loss.backward()
        optimizer.step()
        # train_test() defaults scheduler to None, so it must be optional here.
        if scheduler is not None:
            scheduler.step()
        epoch_loss += batch_loss.item()

        preds = output.argmax(dim=1)
        epoch_acc += (preds == batch_labels).sum().item()

    return epoch_loss / len(dataloader), epoch_acc

In [None]:
def test_loop(model, dataloader, tokenizer, loss, device):
    """Evaluate ``model`` on ``dataloader`` without computing gradients.

    Batches have the layout ``(texts, VB, NN, word_count,
    important_word_count, text_length, labels)``.

    Returns:
        ``(mean loss over the batches, number of correct predictions,
        macro-averaged F1 score)``.
    """
    model.eval()

    correct = 0
    running_loss = 0

    all_preds = []
    all_labels = []

    with torch.no_grad():

        for batch in tqdm(dataloader, desc='dev set'):
            # First entry: texts; last entry: labels; in between: the numeric features.
            batch_texts, *feature_columns, batch_labels = batch

            tokens = tokenizer(list(batch_texts), add_special_tokens=True, return_tensors='pt', padding='max_length', max_length=128, truncation=True)
            input_id_texts = tokens['input_ids'].squeeze(1).to(device)
            mask_texts = tokens['attention_mask'].squeeze(1).to(device)
            features = [column.float().to(device) for column in feature_columns]
            batch_labels = batch_labels.to(device)

            output = model(input_id_texts, mask_texts, *features).squeeze(1)

            running_loss += loss(output, batch_labels).item()

            preds = output.argmax(dim=1)
            correct += (preds == batch_labels).sum().item()

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch_labels.cpu().numpy())

    f1 = f1_score(all_labels, all_preds, average="macro")

    return running_loss / len(dataloader), correct, f1

In [None]:
def train_test(model, epochs, optimizer, device, train_data,
               batch_size, model_name, train_loss_fn,
               test_loss_fn=None,
               early_stopping=None,
               val_data=None,
               scheduler=None):
    """Train ``model`` for up to ``epochs`` epochs, validating after each one.

    Args:
        model: network to train.
        epochs: maximum number of epochs.
        optimizer: optimizer passed to ``train_loop``.
        device: device used for the batches.
        train_data: training dataset (shuffled every epoch).
        batch_size: batch size of both data loaders.
        model_name: name used to load the tokenizer.
        train_loss_fn: loss used for training.
        test_loss_fn: loss used for validation; defaults to ``train_loss_fn``.
        early_stopping: optional callable fed the validation loss after each
            epoch; training stops when its ``early_stop`` attribute is set.
            Requires ``val_data``.
        val_data: optional validation dataset. Validation runs whenever it is
            given, with or without early stopping.
        scheduler: optional learning-rate scheduler passed to ``train_loop``.

    Returns:
        ``(train_loss, validation_loss, train_acc, validation_acc, f1)`` where
        the first four are per-epoch lists and ``f1`` is the macro F1 of the
        last validated epoch (``None`` when no validation was run).

    Raises:
        ValueError: if ``early_stopping`` is given without ``val_data``.
    """
    has_validation = val_data is not None
    if early_stopping is not None and not has_validation:
        raise ValueError("early_stopping requires val_data")

    train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_data, batch_size=batch_size) if has_validation else None

    if test_loss_fn is None:
        test_loss_fn = train_loss_fn

    train_loss = []
    validation_loss = []

    train_acc = []
    validation_acc = []

    # Stays None when there is no validation set; otherwise holds the latest score.
    f1 = None

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    for epoch in tqdm(range(1, epochs + 1)):

        epoch_train_loss, epoch_train_acc = train_loop(model, train_dataloader, tokenizer, train_loss_fn, optimizer, device, scheduler)
        train_loss.append(epoch_train_loss)
        train_acc.append(epoch_train_acc / len(train_data))

        val_loss_str = ' '
        val_acc_str = ' '
        if has_validation:
            epoch_validate_loss, epoch_validate_acc, f1 = test_loop(model, val_dataloader, tokenizer, test_loss_fn, device)
            validation_loss.append(epoch_validate_loss)
            validation_acc.append(epoch_validate_acc / len(val_data))

            val_loss_str = f'Validation loss: {epoch_validate_loss:6.4f} '
            val_acc_str = f'Validation accuracy: {(epoch_validate_acc/len(val_data)):6.4f} '
            print(f"\nF1 score: {f1:.4f}")

        print(f"\nTrain loss: {epoch_train_loss:6.4f} ---- {val_loss_str}")
        print(f"Train accuracy: {(epoch_train_acc/len(train_data)):6.4f} ---- {val_acc_str}")

        if early_stopping is not None:
            early_stopping(epoch_validate_loss)
            if early_stopping.early_stop:
                break

    return train_loss, validation_loss, train_acc, validation_acc, f1

Eseguiamo una Parameter Grid per trovare i migliori Iper-Parametri del modello. I commenti indicano gli Iper-parametri scartati.

In [None]:
# Values explored by the grid search; the trailing comments list the
# candidates that were tried and discarded.
param_grid = {
    "epochs":[50], # [10,35,50]
    "learning_rate": [1e-4], # [1e-3,1e-4,1e-5]
    "batch_size": [32], # [16,32]
    "dropout": [0.2], # [0.1,0.2]
    "weight_decay": [0.0001] # [0.0001,0.001]
}

# Results of the best combination found so far (ranked by validation macro F1).
best_f1 = 0
best_train_loss = 0
best_val_loss = 0
best_train_acc = 0
best_val_acc = 0
best_params={}
best_state_dict = None  # CPU copy of the best model's weights
f1_value=0

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

grid = ParameterGrid(param_grid)
for params in grid:
  print("----------------------------------------------------------------")
  print(params)
  print(f"Using {device} device")

  model = ClassifierDeep(numLabels,
                      hyperparameters["h_dim"],
                      params["dropout"],
                      hyperparameters["language_model"]).to(device)

  print(model)

  total_params = sum(p.numel() for p in model.parameters())
  print(f"Numero totale dei parametri: {total_params}")

  criterion = nn.CrossEntropyLoss()
  optimizer = AdamW(model.parameters(), lr=params["learning_rate"], weight_decay=params["weight_decay"])
  # NOTE(review): train_loop steps the scheduler once per batch, so with
  # total_iters=5 the learning rate reaches 0.1x after the first 5 batches.
  # Confirm this is intended rather than a per-epoch decay.
  scheduler = torch.optim.lr_scheduler.LinearLR(optimizer=optimizer, start_factor=1, end_factor=0.1, total_iters=5)

  early_stopping = EarlyStopping(patience=hyperparameters['patience'], min_delta=hyperparameters['min_delta'])

  train_loss, validation_loss, train_acc, validation_acc, f1_value = train_test(model,
                                                                                params['epochs'],
                                                                                optimizer, device, train_dataset,
                                                                                params['batch_size'], hyperparameters['language_model'],
                                                                                criterion, criterion, early_stopping,val_dataset, scheduler=scheduler)

  if f1_value > best_f1:
    best_f1 = f1_value
    best_params = copy.deepcopy(params)
    best_train_loss = train_loss
    best_val_loss = validation_loss
    best_train_acc = train_acc
    best_val_acc = validation_acc
    # Snapshot the weights on the CPU so the best model survives later iterations.
    best_state_dict = {name: tensor.detach().cpu().clone()
                       for name, tensor in model.state_dict().items()}

# `model` is the network of the LAST combination. Load the best weights into
# it so that the cells that save the model and predict on the test set use
# the best-scoring run (the architecture is the same for every combination).
if best_state_dict is not None:
  model.load_state_dict(best_state_dict)

torch.cuda.empty_cache()

In [None]:
# Report the best validation F1 together with the combination that produced
# it: `best_params`, not `params`, which is just the last combination tried.
print("Parametri best F1:\n")
print(f"F1 score: {best_f1:.4f}\n")
print(f"Iperparametri migliori: {best_params}\n")

In [None]:
# Learning curves of the best run: loss on the left, accuracy on the right.
fig, axs = plt.subplots(1, 2, figsize=(20, 10))
loss_ax, acc_ax = axs

loss_ax.plot(best_train_loss, label='training loss')
loss_ax.plot(best_val_loss, label='validation loss')
loss_ax.legend(loc='upper right')
# Only the lower bound is fixed: a cross-entropy loss can exceed 1, and a
# (0, 1) window would cut those points off.
loss_ax.set_ylim(bottom=0)
loss_ax.set(title='Loss', xlabel='epoch', ylabel='loss')

acc_ax.plot(best_train_acc, label='training accuracy')
acc_ax.plot(best_val_acc, label='validation accuracy')
acc_ax.legend(loc='lower right')
acc_ax.set_ylim(0, 1)
acc_ax.set(title='Accuracy', xlabel='epoch', ylabel='accuracy')

In [None]:
# Persist the network bound to `model` by the grid-search cell:
# first the whole module object, then only its state dict (weights).
torch.save(model, "modelloClassificazione.pth")
torch.save(model.state_dict(), "pesiClassificazione.pth")

# Caricamento dataset di Test e calcolo predizioni
Eseguiamo gli stessi passaggi riportati sopra, al fine di ottenere le predizioni sul Test da inviare alla piattaforma


In [None]:
# Read the unlabelled test tweets, one per line, into a single-column frame.
datas = "/content/drive/MyDrive/NLP/Challenge_2024/emotion/test_text.txt"
with open(datas, 'r', encoding='utf-8') as text_file:
    tweets = [line.strip() for line in text_file.readlines()]

testdata = pd.DataFrame({'text': tweets})

testdata

In [None]:
# Demojize FIRST: for the training set the PoS counts, word count and text
# length were all computed on the demojized text, so the test features must
# be computed on the same representation.
testdata['text'] = testdata['text'].apply(emoji.demojize)

testdata['pos_counts'] = testdata['text'].apply(extract_pos_counts)
df_pos = testdata['pos_counts'].apply(pd.Series)
testdata = pd.concat([testdata, df_pos], axis=1)
testdata['word_count'] = testdata['text'].apply(lambda x: len(x.split()))
testdata['important_word_count'] = testdata['JJ'] + testdata['NN'] + testdata['RB'] + testdata['VB']
testdata['text_length'] = testdata['text'].apply(len)
# JJ and RB are only needed for important_word_count; they are not model inputs.
testdata = testdata.drop(columns=["pos_counts", "JJ", "RB"])
testdata

In [None]:
class TestDataset(torch.utils.data.Dataset):
    """Unlabelled dataset pairing cleaned tweet text with five numeric features.

    Each item is the tuple
    ``(text, VB, NN, word_count, important_word_count, text_length)`` where
    ``text`` is a lower-cased, space-joined string of NLTK tokens and the
    other entries are 0-d numpy arrays.
    """

    def __init__(self, x, VB, NN, word_count, important_word_count, text_length, stopwords):
        """Tokenize the texts and store the features as tensors.

        Args:
            x: iterable of raw tweet strings.
            VB, NN, word_count, important_word_count, text_length: iterables
                of per-tweet numeric features, aligned with ``x``.
            stopwords: when truthy, English stopwords are removed from the text.
        """
        tokens_litt = [nltk.word_tokenize(text, language='english')
                       for text in list(x)]

        # Load the stopword list once and keep it in a set: reloading the
        # corpus list for every single token is needlessly slow.
        stop_set = set(nltk.corpus.stopwords.words("english")) if stopwords else frozenset()

        text_clean = []
        for sentence in tqdm(tokens_litt, desc='Tokenizing ... '):
            lowered = (w.lower() for w in sentence)
            text_clean.append(' '.join(w for w in lowered if w not in stop_set))

        self.texts = text_clean
        self.VB = [torch.tensor(value) for value in VB]
        self.NN = [torch.tensor(value) for value in NN]
        self.word_count = [torch.tensor(value) for value in word_count]
        self.important_word_count = [torch.tensor(value) for value in important_word_count]
        self.text_length = [torch.tensor(value) for value in text_length]

    def __len__(self):
        """Number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as (text, five features)."""
        batch_texts = self.texts[idx]
        batch_VB = np.array(self.VB[idx])
        batch_NN = np.array(self.NN[idx])
        batch_word_count = np.array(self.word_count[idx])
        batch_important_word_count = np.array(self.important_word_count[idx])
        batch_text_length = np.array(self.text_length[idx])

        return batch_texts, batch_VB, batch_NN, batch_word_count, batch_important_word_count, batch_text_length

In [None]:
def get_predictions(model, dataloader, tokenizer, device):
    """Return the predicted class index for every sample of ``dataloader``.

    Batches have the layout ``(texts, VB, NN, word_count,
    important_word_count, text_length)``; no labels are expected. The model
    is put in eval mode and run without gradient tracking.

    Returns:
        list with one predicted class index per sample, in loader order.
    """
    model.eval()

    all_preds = []

    with torch.no_grad():

        for batch in tqdm(dataloader, desc='dev set'):
            # First entry: texts; the remaining five: numeric features.
            batch_texts, *feature_columns = batch

            tokens = tokenizer(list(batch_texts), add_special_tokens=True, return_tensors='pt', padding='max_length', max_length=128, truncation=True)
            input_id_texts = tokens['input_ids'].squeeze(1).to(device)
            mask_texts = tokens['attention_mask'].squeeze(1).to(device)
            features = [column.float().to(device) for column in feature_columns]

            output = model(input_id_texts, mask_texts, *features).squeeze(1)

            all_preds.extend(output.argmax(dim=1).cpu().numpy())

    return all_preds


In [None]:
# Same newline clean-up applied to the training and validation texts.
x_test = testdata['text'].apply(lambda x: x.replace('\r', ' ').replace('\n', ' '))
VB_test = testdata['VB']
NN_test = testdata['NN']
word_count_test = testdata['word_count']
important_word_count_test = testdata['important_word_count']
text_length_test = testdata['text_length']
test_dataset = TestDataset(x_test, VB_test, NN_test, word_count_test, important_word_count_test, text_length_test, hyperparameters["stopwords"])

# Load the tokenizer of the configured language model, so it always matches
# the one the network was trained with.
tokenizer = AutoTokenizer.from_pretrained(hyperparameters["language_model"])
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=32)

# Predict with the trained model and write one prediction per row.
preds = get_predictions(model, test_dataloader, tokenizer, device)
preds_df = pd.DataFrame(preds, columns=["Prediction"])
preds_df.to_csv("preds.csv", index=False)

Nella leaderboard abbiamo ottenuto un punteggio pari a 81.99.
Siamo consapevoli che, tramite ulteriori ricerche sulle Features ed eseguendo una Grid-Search più approfondita, si possano ottenere risultati migliori. Inoltre, con un'ulteriore potenza di calcolo sarebbe stato possibile utilizzare modelli più grandi e più performanti, migliorando ulteriormente i risultati.