In [4]:
from emoji import demojize
from nltk.tokenize import TweetTokenizer


tokenizer = TweetTokenizer()


def normalizeToken(token):
    lowercased_token = token.lower()
    if token.startswith("@"):
        return "@USER"
    elif lowercased_token.startswith("http") or lowercased_token.startswith("www"):
        return "HTTPURL"
    elif len(token) == 1:
        return demojize(token)
    else:
        if token == "’":
            return "'"
        elif token == "…":
            return "..."
        else:
            return token


def normalizeTweet(tweet):
    tokens = tokenizer.tokenize(tweet.replace("’", "'").replace("…", "..."))
    normTweet = " ".join([normalizeToken(token) for token in tokens])

    normTweet = (
        normTweet.replace("cannot ", "can not ")
        .replace("n't ", " n't ")
        .replace("n 't ", " n't ")
        .replace("ca n't", "can't")
        .replace("ai n't", "ain't")
    )
    normTweet = (
        normTweet.replace("'m ", " 'm ")
        .replace("'re ", " 're ")
        .replace("'s ", " 's ")
        .replace("'ll ", " 'll ")
        .replace("'d ", " 'd ")
        .replace("'ve ", " 've ")
    )
    normTweet = (
        normTweet.replace(" p . m .", "  p.m.")
        .replace(" p . m ", " p.m ")
        .replace(" a . m .", " a.m.")
        .replace(" a . m ", " a.m ")
    )

    return " ".join(normTweet.split())


if __name__ == "__main__":
    print(
        normalizeTweet(
            "SC has first two presumptive cases of coronavirus, DHEC confirms https://postandcourier.com/health/covid19/sc-has-first-two-presumptive-cases-of-coronavirus-dhec-confirms/article_bddfe4ae-5fd3-11ea-9ce4-5f495366cee6.html?utm_medium=social&utm_source=twitter&utm_campaign=user-share… via @postandcourier"
        )
    )


SC has first two presumptive cases of coronavirus , DHEC confirms HTTPURL ... via @USER


In [3]:
import csv
import pandas as pd

In [2]:
# import os
import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import BertForSequenceClassification
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.metrics import f1_score
#from TweetNormalizer import normalizeTweet
device = torch.device("cuda")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# data_dir = 'data/subtask-2-english/'
# train_file = data_dir + 'train_en.tsv'
# dev_file =  data_dir + 'dev_en.tsv'
# test_file = data_dir + 'test_en.tsv'

In [43]:
data_dir = ''
train_file = data_dir + 'translated_train_ml.tsv'
dev_file =  data_dir + 'translated_dev_ml.tsv'
test_file = data_dir + 'translated_test_ml.tsv'

In [5]:
class CustomTsvDataset(Dataset):
    def __init__(self, tsv_file, tokenizer, text_prefunc=None):
        self.data = pd.read_csv(tsv_file, sep='\t', quoting=csv.QUOTE_NONE)
        self.tokenizer = tokenizer
        self.text_transform = text_prefunc.preprocess if text_prefunc is not None else normalizeTweet

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]

        label = 1 if row.get('label', 0) == 'SUBJ' else 0
        label = torch.tensor([1 - label, 0, label], dtype=torch.float32)  # Convert label to one-hot encoded tensor
        # weight = torch.tensor([1.2 if row.get('solved_conflict', 0) == 'True' else 1])
        sentence = self.text_transform(row['sentence'])
        text_tokens = self.tokenizer(sentence, padding="max_length", truncation=True, max_length=128)
        input_ids = text_tokens["input_ids"]
        return torch.tensor(input_ids), label, row['sentence_id']



In [None]:

# data = pd.read_csv('data/subtask-2-english/train_en.tsv', sep='\t')

# data_filtered = data[data['label'].isin(['SUBJ', 'OBJ'])]

# num_subj_obj = (data_filtered['label'] == 'SUBJ').sum(), (data_filtered['label'] == 'OBJ').sum()
# print(f"Number of sentences with both SUBJ & OBJ labels: {num_subj_obj}")

# label_counts = data_filtered.groupby(['label', 'solved_conflict'])['sentence_id'].count().unstack(fill_value=0)
# print("\nSolved conflict counts for each label:")
# print(label_counts.to_string())


In [None]:
# model_name = 'bert-base-uncased'
# tokenizer = BertTokenizer.from_pretrained(model_name)
# model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

In [None]:
# model_name = "finetuned-sentiment-analysis-model"
# tokenizer = AutoTokenizer.from_pretrained("federicopascual/finetuned-sentiment-analysis-model")
# model = AutoModelForSequenceClassification.from_pretrained("federicopascual/finetuned-sentiment-analysis-model", num_labels=2)

In [7]:
model_name = "Sentiment-Analysis-BERT"
tokenizer = AutoTokenizer.from_pretrained("MarieAngeA13/Sentiment-Analysis-BERT")
model = AutoModelForSequenceClassification.from_pretrained("MarieAngeA13/Sentiment-Analysis-BERT")
model.load_state_dict(torch.load(f"weights/{model_name.replace('/','_')}_model_weights_ml.pth"))

<All keys matched successfully>

In [None]:
# model_name = "bertweet-sentiment-analysis"
# tokenizer = AutoTokenizer.from_pretrained("finiteautomata/bertweet-base-sentiment-analysis")
# model = AutoModelForSequenceClassification.from_pretrained("finiteautomata/bertweet-base-sentiment-analysis")

In [48]:
batch_size = 16
train_dataset = CustomTsvDataset(train_file, tokenizer)
dev_dataset = CustomTsvDataset(dev_file, tokenizer)
train_loader = DataLoader(train_dataset, batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size)

In [54]:
def run_baselines(train_loader, dev_loader, num_epochs=10):
    best_f1 = 0.8080
    best_model_weights = None  

    # Training loop
    for epoch in range(num_epochs):
        pbar = tqdm.tqdm(train_loader)
        pbar.set_description("Training Epoch_{}".format(epoch))
        model.train()
        for inputs, labels, ids in pbar:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            # print(outputs.logits.size(), labels.size())
            loss = criterion(outputs.logits.squeeze(), labels.squeeze())
            # loss = (loss*weights).mean()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Validation loop
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        predicted_labels = []
        true_labels = []

        with torch.no_grad():
            for inputs, labels, ids in dev_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                val_loss += criterion(outputs.logits.squeeze(), labels.squeeze()).item()
                predicted_probs = torch.sigmoid(outputs.logits)  # Apply sigmoid to get probabilities
                predicted = torch.argmax(predicted_probs, dim=1)  # Get index of maximum probability
                total += labels.size(0)
                correct += (predicted == torch.argmax(labels, dim=1)).sum().item()  # Compare indices for accuracy
                predicted_labels.extend(predicted.cpu().numpy())
                true_labels.extend(torch.argmax(labels, dim=1).cpu().numpy())

        val_loss /= len(dev_loader)
        val_accuracy = correct / total
        f1 = f1_score(true_labels, predicted_labels, average='macro')  # Calculate macro F1 score

        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {val_loss:.4f}, Accuracy: {val_accuracy:.4f}, Macro F1 Score: {f1:.4f}')

        # Check if the current F1 score is the best seen so far
        if f1 > best_f1:
            best_f1 = f1
            best_model_weights = model.state_dict()
            torch.save(best_model_weights, f"weights/{model_name.replace('/','_')}_model_weights_ml.pth")
            print('Best model weights saved.')


In [50]:
model.to(device)
num_epochs = 20
learning_rate = 2e-5
# label_smoothing = 0.2
# criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([1 - label_smoothing, label_smoothing])).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)


In [55]:
run_baselines(train_loader, dev_loader, num_epochs)

Training Epoch_0: 100%|██████████| 323/323 [00:27<00:00, 11.85it/s]


Epoch 1/20, Loss: 1.0164, Accuracy: 0.7780, Macro F1 Score: 0.7779


Training Epoch_1: 100%|██████████| 323/323 [00:27<00:00, 11.71it/s]


Epoch 2/20, Loss: 1.4783, Accuracy: 0.7700, Macro F1 Score: 0.7695


Training Epoch_2: 100%|██████████| 323/323 [00:27<00:00, 11.68it/s]


Epoch 3/20, Loss: 1.3421, Accuracy: 0.7640, Macro F1 Score: 0.7630


Training Epoch_3: 100%|██████████| 323/323 [00:27<00:00, 11.69it/s]


Epoch 4/20, Loss: 2.0554, Accuracy: 0.7220, Macro F1 Score: 0.7125


Training Epoch_4: 100%|██████████| 323/323 [00:27<00:00, 11.70it/s]


Epoch 5/20, Loss: 1.4300, Accuracy: 0.7700, Macro F1 Score: 0.7698


Training Epoch_5: 100%|██████████| 323/323 [00:27<00:00, 11.72it/s]


Epoch 6/20, Loss: 1.2294, Accuracy: 0.7620, Macro F1 Score: 0.7608


Training Epoch_6: 100%|██████████| 323/323 [00:27<00:00, 11.71it/s]


Epoch 7/20, Loss: 1.6283, Accuracy: 0.7520, Macro F1 Score: 0.7479


Training Epoch_7: 100%|██████████| 323/323 [00:27<00:00, 11.72it/s]


Epoch 8/20, Loss: 1.6304, Accuracy: 0.7840, Macro F1 Score: 0.7838


Training Epoch_8: 100%|██████████| 323/323 [00:27<00:00, 11.73it/s]


Epoch 9/20, Loss: 1.8024, Accuracy: 0.7660, Macro F1 Score: 0.7648


Training Epoch_9: 100%|██████████| 323/323 [00:27<00:00, 11.73it/s]


Epoch 10/20, Loss: 1.8473, Accuracy: 0.7680, Macro F1 Score: 0.7672


Training Epoch_10: 100%|██████████| 323/323 [00:27<00:00, 11.72it/s]


Epoch 11/20, Loss: 1.9441, Accuracy: 0.7740, Macro F1 Score: 0.7732


Training Epoch_11: 100%|██████████| 323/323 [00:27<00:00, 11.72it/s]


Epoch 12/20, Loss: 1.4736, Accuracy: 0.7620, Macro F1 Score: 0.7595


Training Epoch_12: 100%|██████████| 323/323 [00:27<00:00, 11.73it/s]


Epoch 13/20, Loss: 1.2704, Accuracy: 0.7880, Macro F1 Score: 0.7869


Training Epoch_13: 100%|██████████| 323/323 [00:27<00:00, 11.72it/s]


Epoch 14/20, Loss: 1.3373, Accuracy: 0.7820, Macro F1 Score: 0.7819


Training Epoch_14: 100%|██████████| 323/323 [00:27<00:00, 11.72it/s]


Epoch 15/20, Loss: 1.5392, Accuracy: 0.7600, Macro F1 Score: 0.7586


Training Epoch_15: 100%|██████████| 323/323 [00:27<00:00, 11.64it/s]


Epoch 16/20, Loss: 1.7495, Accuracy: 0.7560, Macro F1 Score: 0.7543


Training Epoch_16: 100%|██████████| 323/323 [00:27<00:00, 11.71it/s]


Epoch 17/20, Loss: 1.3566, Accuracy: 0.7560, Macro F1 Score: 0.7544


Training Epoch_17: 100%|██████████| 323/323 [00:27<00:00, 11.72it/s]


Epoch 18/20, Loss: 1.4619, Accuracy: 0.7720, Macro F1 Score: 0.7716


Training Epoch_18: 100%|██████████| 323/323 [00:27<00:00, 11.72it/s]


Epoch 19/20, Loss: 1.5165, Accuracy: 0.7460, Macro F1 Score: 0.7433


Training Epoch_19: 100%|██████████| 323/323 [00:27<00:00, 11.72it/s]


Epoch 20/20, Loss: 1.6828, Accuracy: 0.7580, Macro F1 Score: 0.7576


In [9]:
# model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)
# model.load_state_dict(torch.load(f"weights/{model_name.replace('/','_')}_model_weights.pth"))
model.to(device)
model.eval()
test_file = "translated_test_ml.tsv"
test_dataset = CustomTsvDataset(test_file, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=32)


In [11]:

test_predicted_labels = []
list_label = [ "OBJ", "OBJ", "SUBJ" ]
with open("subtask2A_multilingual.tsv", "w", encoding ='utf-8') as results_file:
	results_file.write("sentence_id\tlabel\n")
	with torch.no_grad():
		for inputs, labels, tweet_ids in test_loader:
			inputs, labels = inputs.to(device), labels.to(device)
			outputs = model(inputs)
			predicted_probs = torch.sigmoid(outputs.logits) 
			predicted = torch.argmax(predicted_probs, dim=1) 
			test_predicted_labels.extend(predicted.cpu().numpy())
			for i, line in enumerate(tweet_ids):
				label = list_label[predicted[i]]
				results_file.write("{}\t{}\n".format(line, label))
			

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


In [None]:
from deep_translator import GoogleTranslator
translator = GoogleTranslator(target="en") 

In [31]:
class CustomTsvDatasetTest(Dataset):
    def __init__(self, tsv_file):
        self.data = pd.read_csv(tsv_file, sep='\t', quoting=csv.QUOTE_NONE)
   
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        return row['sentence_id'], row["sentence_translation"], row["label"]



In [37]:
# translated_sentences = dict()
ids = []
sentences  =[]
labels = []

# id_to_label = dict()
	
for batch_id, batch_sentence, batch_label in test_loader:
	# assert(len(batch_id) == len(batch_sentence))
	for i in range(len(batch_id)):
		id = batch_id[i]
		sentence = batch_sentence[i]
		label = id_to_label[id]
		print(id)
		# translated_sentence = translator.translate(sentence)
		ids.append(id)
		sentences.append(sentence)
		labels.append(label)



MIS_702-curl_03_010
MIS_560-curl_04_005
MIS_2265-eurl_03_022
FAT_1139-eurl_01_023
MIS_427-curl_06_004
MIS_973-curl_02_005
MIS_2255-eurl_02_002
MIS_432-curl_02_007
MIS_2255-eurl_02_014
MIS_19-curl_03_005
AFP_903-curl_04_001
MIS_1049-curl_03_010
MIS_230-curl_05_011
MIS_1532-eurl_01_045
MIS_560-curl_04_015
MIS_1503-curl_03_003
MIS_1-curl_06_003
MIS_22-curl_01_012
MIS_2941-eurl_02_027
AFP_696-eurl_03_019
FAT_290-eurl_02_039
MIS_702-curl_06_010
AFP_29-eurl_02_019
MIS_1486-eurl_02_009
AFP_345-eurl_01_015
AFP_674-eurl_03_011
MIS_239-curl_05_005
AFP_584-eurl_02_022
MIS_2120-curl_04_003
MIS_239-curl_04_034
AFP_887-eurl_01_019
AFP_614-eurl_08_014
MIS_561-curl_02_013
AFP_637-eurl_01_007
AFP_425-eurl_04_011
MIS_1280-curl_01_002
AFP_887-eurl_01_011
FAT_1118-eurl_02_003
MIS_1585-eurl_02_015
MIS_6-eurl_03_003
MIS_774-curl_02_006
AFP_587-curl_03_010
AFP_588-eurl_01_002
AFP_696-eurl_03_023
MIS_702-curl_06_006
MIS_702-curl_02_015
MIS_2120-curl_04_013
FAT_1062-eurl_02_001
FAT_1352-curl_03_001
MIS_2585-eu

In [38]:
print(len(sentences))

5159


In [39]:
# ids = []
# sentences = []
# for id in translated_sentences:
# 	ids.append(id)
# 	sentences.append(translated_sentences[id])

df = pd.DataFrame({"sentence_id": ids, "sentence": sentences, "label": labels})
print(len(df))
df.to_csv("translated_train_ml_new.tsv", sep="\t", index=False) 


5159


model
epoch
lbel_smoothing
learning_rate
data_augment
attention_mask
textblob
resolved_conflict	

In [32]:
test_file = "translated_train_ml.tsv"
test_dataset = CustomTsvDatasetTest(test_file)
test_loader = DataLoader(test_dataset, batch_size=32)


In [33]:
print(len(test_loader.dataset))

5159
