In [2]:
import pandas as pd
import numpy as np
import torch
from torch import cuda
from torch.utils.data import Dataset, DataLoader
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score, recall_score
from training_code import *
from load_data import initialize_test
from reading_datasets import read_test
from labels_to_ids import task5_labels_to_ids
import time
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [3]:
def main(model_load_location, dataset_location='../2022.07.07_task5/', max_len=256, batch_size=32):
    """Run one saved sequence classifier over the test split and return its predictions.

    Args:
        model_load_location: Path or identifier passed to ``from_pretrained``
            for both the tokenizer and the classification model.
        dataset_location: Directory handed to ``read_test``.
        max_len: First element of the ``(max_len, batch_size)`` tuple given to
            ``initialize_test`` (presumably the max token length -- confirm
            in load_data).
        batch_size: Second element of that tuple.

    Returns:
        Whatever ``testing`` returns for the test loader, unchanged.
    """
    # Read the test split and the label <-> id mapping used by the helpers
    test_data = read_test(dataset_location, split='test')
    labels_to_ids = task5_labels_to_ids

    # Use the GPU when one is visible, otherwise fall back to CPU
    device = 'cuda' if cuda.is_available() else 'cpu'

    tokenizer = AutoTokenizer.from_pretrained(model_load_location)
    model = AutoModelForSequenceClassification.from_pretrained(model_load_location)
    # Inference only: no optimizer is created because no parameters are updated here
    model.to(device)

    # shuffle=False is requested so predictions can be lined up with the input rows
    initialization_input = (max_len, batch_size)
    test_loader = initialize_test(tokenizer, initialization_input, test_data, labels_to_ids, shuffle=False)

    start = time.time()
    test_result = testing(model, test_loader, labels_to_ids, device)
    now = time.time()

    # Elapsed wall-clock time, reported in minutes
    print('TIME TO COMPLETE:', (now-start)/60 )
    print()

    return test_result

In [5]:
# Load the tab-separated test tweets and display them.
df = pd.read_csv("test.tsv", sep="\t")
df

Unnamed: 0,tweet_id,tweet_text
0,110704,#Noticias 1 fallecido por #COVID19 @JaenJunta...
1,202231,Lo unico q pienso es que con esto del coronavi...
2,304814,Cachen que hoy estaba leyéndole a mi mamá un t...
3,108220,#BLOGDeLaFamilia Para arrancar la semana con e...
4,304520,El que las trabaja!!!Eres muy cara de raja.. T...
...,...,...
6845,202689,"Me van a odiar en mi trabajo, soy quien los va..."
6846,110180,Estudian medicamentos senolíticos como alterna...
6847,109203,Lourdes Vázquez explica a NNGG-LB los efectos ...
6848,203260,Yo todavía me estoy riendo. Que pasa con @abc_...


In [9]:
if __name__ == '__main__':

    # Saved checkpoints (directory names under saved_models/) to evaluate
    models = [
        'bert-base-multilingual-uncased0',
        'bert-base-multilingual-uncased_oversampled3',
        'bert-base-multilingual-uncased-extremepositive0',
        'bert-base-multilingual-uncased-overandunder3',
        'bert-base-multilingual-cased4',
        'bert-base-multilingual-cased-overandunder1',
        'bert-base-spanish-wwm-uncased4',
        'bert-base-spanish-wwm-cased4',
        'bert-base-spanish-wwm-cased-overandunder2',
        'xlm-roberta-base3',
    ]

    # Start the results table with just the tweet ids from the test frame
    results = pd.DataFrame({'tweet_id': df['tweet_id']})

    for model in models:

        print(f'Testing {model}')

        model_load_location = f'../2022.07.07_task5/saved_models/{model}'
        test_result = main(model_load_location)

        # Always inserted at position 1, so the most recent model's column
        # ends up immediately after tweet_id
        results.insert(loc=1, column=f'{model}prediction', value=test_result['label'])

    print("Everything successfully completed")


Testing bert-base-multilingual-uncased0
TESTING DATA
Went through 100 steps
Went through 100 steps
Went through 100 steps
TIME TO COMPLETE: 1.872516926129659

Testing bert-base-multilingual-uncased_oversampled3
TESTING DATA
Went through 100 steps
Went through 100 steps
Went through 100 steps
TIME TO COMPLETE: 1.9630619406700134

Testing bert-base-multilingual-uncased-extremepositive0
TESTING DATA
Went through 100 steps
Went through 100 steps
Went through 100 steps
TIME TO COMPLETE: 1.9576611598332723

Testing bert-base-multilingual-uncased-overandunder3
TESTING DATA
Went through 100 steps
Went through 100 steps
Went through 100 steps
TIME TO COMPLETE: 1.9573943614959717

Testing bert-base-multilingual-cased4
TESTING DATA
Went through 100 steps
Went through 100 steps
Went through 100 steps
TIME TO COMPLETE: 1.9566962798436482

Testing bert-base-multilingual-cased-overandunder1
TESTING DATA
Went through 100 steps
Went through 100 steps
Went through 100 steps
TIME TO COMPLETE: 1.956835512

In [10]:
# Show the combined table: tweet_id followed by one prediction column per model.
results

Unnamed: 0,tweet_id,xlm-roberta-base3prediction,bert-base-spanish-wwm-cased-overandunder2prediction,bert-base-spanish-wwm-cased4prediction,bert-base-spanish-wwm-uncased4prediction,bert-base-multilingual-cased-overandunder1prediction,bert-base-multilingual-cased4prediction,bert-base-multilingual-uncased-overandunder3prediction,bert-base-multilingual-uncased-extremepositive0prediction,bert-base-multilingual-uncased_oversampled3prediction,bert-base-multilingual-uncased0prediction
0,110704,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions
1,202231,Self_reports,Self_reports,Self_reports,Self_reports,Self_reports,Self_reports,Self_reports,Self_reports,Self_reports,Self_reports
2,304814,non-personal_reports,non-personal_reports,non-personal_reports,non-personal_reports,non-personal_reports,non-personal_reports,non-personal_reports,non-personal_reports,non-personal_reports,non-personal_reports
3,108220,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions
4,304520,non-personal_reports,non-personal_reports,non-personal_reports,non-personal_reports,non-personal_reports,non-personal_reports,non-personal_reports,non-personal_reports,non-personal_reports,non-personal_reports
...,...,...,...,...,...,...,...,...,...,...,...
6845,202689,Self_reports,Self_reports,Self_reports,Self_reports,non-personal_reports,Self_reports,Self_reports,Self_reports,Self_reports,Self_reports
6846,110180,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions
6847,109203,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions
6848,203260,Self_reports,Self_reports,Self_reports,Self_reports,Self_reports,Self_reports,Self_reports,Self_reports,Self_reports,Self_reports


In [11]:
# Persist the per-model predictions as a tab-separated file, without the index column.
results.to_csv(
    'test_results_all_models.tsv',
    index=False,
    sep="\t",
)

In [12]:
# Reload the saved per-model predictions.
# NOTE: this rebinds `df`, which until now held the raw test tweets.
df = pd.read_csv('test_results_all_models.tsv', sep='\t')

In [13]:
# Inspect the predictions as reloaded from the TSV.
df

Unnamed: 0,tweet_id,xlm-roberta-base3prediction,bert-base-spanish-wwm-cased-overandunder2prediction,bert-base-spanish-wwm-cased4prediction,bert-base-spanish-wwm-uncased4prediction,bert-base-multilingual-cased-overandunder1prediction,bert-base-multilingual-cased4prediction,bert-base-multilingual-uncased-overandunder3prediction,bert-base-multilingual-uncased-extremepositive0prediction,bert-base-multilingual-uncased_oversampled3prediction,bert-base-multilingual-uncased0prediction
0,110704,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions
1,202231,Self_reports,Self_reports,Self_reports,Self_reports,Self_reports,Self_reports,Self_reports,Self_reports,Self_reports,Self_reports
2,304814,non-personal_reports,non-personal_reports,non-personal_reports,non-personal_reports,non-personal_reports,non-personal_reports,non-personal_reports,non-personal_reports,non-personal_reports,non-personal_reports
3,108220,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions
4,304520,non-personal_reports,non-personal_reports,non-personal_reports,non-personal_reports,non-personal_reports,non-personal_reports,non-personal_reports,non-personal_reports,non-personal_reports,non-personal_reports
...,...,...,...,...,...,...,...,...,...,...,...
6845,202689,Self_reports,Self_reports,Self_reports,Self_reports,non-personal_reports,Self_reports,Self_reports,Self_reports,Self_reports,Self_reports
6846,110180,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions
6847,109203,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions,Lit-News_mentions
6848,203260,Self_reports,Self_reports,Self_reports,Self_reports,Self_reports,Self_reports,Self_reports,Self_reports,Self_reports,Self_reports


In [14]:
# Encode the label names as integer ids so a per-row vote can be computed.
df = df.replace({
    'Lit-News_mentions': 0,
    'non-personal_reports': 1,
    'Self_reports': 2,
})
df

Unnamed: 0,tweet_id,xlm-roberta-base3prediction,bert-base-spanish-wwm-cased-overandunder2prediction,bert-base-spanish-wwm-cased4prediction,bert-base-spanish-wwm-uncased4prediction,bert-base-multilingual-cased-overandunder1prediction,bert-base-multilingual-cased4prediction,bert-base-multilingual-uncased-overandunder3prediction,bert-base-multilingual-uncased-extremepositive0prediction,bert-base-multilingual-uncased_oversampled3prediction,bert-base-multilingual-uncased0prediction
0,110704,0,0,0,0,0,0,0,0,0,0
1,202231,2,2,2,2,2,2,2,2,2,2
2,304814,1,1,1,1,1,1,1,1,1,1
3,108220,0,0,0,0,0,0,0,0,0,0
4,304520,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...
6845,202689,2,2,2,2,1,2,2,2,2,2
6846,110180,0,0,0,0,0,0,0,0,0,0
6847,109203,0,0,0,0,0,0,0,0,0,0
6848,203260,2,2,2,2,2,2,2,2,2,2


In [18]:
# Majority vote across the per-model prediction columns.
# Vote columns are picked by their 'prediction' suffix instead of by position,
# so re-running this cell never counts an existing 'majority_label' column as
# an extra vote.
# Ties: the row-wise modes come back in ascending order, so [0] resolves a tie
# to the smallest label id.
prediction_columns = [column for column in df.columns if str(column).endswith('prediction')]
df['majority_label'] = df[prediction_columns].mode(axis=1)[0].astype(int)

In [19]:
# Inspect the frame with the majority_label column added.
df

Unnamed: 0,tweet_id,xlm-roberta-base3prediction,bert-base-spanish-wwm-cased-overandunder2prediction,bert-base-spanish-wwm-cased4prediction,bert-base-spanish-wwm-uncased4prediction,bert-base-multilingual-cased-overandunder1prediction,bert-base-multilingual-cased4prediction,bert-base-multilingual-uncased-overandunder3prediction,bert-base-multilingual-uncased-extremepositive0prediction,bert-base-multilingual-uncased_oversampled3prediction,bert-base-multilingual-uncased0prediction,majority_label
0,110704,0,0,0,0,0,0,0,0,0,0,0
1,202231,2,2,2,2,2,2,2,2,2,2,2
2,304814,1,1,1,1,1,1,1,1,1,1,1
3,108220,0,0,0,0,0,0,0,0,0,0,0
4,304520,1,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
6845,202689,2,2,2,2,1,2,2,2,2,2,2
6846,110180,0,0,0,0,0,0,0,0,0,0,0
6847,109203,0,0,0,0,0,0,0,0,0,0,0
6848,203260,2,2,2,2,2,2,2,2,2,2,2


In [24]:
# Keep only the tweet id and the ensemble vote for the submission.
submission = df.loc[:, ['tweet_id', 'majority_label']]

In [28]:
# Name the vote column 'label' for the submission file.
submission = submission.rename({'majority_label': 'label'}, axis='columns')
submission

Unnamed: 0,tweet_id,label
0,110704,0
1,202231,2
2,304814,1
3,108220,0
4,304520,1
...,...,...
6845,202689,2
6846,110180,0
6847,109203,0
6848,203260,2


In [30]:
# Map the integer class ids back to label names, touching only the 'label'
# column: a frame-wide replace would also rewrite any tweet_id equal to 0, 1 or 2.
id_to_label = {0: 'Lit-News_mentions', 1: 'non-personal_reports', 2: 'Self_reports'}
submission = submission.assign(label=submission['label'].replace(id_to_label))

In [31]:
# Final look at the submission frame: tweet_id plus the label name.
submission

Unnamed: 0,tweet_id,label
0,110704,Lit-News_mentions
1,202231,Self_reports
2,304814,non-personal_reports
3,108220,Lit-News_mentions
4,304520,non-personal_reports
...,...,...
6845,202689,Self_reports
6846,110180,Lit-News_mentions
6847,109203,Lit-News_mentions
6848,203260,Self_reports


In [32]:
# Write the final submission as a tab-separated file, without the index column.
submission.to_csv(
    'Final_submission.tsv',
    index=False,
    sep="\t",
)