In [1]:
import pandas as pd
import numpy as np
import torch
from torch import cuda
from torch.utils.data import Dataset, DataLoader
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score, recall_score
from training_code import *
from load_data import initialize_test
from reading_datasets import read_test
from labels_to_ids import task5_labels_to_ids
import time
import os
# Limit this process to GPU index 0. NOTE(review): this is set after `import torch`;
# it is presumably still honoured because CUDA has not been initialised yet — confirm.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [7]:
def main(model_load_location, dataset_location='../2022.07.07_task5/', max_len=256, batch_size=32):
    """Evaluate a saved sequence-classification model on the test split.

    Loads the tokenizer and model from ``model_load_location``, builds an
    unshuffled test loader and hands everything to ``testing``. No optimizer
    is created because nothing is trained here.

    Args:
        model_load_location: Path or identifier given to ``from_pretrained``
            for both the tokenizer and the model.
        dataset_location: Directory passed to ``read_test``. Defaults to the
            location this notebook has always used.
        max_len: First element of the ``(max_len, batch_size)`` tuple passed
            to ``initialize_test`` (presumably the maximum token length --
            confirm against ``load_data``).
        batch_size: Second element of that tuple.

    Returns:
        Whatever ``testing`` returns; the calling cell treats it as a pandas
        DataFrame containing a ``text`` column.
    """
    initialization_input = (max_len, batch_size)

    # Read the test split and select the label mapping for this task
    test_data = read_test(dataset_location, split='test')
    labels_to_ids = task5_labels_to_ids

    # Prefer the GPU when one is available, otherwise fall back to the CPU
    device = 'cuda' if cuda.is_available() else 'cpu'

    tokenizer = AutoTokenizer.from_pretrained(model_load_location)
    model = AutoModelForSequenceClassification.from_pretrained(model_load_location)
    model.to(device)

    # Request an unshuffled loader for the test data
    test_loader = initialize_test(tokenizer, initialization_input, test_data, labels_to_ids, shuffle=False)

    # perf_counter is monotonic, so the interval cannot be skewed by clock adjustments
    start = time.perf_counter()
    test_result = testing(model, test_loader, labels_to_ids, device)
    elapsed_minutes = (time.perf_counter() - start) / 60

    print('TIME TO COMPLETE:', elapsed_minutes)
    print()

    return test_result

In [9]:
if __name__ == '__main__':
    # Single source of truth for the checkpoint under test: every path and
    # message below is derived from these two values so they cannot drift apart.
    model = 'bert-base-multilingual-uncased'
    run_index = 0
    run_name = f'{model}{run_index}'

    test_print_statement = f'Testing {model} {run_index}'
    print(test_print_statement)

    model_load_location = f'../2022.07.07_task5/saved_models/{run_name}'

    # Trailing slash is deliberate: the result file names below are appended
    # directly, so they land inside this directory (with a leading underscore).
    result_save_location = f'../saved_test_result/{run_name}/'

    unformatted_result_save_location = result_save_location + '_unformatted_test_result.tsv'
    formatted_result_save_location = result_save_location + '_formatted_test_result.tsv'

    test_result = main(model_load_location)

    print("\n Testing results")
    print(test_result)

    # The formatted file is the same table without the raw tweet text
    formatted_test_result = test_result.drop(columns=['text'])

    os.makedirs(result_save_location, exist_ok=True)
    test_result.to_csv(unformatted_result_save_location, sep='\t', index=False)
    formatted_test_result.to_csv(formatted_result_save_location, sep='\t', index=False)

    print("Result files saved")

    print("Everything successfully completed")


Testing bert-base-multilingual-uncased 0
TESTING DATA
Went through 100 steps
Went through 100 steps
Went through 100 steps
TIME TO COMPLETE: 1.772943409283956


 Testing results
     tweet_id                                               text  \
0      110704  #Noticias 1 fallecido por #COVID19  @JaenJunta...   
1      202231  Lo unico q pienso es que con esto del coronavi...   
2      304814  Cachen que hoy estaba leyéndole a mi mamá un t...   
3      108220  #BLOGDeLaFamilia Para arrancar la semana con e...   
4      304520  El que las trabaja!!!Eres muy cara de raja.. T...   
...       ...                                                ...   
6845   202689  Me van a odiar en mi trabajo, soy quien los va...   
6846   110180  Estudian medicamentos senolíticos como alterna...   
6847   109203  Lourdes Vázquez explica a NNGG-LB los efectos ...   
6848   203260  Yo todavía me estoy riendo. Que pasa con @abc_...   
6849   202769  Como me falta la respiración nose si es porque...   

     