In [1]:
import pandas as pd
import numpy as np
import torch
from torch import cuda
from torch.utils.data import Dataset, DataLoader
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score, recall_score
from training_code import *
from load_data import initialize_test
from reading_datasets import read_test
from labels_to_ids import labels_to_ids_mal
import time
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [2]:
def main(model_load_location):
    max_len = 256
    batch_size = 32
    grad_step = 1
    learning_rate = 1e-05
    initialization_input = (max_len, batch_size)

    #Reading datasets and initializing data loaders
    dataset_location = '../datasets/task_a/'
    test_data = read_test(dataset_location , split = 'mal_sentiment_dev')

    labels_to_ids = labels_to_ids_mal
    input_data = (test_data, labels_to_ids)

    #Define tokenizer, model and optimizer
    device = 'cuda' if cuda.is_available() else 'cpu' #save the processing time

    tokenizer = AutoTokenizer.from_pretrained(model_load_location)
    model = AutoModelForSequenceClassification.from_pretrained(model_load_location)

    # unshuffled testing data
    optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
    model.to(device)

    # Getting testing dataloaders
    test_loader = initialize_test(tokenizer, initialization_input, test_data, labels_to_ids, shuffle = False)

    test_ind_f1 = 0
    test_ind_precision = 0
    test_ind_recall = 0

    start = time.time()

    # Run the model with unshuffled testing data
    test_result = testing(model, test_loader, labels_to_ids, device)

    now = time.time()

    print('TIME TO COMPLETE:', (now-start)/60 )
    print()

    return test_result

In [3]:
df = pd.read_table("../datasets/task_a/mal_sentiment_dev.tsv")
df

Unnamed: 0,text,category
0,Mammooka ninghal mass aa pwoli item,Positive
1,Waiting for Malayalam movie For Tamil paiyan,not-malayalam
2,ദളപതി ഫാൻസിന്റെ വക ഒരു ഒന്നൊന്നര വിജയാശംസകൾ...,Positive
3,#Trailer pwolichuuuu ഓണത്തിന് വന്നങ്ങു തകർത്തേ...,Positive
4,Mammoookkaaaa polichadukkiii katta waiting nv 21,Positive
...,...,...
1761,Aa ചിരി uff എന്റെ പൊന്നോ ഇക്ക vere ലെവൽ,Positive
1762,Katta katta katta katta waiting....cant wait man,Positive
1763,Arjun reddy bgm poole thonniyathu enniku mathr...,unknown_state
1764,Fahad ikka ithilum polikum en urappayi,Positive


In [4]:
if __name__ == '__main__':
    
    models = ['bert_base_reg_2','indic_reg_3','multi_reg_3','roberta_reg_0']
    
    results = pd.DataFrame()
    
    results.insert(loc=0, column = 'true_category', value = df['category'])
    
    for model in models: 
        
        test_print_statement = 'Testing ' + model
        print(test_print_statement)

        model_load_location = 'saved_models/malayalam/' + model

        test_result = main(model_load_location)
        results.insert(loc =1, column = model + 'prediction', value = test_result['label'])

    print("Everything successfully completed")


Testing bert_base_reg_2
TESTING DATA
Went through 100 steps
TIME TO COMPLETE: 0.423671547571818

Testing indic_reg_3
TESTING DATA
Went through 100 steps
TIME TO COMPLETE: 0.42467942237854006

Testing multi_reg_3
TESTING DATA
Went through 100 steps
TIME TO COMPLETE: 0.4457781553268433

Testing roberta_reg_0
TESTING DATA
Went through 100 steps
TIME TO COMPLETE: 0.41189691225687664

Everything successfully completed


In [5]:
results

Unnamed: 0,true_category,roberta_reg_0prediction,multi_reg_3prediction,indic_reg_3prediction,bert_base_reg_2prediction
0,Positive,,,,
1,not-malayalam,,,,
2,Positive,,,,
3,Positive,,,,
4,Positive,,,,
...,...,...,...,...,...
1761,Positive,,,,
1762,Positive,,,,
1763,unknown_state,,,,
1764,Positive,,,,


In [6]:
#results.to_csv('test_results_all_models.tsv', sep="\t", index = False)

In [7]:
#df = pd.read_table('test_results_all_models.tsv')

In [8]:
df

Unnamed: 0,text,category
0,Mammooka ninghal mass aa pwoli item,Positive
1,Waiting for Malayalam movie For Tamil paiyan,not-malayalam
2,ദളപതി ഫാൻസിന്റെ വക ഒരു ഒന്നൊന്നര വിജയാശംസകൾ...,Positive
3,#Trailer pwolichuuuu ഓണത്തിന് വന്നങ്ങു തകർത്തേ...,Positive
4,Mammoookkaaaa polichadukkiii katta waiting nv 21,Positive
...,...,...
1761,Aa ചിരി uff എന്റെ പൊന്നോ ഇക്ക vere ലെവൽ,Positive
1762,Katta katta katta katta waiting....cant wait man,Positive
1763,Arjun reddy bgm poole thonniyathu enniku mathr...,unknown_state
1764,Fahad ikka ithilum polikum en urappayi,Positive


In [9]:
df = df.replace(['Lit-News_mentions', 'non-personal_reports','Self_reports'], [0,1,2])
df

Unnamed: 0,text,category
0,Mammooka ninghal mass aa pwoli item,Positive
1,Waiting for Malayalam movie For Tamil paiyan,not-malayalam
2,ദളപതി ഫാൻസിന്റെ വക ഒരു ഒന്നൊന്നര വിജയാശംസകൾ...,Positive
3,#Trailer pwolichuuuu ഓണത്തിന് വന്നങ്ങു തകർത്തേ...,Positive
4,Mammoookkaaaa polichadukkiii katta waiting nv 21,Positive
...,...,...
1761,Aa ചിരി uff എന്റെ പൊന്നോ ഇക്ക vere ലെവൽ,Positive
1762,Katta katta katta katta waiting....cant wait man,Positive
1763,Arjun reddy bgm poole thonniyathu enniku mathr...,unknown_state
1764,Fahad ikka ithilum polikum en urappayi,Positive


In [10]:
df['majority_label'] = df[df.columns[1:]].mode(axis=1)[0].astype(int)

ValueError: invalid literal for int() with base 10: 'Positive'

In [None]:
df

In [None]:
submission = df[['tweet_id', 'majority_label']]

In [None]:
submission = submission.rename(columns={'majority_label': 'label'})
submission

In [None]:
submission = submission.replace([0,1,2], ['Lit-News_mentions', 'non-personal_reports','Self_reports'])

In [None]:
submission

In [None]:
submission.to_csv('Final_submission.tsv', sep="\t", index = False)