In [1]:
import os
import random
import numpy as np
import torch
import transformers

def set_all_seeds(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed = 260615
set_all_seeds(seed)

print("The global seed " + str(seed))

The global seed 260615


## Hyperparameters

In [2]:
# LANGUAGE AND DATASET

_LANGUAGE_         = 'en'
_DATASET_          = '2020'

In [3]:
# MODEL CLASSIFICATION

_PRETRAINED_LM_    = 'vinai/bertweet-base'
_PREPROCESS_TEXT_  = True
_TWEET_BATCH_SIZE_ = 5
_ADAPTER_CONFIG_   = transformers.ParallelConfig(reduction_factor = 256)
_MAX_SEQ_LEN_      = 150

In [4]:
# TRAIN

_OUTPUT_DIR_       = 'checkPointsNLIES'
_LOGGING_STEPS_    = 50
_NUM_AUTHORS_      = 256
_K_FOLD_CV_        = 5
_NO_GPUS_          = 1
_BATCH_SIZE_       = int(8 / _NO_GPUS_)
_EPOCHS_           = 10
_LEARNING_RATE_    = 1e-8

# PREDICTIONS

_DATASET_          = 'PAN17_NLI'
_PRED_DIR_         = 'NLI_5tweet'

## Other parameters

In [5]:
# LABEL DICTONARIES -----------------------------------------------------------------------

# 2017

gender_dict    = {'female': 0, 'male':   1}
varietyEN_dict = {'australia': 0, 'canada': 1, 'great britain': 2, 'ireland': 3, 'new zealand': 4, 'united states': 5}
varietyES_dict = {'argentina': 0, 'chile': 1, 'colombia': 2, 'mexico': 3, 'peru': 4, 'spain': 5, 'venezuela': 6}  

genderEN_hip  = {0: 'I’m a female', 1: 'I’m a male'}
genderES_hip  = {0: 'Mi nombre es María', 1: 'Mi nombre es José'}

# 2020 



In [6]:
# SET LANGUAGE DICTIONARIES

if _LANGUAGE_ == 'en':
    gender_hip   = genderEN_hip
    variety_dict = varietyEN_dict

elif _LANGUAGE_ == 'es':
    gender_hip   = genderES_hip
    variety_dict = varietyES_dict

In [7]:
# SET LANGUAGE TOKENIZER

from transformers import AutoTokenizer, PretrainedConfig

tokenizer = AutoTokenizer.from_pretrained(_PRETRAINED_LM_)
vocab = tokenizer.get_vocab()

config             = PretrainedConfig.from_pretrained(_PRETRAINED_LM_)
nli_label2id       = config.label2id
is_encoder_decoder = config.is_encoder_decoder

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
You are using a model of type roberta to instantiate a model of type . This is not supported for all configurations of models and can yield errors.


## Datasets

In [None]:
import os
import xml.etree.ElementTree as ET
from random import shuffle, sample
from pysentimiento.preprocessing import preprocess_tweet
from tools.TweetNormalizer import normalizeTweet
import torch
from torch.utils.data import Dataset


class BasePAN():
    
    def __init__(self, Dir, split, language, label_idx, label_dict):
        self.Dir          = Dir
        self.split        = split
        self.language     = language
        self.tokenizer    = tokenizer
        self.label_dict   = label_dict
        self.tw_bsz       = tweet_batch_size
        
        
        self.authors   = self.get_authors(Dir, split, language)
        self.author_lb = self.get_author_labels(Dir, split, language)
        
        self.author_ids = {}
        for i in range(len(self.authors)):
            self.author_ids[ self.authors[i] ] = i
        
        
        # Save authors splited by gender ----------------------------------
        
        # create empty dictionary of authors per label
        
        self.splited_authors = {}
        for i in gender_dict.values():
            self.splited_authors[ i ] = []
        
        # fill dictionary 
        
        for author in self.authors:
            gl = self.author_lb[author]['gender']
            self.splited_authors[ gl ].append(author)
            
        # shuffle authors
        
        for i in gender_dict.values():
            shuffle(self.splited_authors[i])
        
        #----------------------------------------------------------------

    
    def get_authors(self, Dir, split, language):
        path    = os.path.join(Dir, split, language)
        files   = os.listdir(path)
        authors = [ file[0:-4] for file in files ] 
        
        return authors
    
    
    def get_author_labels(self, Dir, split, language):
        lb_file_name = os.path.join(Dir, split, language + '.txt')
        lb_file      = open(lb_file_name, "r")
        author_lb    = dict()

        for line in lb_file:
            author, gender, variety = line.split(':::')
            variety = variety[:-1]                       

            gl = self.gender_dict[gender]
            vl = self.variety_dict[variety]

            author_lb[author] = {'gender': gl, 'variety': vl}

        lb_file.close()
        
        return author_lb
    
    
    def get_tweets_in_batches(self, Dir, split, language):
        data   = []

        for author in self.authors:
            tw_file_name = os.path.join(Dir, split, language, author + '.xml')
            tree         = ET.parse(tw_file_name)
            root         = tree.getroot()
            documents    = root[0]
            total_tweets = len(documents)

            for i in range(0, total_tweets, self.tw_bsz):
                doc_batch = documents[i : i + self.tw_bsz]
                tweets    = ''

                for document in doc_batch:
                    tweets += document.text + '\n'

                data.append( {'author': author, 'text': tweets, **self.author_lb[author]} )
        
        return data
    
    
    def get_tweets_in_batches_NLI(self, Dir, split, language):
        data   = []

        for author in self.authors:
            tw_file_name = os.path.join(Dir, split, language, author + '.xml')
            tree         = ET.parse(tw_file_name)
            root         = tree.getroot()
            documents    = root[0]
            total_tweets = len(documents)

            for i in range(0, total_tweets, self.tw_bsz):
                doc_batch = documents[i : i + self.tw_bsz]
                tweets    = ''

                for document in doc_batch:
                    tweets += document.text + '\n'

                data.append( {'author': author, 'text': tweets, **self.author_lb[author]} )
        
        return data
    
    
    def get_all_data():
        
        print("\nReading data...")
        
        self.data = self.get_tweets_in_batches(Dir, split, language)
        
        shuffle(self.data)
        
        if preprocess_text:
            print("    Done\nPreprocessing text...")
            
            if self.language == 'es':
                preprocessed   = [preprocess_tweet(instance['text']) for instance in self.data]
            elif self.language == 'en':
                preprocessed   = [normalizeTweet(instance['text'])   for instance in self.data]
            
        else:
            preprocessed   = [instance['text'] for instance in self.data]
        
        print("    Done\nTokenizing...")
        
        self.encodings = self.tokenizer(preprocessed, max_length = max_seq_len, 
                                                      truncation = True, 
                                                      padding    = True,
                                                      return_tensors = 'pt')
         
        print("    Done\nMerging data...")
        
        for i in range(len(self.data)):
            self.data[i].update( {key: self.encodings[key][i] for key in self.encodings.keys()} )
        
        print("    Done\n\nTotal Instances: " + str(len(self.data)) + '\n')
    
    
    def cross_val(self, k, val_idx, num_authors):
        
        if k > 1:
            sz     = int(len(self.authors) / len(self.gender_dict))
            val_sz = int(sz / k)
        if k == 1:
            sz     = int(len(self.authors) / len(self.gender_dict))
            val_sz = 0
        
        splited_train = {}
        splited_val   = {}
        
        for i in self.gender_dict.values():
            splited_train[i] = self.splited_authors[i][0:( val_sz*val_idx )] + self.splited_authors[i][( val_sz*(val_idx+1) ):sz]
            splited_val[i]   = self.splited_authors[i][( val_sz*val_idx ):( val_sz*(val_idx+1) )]
        
        authors_train = []
        authors_val   = []
        
        for i in self.gender_dict.values():
            authors_train += sample(splited_train[i], num_authors)
            authors_val   += splited_val[i]
        
        
        return authors_train, authors_val    

In [8]:
baseTrain  = BasePAN17(Dir             = 'data/' + _DATASET_,
                      split            = 'train',
                      language         = _LANGUAGE_,
                      tokenizer        = tokenizer,
                      gender_dict      = gender_dict,
                      variety_dict     = variety_dict,
                      tweet_batch_size = _TWEET_BATCH_SIZE_,
                      max_seq_len      = _MAX_SEQ_LEN_,
                      preprocess_text  = _PREPROCESS_TEXT_)


Reading data...
    Done
Preprocessing text...
    Done
Tokenizing...
    Done
Merging data...
    Done

Total Instances: 419998


Reading data...
    Done
Preprocessing text...
    Done
Tokenizing...
    Done
Merging data...
    Done

Total Instances: 280000

