In [1]:
!pwd

/home/azaelcarrillo/Documents/AutoML_in_data_augmentation/taks/PolitiES/model


In [1]:
seed = 0
targets = ['gender', 'profession','ideology_binary', 'ideology_multiclass']

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import copy

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


from sentence_transformers import SentenceTransformer
from datasets import Dataset
from datasets import load_dataset

from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer
from pysentimiento.preprocessing import preprocess_tweet


from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import f1_score

def f1s(preds, dtrain):
    f1 = f1_score(dtrain.get_label(), np.round(preds), average='macro')
    return 'f1', f1

def get_train_test(data_train, reduce=False):
    users_train, users_test = train_test_split(data_train.index.unique(), test_size=.2, random_state=19970808)
    
    if reduce:
        data_train['idd'] = np.array(range(data_train.shape[0])) % reduce
        df_tweets = data_train.reset_index().groupby(by=['label', 'idd']).tweet.apply(lambda x: ' '.join(x))
        df = data_train.reset_index().groupby(by=['label', 'idd']).first()
        
        
        data_test['idd'] = np.array(range(data_test.shape[0])) % reduce
        df_test = data_test.reset_index().groupby(by=['label', 'idd']).tweet.apply(lambda x: ' '.join(x))
        
    else:
        df = data_train
        df_tweets = data_train.tweet
    
    df_targets = df[targets].apply(LabelEncoder().fit_transform)
    
    X_train = df_tweets.loc[users_train]
    Y_train = df_targets.loc[users_train]
    Y_train = Y_train.droplevel(1)

    X_test = df_tweets.loc[users_test]
    Y_test = df_targets.loc[users_test]
    Y_test = Y_test.droplevel(1)
    
    return X_train, X_test, df_test, Y_train, Y_test

# Bert utils
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)

def fine_tune_beto(X_train, X_test, y_train, y_test, iters):
    dataset_train = Dataset.from_pandas(pd.DataFrame({'text': X_train.values, 'labels': y_train.values})).shuffle()
    dataset_test = Dataset.from_pandas(pd.DataFrame({'text': X_test.values, 'labels': y_test.values})).shuffle()

    dataset_train_tok = dataset_train.map(preprocess_function, batched=True)
    dataset_test_tok = dataset_test.map(preprocess_function, batched=True)

    training_args = TrainingArguments(
        output_dir="./results",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=iters,
        weight_decay=0.01, 
    )

    trainer = Trainer(
        model=bert_model,
        args=training_args,
        train_dataset=dataset_train_tok,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    old_collator = trainer.data_collator
    trainer.data_collator = lambda data: dict(old_collator(data))

    trainer.train()
    
def get_embeddings(X):
    X_emb = []
    for txt in X:
        encoding = tokenizer.encode_plus(txt, 
                                         add_special_tokens=True, 
                                         truncation=True, 
                                         padding = "max_length", 
                                         return_attention_mask=True, 
                                         return_tensors="pt")

        encoding = encoding.to(device)

        output = bert_model.base_model(**encoding)
        embeddings_tensor = output.pooler_output
        embeddings = embeddings_tensor.tolist()[0]
        X_emb.append(embeddings)
    X_emb = np.array(X_emb)
    
    return X_emb

def get_predict(X):
    X_emb = []
    for txt in X:
        encoding = tokenizer.encode_plus(txt, 
                                         add_special_tokens=True, 
                                         truncation=True, 
                                         padding = "max_length", 
                                         return_attention_mask=True, 
                                         return_tensors="pt")

        encoding = encoding.to(device)

        output = bert_model(**encoding)
        pre = torch.argmax(output.logits).tolist()
        
        X_emb.append(pre)
    X_emb = np.array(X_emb)
    
    return X_emb


def create_submission(users, gender, prof, ideobi, ideomul):
    header = ['user','gender','profession','ideology_binary','ideology_multiclass']
    

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
data_pre = pd.read_csv("../data/raw/tweets_complete.csv")
data_pre.set_index('label', inplace=True)

data_train = pd.read_csv("../data/raw/training.csv")
data_dev = pd.read_csv('../data/raw/development.csv')
data_dev_test = pd.read_csv('../data/raw/development_test.csv')
data_test = pd.read_csv("../data/raw/test_without_labels.csv")

data_test.drop(columns='Unnamed: 0', inplace=True)
data_test.set_index('label', inplace=True)
data_train.set_index('label', inplace=True)

data_train['tweet'] = data_pre['tweet_emoji_signos_pre']

In [5]:
X_train, X_test, X_sub, Y_train, Y_test = get_train_test(data_train, reduce=12)

In [6]:
from transformers import RobertaForSequenceClassification, AutoTokenizer, BertForSequenceClassification
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased')
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [7]:
bert_model = BertForSequenceClassification.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased')

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuc

In [None]:
tuned_embeddings = {}
for target in targets[0:1]:
    bert_model = BertForSequenceClassification.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased')
    
    y_train = Y_train[target]
    y_test = Y_test[target]
    fine_tune_beto(X_train, X_test, y_train, y_test, 3)

    X_train_emb = get_embeddings(X_train)
    X_test_emb = get_embeddings(X_test)
    X_sub_emb = get_embeddings(X_sub)

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train_emb, y_train)
    y_pred = model.predict_proba(X_test_emb)[:,1]
    y_pred = pd.Series(y_pred, y_test.index)

    result = f1_score(y_test.groupby(level=0).mean(), y_pred.groupby(level=0).mean().round(), average='macro')
    print('f1 {}: {}'.format(target, result))


    y_pred_bert = get_predict(X_test)
    y_pred_bert = pd.Series(y_pred_bert, y_test.index)
    
    y_sub_bert = get_predict(X_sub)
    y_sub_bert = pd.Series(y_sub_bert, X_sub.index.get_level_values(0))
    
    
    result = f1_score(y_test.groupby(level=0).mean(), y_pred_bert.groupby(level=0).mean().round(), average='macro')
    print('f1 {}: {}'.format(target, result))
    
    
    tuned_embeddings[target] = {}
    tuned_embeddings[target]['embeddings'] = {'train': X_train_emb,
                                              'test': X_test_emb,
                                              'sub': X_sub_emb}
    tuned_embeddings[target]['bert_pred'] = {'test': y_pred_bert, 
                                             'sub': y_sub_bert}

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuc

Step,Training Loss
500,0.5063
1000,0.2665


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1000/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




f1 gender: 0.7945652173913043


In [12]:
X_train_emb = tuned_embeddings['gender']['embeddings']['train']
X_test_emb = tuned_embeddings['gender']['embeddings']['test']
X_sub_emb = tuned_embeddings['gender']['embeddings']['sub']

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

In [14]:
from nltk.corpus import stopwords
stopwords = stopwords.words('spanish')

In [15]:
tf = TfidfVectorizer(analyzer = 'word',
                     min_df = 1,
                     max_features = 5000,
                     lowercase=True,
                     stop_words=stopwords)
X_train_tf = tf.fit_transform(X_train).toarray()
X_test_tf = tf.transform(X_test).toarray()
X_sub_tf = tf.transform(X_sub).toarray()

In [17]:
X_train_full = np.concatenate((X_train_emb, X_train_tf), axis=1)
X_test_full = np.concatenate((X_test_emb, X_test_tf), axis=1)
X_sub_full = np.concatenate((X_sub_emb, X_sub_tf), axis=1)

In [28]:
y_train = Y_train['gender']
y_test = Y_test['gender']

param_grid_reg = [
    {"C": [.0001, .001, .001, 0.1, 1, 2, 5, 10],
     "penalty": ["l1", "l2"]
     }
    ]

model = LogisticRegression(max_iter=2000, solver='liblinear', C=.001, penalty='l1')
model.fit(X_train_full, y_train)

gender_pred = model.predict_proba(X_test_full)[:,1]
gender_pred = pd.Series(gender_pred, X_test.index)

f1_score(y_test.groupby(level=0).mean(), gender_pred.groupby(level=0).mean().round(), average='macro')

0.7904656319290465