In [1]:
seed = 0
targets = ['gender', 'profession','ideology_binary', 'ideology_multiclass']

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import copy

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


from sentence_transformers import SentenceTransformer
from datasets import Dataset
from datasets import load_dataset

from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer
from pysentimiento.preprocessing import preprocess_tweet


from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import f1_score

def f1s(preds, dtrain):
    f1 = f1_score(dtrain.get_label(), np.round(preds), average='macro')
    return 'f1', f1

def get_train_test(data_train, reduce=False):
    users_train, users_test = train_test_split(data_train.index.unique(), test_size=.2, random_state=19970808)
    
    if reduce:
        data_train['idd'] = np.array(range(data_train.shape[0])) % reduce
        df_tweets = data_train.reset_index().groupby(by=['label', 'idd']).tweet.apply(lambda x: ' '.join(x))
        df = data_train.reset_index().groupby(by=['label', 'idd']).first()
        
        
        data_test['idd'] = np.array(range(data_test.shape[0])) % reduce
        df_test = data_test.reset_index().groupby(by=['label', 'idd']).tweet.apply(lambda x: ' '.join(x))
        
    else:
        df = data_train
        df_tweets = data_train.tweet
    
    df_targets = df[targets].apply(LabelEncoder().fit_transform)
    
    X_train = df_tweets.loc[users_train]
    Y_train = df_targets.loc[users_train]
    Y_train = Y_train.droplevel(1)

    X_test = df_tweets.loc[users_test]
    Y_test = df_targets.loc[users_test]
    Y_test = Y_test.droplevel(1)
    
    return X_train, X_test, df_test, Y_train, Y_test

# Bert utils
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)

def fine_tune_beto(X_train, X_test, y_train, y_test, iters):
    dataset_train = Dataset.from_pandas(pd.DataFrame({'text': X_train.values, 'labels': y_train.values})).shuffle()
    dataset_test = Dataset.from_pandas(pd.DataFrame({'text': X_test.values, 'labels': y_test.values})).shuffle()

    dataset_train_tok = dataset_train.map(preprocess_function, batched=True)
    dataset_test_tok = dataset_test.map(preprocess_function, batched=True)

    training_args = TrainingArguments(
        output_dir="./results",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=iters,
        weight_decay=0.01, 
        evaluation_strategy='steps',
        eval_steps=50,
    )

    trainer = Trainer(
        model=bert_model,
        args=training_args,
        train_dataset=dataset_train_tok,
        eval_dataset=dataset_test_tok,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    old_collator = trainer.data_collator
    trainer.data_collator = lambda data: dict(old_collator(data))

    trainer.train()
    
def get_embeddings(X):
    X_emb = []
    for txt in X:
        encoding = tokenizer.encode_plus(txt, 
                                         add_special_tokens=True, 
                                         truncation=True, 
                                         padding = "max_length", 
                                         return_attention_mask=True, 
                                         return_tensors="pt")

        encoding = encoding.to(device)

        output = bert_model.base_model(**encoding)
        embeddings_tensor = output.pooler_output
        embeddings = embeddings_tensor.tolist()[0]
        X_emb.append(embeddings)
    X_emb = np.array(X_emb)
    
    return X_emb

def get_predict(X):
    X_emb = []
    for txt in X:
        encoding = tokenizer.encode_plus(txt, 
                                         add_special_tokens=True, 
                                         truncation=True, 
                                         padding = "max_length", 
                                         return_attention_mask=True, 
                                         return_tensors="pt")

        encoding = encoding.to(device)

        output = bert_model(**encoding)
        pre = torch.argmax(output.logits).tolist()
        
        X_emb.append(pre)
    X_emb = np.array(X_emb)
    
    return X_emb


def create_submission(users, gender, prof, ideobi, ideomul):
    header = ['user','gender','profession','ideology_binary','ideology_multiclass']
    

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [6]:
data_pre = pd.read_csv("../data/raw/tweets_complete.csv")
data_pre.set_index('label', inplace=True)

data_train = pd.read_csv("../data/raw/training.csv")
data_dev = pd.read_csv('../data/raw/development.csv')
data_dev_test = pd.read_csv('../data/raw/development_test.csv')
data_test = pd.read_csv("../data/raw/test_without_labels.csv")

data_test.drop(columns='Unnamed: 0', inplace=True)
data_test.set_index('label', inplace=True)
data_train.set_index('label', inplace=True)

data_train['tweet'] = data_pre['tweet_emoji_signos_pre']

In [12]:
X_train, X_test, X_sub, Y_train, Y_test = get_train_test(data_train, reduce=8)

In [8]:
from transformers import RobertaForSequenceClassification, AutoTokenizer, BertForSequenceClassification
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased')
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [9]:
bert_model = BertForSequenceClassification.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased')

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuc

In [13]:
tuned_embeddings = {}
for target in targets[1:2]:
    bert_model = BertForSequenceClassification.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased')
    
    y_train = Y_train[target]
    y_test = Y_test[target]
    fine_tune_beto(X_train, X_test, y_train, y_test, 4)

    X_train_emb = get_embeddings(X_train)
    X_test_emb = get_embeddings(X_test)
    X_sub_emb = get_embeddings(X_sub)

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train_emb, y_train)
    y_pred = model.predict_proba(X_test_emb)[:,1]
    y_pred = pd.Series(y_pred, y_test.index)

    result = f1_score(y_test.groupby(level=0).mean(), y_pred.groupby(level=0).mean().round(), average='macro')
    print('f1 {}: {}'.format(target, result))


    y_pred_bert = get_predict(X_test)
    y_pred_bert = pd.Series(y_pred_bert, y_test.index)
    
    y_sub_bert = get_predict(X_sub)
    y_sub_bert = pd.Series(y_sub_bert, X_sub.index.get_level_values(0))
    
    
    result = f1_score(y_test.groupby(level=0).mean(), y_pred_bert.groupby(level=0).mean().round(), average='macro')
    print('f1 {}: {}'.format(target, result))
    
    
    tuned_embeddings[target] = {}
    tuned_embeddings[target]['embeddings'] = {'train': X_train_emb,
                                              'test': X_test_emb,
                                              'sub': X_sub_emb}
    tuned_embeddings[target]['bert_pred'] = {'test': y_pred_bert, 
                                             'sub': y_sub_bert}

loading configuration file https://huggingface.co/dccuchile/bert-base-spanish-wwm-uncased/resolve/main/config.json from cache at /home/azaelcarrillo/.cache/huggingface/transformers/2416dab24674c27b5521594d6aa0929fc843a024c96711b1b5015cdff867291f.afa3630b664b4bd3e82d41660bdb96ec13236bbceadb0ae7c45c7c19f58652c7
Model config BertConfig {
  "_name_or_path": "dccuchile/bert-base-spanish-wwm-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.11.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab

Step,Training Loss,Validation Loss
50,No log,0.511105
100,No log,0.415316
150,No log,0.27331
200,No log,0.249634
250,No log,0.490099
300,No log,0.586934
350,No log,0.396453
400,No log,0.28238
450,No log,0.405658
500,0.233800,0.425961


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 504
  Batch size = 8
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 504
  Batch size = 8
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 504
  Batch size = 8
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 504
  Batch size = 8
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassificati

f1 profession: 0.8752475247524752
f1 profession: 0.8546153846153846


In [92]:
y_train = Y_train['ideology_multiclass']
y_test = Y_test['ideology_multiclass']

model = LogisticRegression(max_iter=1000)
model.fit(X_train_emb, y_train)
y_pred = model.predict_proba(X_sub_emb)
y_pred = np.argmax(y_pred, axis=1)
y_pred = pd.Series(y_pred, X_sub.index)

result = f1_score(y_test.groupby(level=0).mean(), y_pred.groupby(level=0).mean().round(), average='macro')
result

ValueError: Found input variables with inconsistent numbers of samples: [63, 105]

In [76]:
labels ={}
labels['gender'] = {0:'female', 1:'male'}
labels['profession'] = {0:'journalist', 1:'politician'}
labels['ideology_binary'] = {0:'left', 1:'right'}
labels['ideology_multiclass'] = {0:'left', 1:'moderate_left', 2:'moderate_right', 3:'right'}

In [81]:
header = ['user','gender','profession','ideology_binary','ideology_multiclass']

In [94]:
pred = tuned_embeddings['gender']['bert_pred']['sub']
gender_pred = pred.groupby(level=0).mean().round().astype(int).replace(labels['gender'])

pred = tuned_embeddings['profession']['bert_pred']['sub']
profession_pred = pred.groupby(level=0).mean().round().astype(int).replace(labels['profession'])

pred = tuned_embeddings['ideology_binary']['bert_pred']['sub']
bi_pred = pred.groupby(level=0).mean().round().astype(int).replace(labels['ideology_binary'])


mu_pred = y_pred.groupby(level=0).mean().round().astype(int).replace(labels['ideology_multiclass'])

In [99]:
subm = pd.DataFrame({'user':gender_pred.index, 'gender':gender_pred, 'profession':profession_pred, 'ideology_binary':bi_pred, 'ideology_multiclass':mu_pred})
subm.to_csv('results.csv', sep=',')

In [23]:
X_train_emb = tuned_embeddings['gender']['embeddings']['train']
X_test_emb = tuned_embeddings['gender']['embeddings']['test']
X_sub_emb = tuned_embeddings['gender']['embeddings']['sub']

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

In [26]:
from nltk.corpus import stopwords
stopwords = stopwords.words('spanish')

In [27]:
tf = TfidfVectorizer(analyzer = 'word',
                     min_df = 1,
                     max_features = 5000,
                     lowercase=True,
                     stop_words=stopwords)
X_train_tf = tf.fit_transform(X_train).toarray()
X_test_tf = tf.transform(X_test).toarray()
X_sub_tf = tf.transform(X_sub).toarray()

In [79]:
X_sub_tf = tf.transform(X_sub).toarray()

In [80]:
X_train_full = np.concatenate((X_train_emb, X_train_tf), axis=1)
X_test_full = np.concatenate((X_test_emb, X_test_tf), axis=1)
X_sub_full = np.concatenate((X_sub_emb, X_sub_tf), axis=1)

In [81]:
y_train = Y_train['gender']
y_test = Y_test['gender']

param_grid_reg = [
    {"C": [.0001, .001, .001, 0.1, 1, 2, 5, 10],
     "penalty": ["l1", "l2"]
     }
    ]

model = LogisticRegression(max_iter=2000, solver='liblinear', C=.001, penalty='l1')
model.fit(X_train_full, y_train)

gender_pred = model.predict_proba(X_sub_full)[:,1]
gender_pred = pd.Series(gender_pred, X_sub.index)

f1_score(y_test.groupby(level=0).mean(), gender_pred.groupby(level=0).mean().round(), average='macro')

ValueError: Found input variables with inconsistent numbers of samples: [63, 105]

In [74]:
ress = pd.read_csv('results.csv').drop(columns='label')

In [83]:
ress.gender = gender_pred.groupby(level=0).mean().round().astype(int).replace(labels['gender'])

In [84]:
ress.to_csv('results.csv', sep=',')