# Balanceo de datos

# Modelo Bert

In [47]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

# Cargar los datos
data = pd.read_csv("./train.csv")
min_count = data['discourse_effectiveness'].value_counts().min()

# Filtrar las categorías 'adequate' y 'effective' para igualar el tamaño de 'ineffective'.
adequate_data = data[data['discourse_effectiveness'] == 'Adequate']
effective_data = data[data['discourse_effectiveness'] == 'Effective']

# Si hay más elementos que la cantidad mínima, eliminar los extras
if len(adequate_data) > min_count:
    adequate_data = adequate_data.sample(min_count, replace=False)

if len(effective_data) > min_count:
    effective_data = effective_data.sample(min_count, replace=False)

# Concat DataFrames filtrados
data = pd.concat([adequate_data, effective_data, data[data['discourse_effectiveness'] == 'Ineffective']])

data.reset_index(drop=True, inplace=True)

data.drop('discourse_id', axis=1, inplace=True)
data.drop('essay_id', axis=1, inplace=True)
data['index'] = data.index

claim_sizes = [len(text) for text in data['discourse_text']]
data['claim_size'] = claim_sizes
texto_original = data['discourse_text']
# data.drop('discourse_text', axis=1, inplace=True)

# Encodign
type_map = {cat:index for index, cat in enumerate(data['discourse_type'].unique())}
data['discourse_type'] = [type_map[cat] for cat in data['discourse_type']]

type_map = {cat:index for index, cat in enumerate(data['discourse_effectiveness'].unique())}
data['discourse_effectiveness'] = [type_map[cat] for cat in data['discourse_effectiveness']]

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 128

encoded_data = tokenizer(data['discourse_text'].tolist(), truncation=True, padding='max_length', max_length=max_length, return_tensors='pt')

# Eliminar 'discourse_text'
data.drop('discourse_text', axis=1, inplace=True)

# Dividir los datos en entrenamiento y prueba
X = encoded_data['input_ids']
y = torch.tensor(data['discourse_effectiveness'].values)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Crear el modelo BERT pre-entrenado para clasificación
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)  # 3 etiquetas de clasificación

# Crear dataloaders para el entrenamiento
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 6

train_dataset = TensorDataset(X_train, y_train)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

test_dataset = TensorDataset(X_test, y_test)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size)

# Parámetros de entrenamiento
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Entrenar el modelo
from transformers import get_scheduler

epochs = 3
total_steps = len(train_dataloader) * epochs

scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

i = 0
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0]
        b_labels = batch[1]

        model.zero_grad()
        outputs = model(b_input_ids, labels=b_labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch+1}/{epochs}, Average Training Loss: {avg_train_loss:.4f}')

# Evaluar el modelo
model.eval()
total_eval_accuracy = 0
total_eval_loss = 0
nb_eval_steps = 0

for batch in test_dataloader:
    b_input_ids = batch[0]
    b_labels = batch[1]

    with torch.no_grad():
        outputs = model(b_input_ids, labels=b_labels)
        loss = outputs.loss
        logits = outputs.logits

    total_eval_loss += loss.item()
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    total_eval_accuracy += (logits.argmax(axis=1) == label_ids).sum()

avg_val_accuracy = total_eval_accuracy / len(X_test)
print(f'Accuracy on test data: {avg_val_accuracy:.2%}')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Average Training Loss: 0.8708
Epoch 2/3, Average Training Loss: 0.6770
Epoch 3/3, Average Training Loss: 0.4574
Accuracy on test data: 61.06%


In [78]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch
import torch
import pickle
from transformers import BertForSequenceClassification, BertTokenizer

with open('bert_model.pkl', 'rb') as f:
    data = pickle.load(f)
    model_state_dict = data['model_state_dict']
    tokenizer = data['tokenizer']

model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
model.load_state_dict(model_state_dict)

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Cargar el modelo y el tokenizador
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Oración de prueba
text = "On my perspective, I think that the face is a natural landform because I dont think that there is any life on Mars. In these next few paragraphs, I'll be talking about how I think that is is a natural landform "

# Tokeniza la oración  y conviértela en un tensor de entrada
tokens = tokenizer(text, return_tensors='pt', truncation=True, padding='max_length', max_length=128)
input_ids = tokens['input_ids']
attention_mask = tokens['attention_mask']

# Realiza la predicción
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits


# Encuentra la clase con la puntuación más alta
indice_maximo = torch.argmax(logits, dim=1).item()

# Definir un diccionario de mapeo
mapeo_etiquetas = {
    0: "Adequate",
    1: "Effective",
    2: "Ineffective"
}

# Obtener la etiqueta predicha del diccionario de mapeo
etiqueta_predicha = mapeo_etiquetas[indice_maximo]

# Imprimir la etiqueta predicha
print("Etiqueta:", etiqueta_predicha)
