# Resultados parciales y visualizaciones estáticas

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pre-cleaned train and test splits from disk.

train_dataset = pd.read_csv("data/clean_train.csv")
test_dataset = pd.read_csv("data/clean_test.csv")

# Preview the first three rows of each split, one frame after the other.
print(train_dataset.head(3), test_dataset.head(3), sep="\n")

       essay_id  discourse_id discourse_type  \
0  007ACE74B050  0013cc385424           Lead   
1  007ACE74B050  9704a709b505       Position   
2  007ACE74B050  c22adee811b6          Claim   

                                      discourse_text  \
0  Hi, i'm Isaac, i'm going to be writing about h...   
1  On my perspective, I think that the face is a ...   
2  I think that the face is a natural landform be...   

                                discourse_text_clean  \
0  hi i'm isaac i'm going to be writing about how...   
1  on my perspective i think that the face is a n...   
2  i think that the face is a natural landform be...   

                                          essay_text  \
0  Hi, i'm Isaac, i'm going to be writing about h...   
1  Hi, i'm Isaac, i'm going to be writing about h...   
2  Hi, i'm Isaac, i'm going to be writing about h...   

                                    essay_text_clean discourse_effectiveness  \
0  hi i'm isaac i'm going to be writing about how...

## SVM + TF-IDF

In [6]:
# Baseline: linear SVM on TF-IDF features of the cleaned discourse text.
X_train = train_dataset["discourse_text_clean"]
# NOTE(review): no visible cell creates a "label" column — confirm it is
# already present in data/clean_train.csv before relying on a fresh run.
y_train = train_dataset["label"]

# TF-IDF vectorization: unigrams + bigrams, terms seen in fewer than 5
# documents are dropped, English stop words removed, log-scaled term counts.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='utf-8',
    ngram_range=(1,2),
    stop_words='english'
)

X_train_tfidf = tfidf.fit_transform(X_train)

# The vectorizer was fitted on the cleaned text, so the test split must go
# through the same preprocessing. Use the cleaned column when the test file
# provides it and fall back to the raw text only if it does not.
test_text_column = (
    "discourse_text_clean"
    if "discourse_text_clean" in test_dataset.columns
    else "discourse_text"
)
# Cleaned text that ended up empty is read back from CSV as NaN; turn it into
# an empty string so the vectorizer does not fail on a float value.
X_test_tfidf = tfidf.transform(test_dataset[test_text_column].fillna("").astype(str))

svm = LinearSVC(random_state=42)
svm.fit(X_train_tfidf, y_train)

# Store the predictions next to the original test rows for inspection.
test_predictions = svm.predict(X_test_tfidf)
test_dataset["predicted_label"] = test_predictions

print(test_dataset[["discourse_text", "predicted_label"]].head())

                                      discourse_text  predicted_label
0  Making choices in life can be very difficult. ...                1
1  Seeking multiple opinions can help a person ma...                1
2                     it can decrease stress levels                 1
3             a great chance to learn something new                 2
4               can be very helpful and beneficial.                 1


## DistilBERT

In [3]:
# Inputs: cleaned discourse text, forced to str so the tokenizer never sees
# a non-string value.
X = train_dataset["discourse_text_clean"].astype(str)

# Targets: integer-encode the effectiveness classes; the fitted encoder is
# kept so the number of classes can be read from it later.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(train_dataset["discourse_effectiveness"])

# Stratified 80/20 split so both parts keep the class proportions.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# The model is trained on the cleaned text, so the test split should be
# tokenized from the same cleaned column when the test file provides it;
# fall back to the raw text only if that column is missing.
test_text_column = (
    "discourse_text_clean"
    if "discourse_text_clean" in test_dataset.columns
    else "discourse_text"
)

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=256)
val_encodings = tokenizer(list(X_val), truncation=True, padding=True, max_length=256)
test_encodings = tokenizer(list(test_dataset[test_text_column].astype(str)), truncation=True, padding=True, max_length=256)

class ArgumentDataset(Dataset):
    """Dataset over tokenizer encodings, with optional per-example labels.

    `encodings` is a mapping from field name to a per-example sequence; it
    must contain an "input_ids" entry, which defines the dataset length.
    `labels`, when given, is indexed with the same position as the encodings.
    """

    def __init__(self, encodings, labels=None):
        """Store the encodings and the (optional) labels without copying."""
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the "input_ids" field."""
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors.

        Every encoding field is converted to a tensor; a "labels" tensor is
        added only when the dataset was built with labels.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        if self.labels is None:
            return sample
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized splits; the test split has no labels.
train_ds = ArgumentDataset(train_encodings, labels=y_train)
val_ds = ArgumentDataset(val_encodings, labels=y_val)
test_ds = ArgumentDataset(test_encodings, labels=None)

# Batches of 8; only the training data is reshuffled.
train_loader = DataLoader(dataset=train_ds, batch_size=8, shuffle=True)
val_loader = DataLoader(dataset=val_ds, batch_size=8)

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Pretrained DistilBERT with a classification head sized to the number of
# classes seen by the label encoder, moved to the selected device.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label_encoder.classes_),
)
model = model.to(device)

optimizer = optim.Adam(params=model.parameters(), lr=2e-5)

# Fine-tune the model and report the validation accuracy after each epoch.
epochs = 1
for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        optimizer.zero_grad()
        # Move every tensor of the batch to the same device as the model.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    # Mean loss over the batches of this epoch.
    print(f"Epoch {epoch+1} - Training Loss: {total_loss/len(train_loader):.4f}")

    # ---- validation pass (no gradients, eval mode) ----
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            # Was `model(**batc)`: an undefined name that raised NameError on
            # the first validation batch.
            outputs = model(**batch)
            preds = torch.argmax(outputs.logits, dim=1)
            correct += (preds == batch["labels"]).sum().item()
            total += len(batch["labels"])
    # Report the computed value (the original printed only the caption).
    # An empty validation loader yields NaN instead of a ZeroDivisionError.
    val_accuracy = correct / total if total else float("nan")
    print(f"Validation Accuracy: {val_accuracy:.4f}")

# Persist only the weights; loading them back requires building the same
# model architecture first.
torch.save(model.state_dict(), "distilbert_trained.pt")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1:   7%|██▌                                 | 256/3677 [20:57<4:40:05,  4.91s/it]


KeyboardInterrupt: 