In [1]:
# Imports
from imblearn.pipeline import Pipeline  # imblearn's pipeline supports resampling steps
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
import os
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
torch.cuda.is_available()

True

In [2]:
directory = r"/home/camiloav/Code/HomeSecurity/Classifier/dataset/"
df = pd.DataFrame()
for name in os.listdir(directory):
    with open(os.path.join(directory, name)) as f:
        print(f"Content of '{name}'")
        csvdf = pd.read_csv(f)
        df = pd.concat([df, csvdf])
print("Original df shape:", df.shape)


Content of 'output_03.csv'
Content of 'output_02.csv'
Content of 'output_04.csv'
Content of 'output_01.csv'
Content of 'output_05.csv'
Content of 'output_06.csv'
Original df shape: (509073, 15)


In [3]:
# Create a new DataFrame with only the necessary columns and rename them
values = df[['Rating', 'Content']].copy()
values.rename(columns={'Rating': 'label', 'Content': 'text'}, inplace=True)
# Map ratings: 1 and 2 become 0; 3, 4, 5 (and NaN) become 1
values['label'] = values['label'].map({1: 0, 2: 0, 3: 1, 4: 1, 5: 1, np.nan: 1})
values.dropna(subset=['text'], inplace=True)

# Convert columns to lists for tokenization
texts = values['text'].tolist()
labels = values['label'].tolist()

# Split into training and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

In [4]:
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize the text (the tokenizer now gets a list of strings)
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)

# Create a PyTorch dataset
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels)

train_dataset = TextDataset(train_encodings, train_labels)
test_dataset = TextDataset(test_encodings, test_labels)


In [None]:
# Load the pre-trained model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    evaluation_strategy="epoch",  # Evaluation is done at the end of each epoch
    save_strategy="epoch",        # Save the model at the end of each epoch
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

# Define a simple accuracy metric
import numpy as np
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = (preds == labels).astype(np.float32).mean().item()
    return {'accuracy': accuracy}

# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Train the model (this will use GPU if available)
trainer.train()

# Evaluate on test data
results = trainer.evaluate()
print("Test Results:", results)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.238,0.273192,0.903177
