In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from google.colab import drive

# Mount Google Drive at /content/drive; the CSV paths read and written later
# in this cell all live under '/content/drive/My Drive/'.
drive.mount('/content/drive')

class SentimentDataset(Dataset):
    """Map-style dataset that pairs raw texts with integer class labels.

    Tokenization happens lazily, one sample at a time, inside ``__getitem__``.

    Args:
        texts: Indexable sequence of input texts.
        labels: Indexable sequence of labels, aligned index-for-index with
            ``texts``; each label is converted to a ``torch.long`` tensor.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors="pt")``.
            Its result must expose ``'input_ids'`` and ``'attention_mask'``
            tensors with a leading dimension that can be squeezed away.
        max_length: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of samples (the length of ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return it together with its label.

        Returns:
            dict with keys ``'input_ids'``, ``'attention_mask'`` (tokenizer
            outputs with dimension 0 squeezed) and ``'label'`` (0-d
            ``torch.long`` tensor).
        """
        sample_text, sample_label = self.texts[idx], self.labels[idx]
        tokenized = self.tokenizer(
            sample_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )
        # The tokenizer returns batched tensors; drop the leading dimension
        # so the DataLoader can stack samples into a batch itself.
        item = {
            field: tokenized[field].squeeze(0)
            for field in ('input_ids', 'attention_mask')
        }
        item['label'] = torch.tensor(sample_label, dtype=torch.long)
        return item

# --- Load, clean and split the labelled data ------------------------------
df = pd.read_csv('/content/drive/My Drive/train3.csv')

# Rows missing either the text or the label are unusable for training.
# .copy() gives an independent frame so the column assignments below act on
# this frame only (and cannot raise SettingWithCopyWarning).
df = df.dropna(subset=['Text', 'category']).copy()

# Labels that cannot be parsed as numbers become NaN and are dropped; the
# remaining ones are converted to integers.
df['category'] = pd.to_numeric(df['category'], errors='coerce')
df = df.dropna(subset=['category']).copy()
df['category'] = df['category'].astype(int)

# Shift labels up by one so they can be used as class indices
# (the evaluation code shifts predictions back down by one).
df['category'] = df['category'] + 1

# Fail fast on labels outside the three classes the model is created with
# (num_labels=3). Out-of-range targets would otherwise only surface later,
# inside the loss computation, as a much less readable error.
valid_labels = {0, 1, 2}
unexpected_labels = sorted(set(df['category'].unique()) - valid_labels)
if unexpected_labels:
    raise ValueError(
        "Unexpected 'category' values in train3.csv (expected -1, 0 or 1): "
        f"{[int(value) - 1 for value in unexpected_labels]}"
    )

# Cast to str before stripping so a non-string cell (e.g. a bare number)
# stays a usable text instead of turning into NaN under the .str accessor.
df['Text'] = df['Text'].astype(str).str.strip()

# NOTE(review): only 5% of the rows are kept for the experiment (df_train);
# df_test (the other 95%) is not used by any code visible in this notebook --
# presumably a deliberate subsample to keep training fast; confirm.
df_train, df_test = train_test_split(df, test_size=0.95, random_state=42)

# 80/20 split of the subsample: X_train/y_train are used for fitting,
# X_test/y_test for the accuracy reported after training.
X_train, X_test, y_train, y_test = train_test_split(
    df_train['Text'], df_train['category'], test_size=0.2, random_state=42
)

# --- Model, tokenizer, data loaders and optimizer --------------------------
# Seed PyTorch before any randomness is consumed: the classification head is
# newly (randomly) initialised when the checkpoint is loaded, the training
# loader shuffles, and dropout is active during training. Without a seed the
# reported accuracy changes from run to run even though the data split is
# fixed with random_state=42. (GPU kernels may still add minor nondeterminism.)
torch.manual_seed(42)

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Three output classes, matching the shifted labels 0, 1, 2.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Length passed to the tokenizer for padding/truncating every sample.
max_length = 128
train_dataset = SentimentDataset(X_train.tolist(), y_train.tolist(), tokenizer, max_length)
test_dataset = SentimentDataset(X_test.tolist(), y_test.tolist(), tokenizer, max_length)

# Shuffle only the training data; evaluation keeps the original row order so
# predictions line up with X_test when results are saved.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()

# --- Fine-tuning with early stopping on a held-out validation split --------
import copy

from torch.utils.data import random_split

# Early stopping has to watch data the optimizer never trains on: training
# loss keeps falling while the model overfits, so monitoring it (as this loop
# previously did under the name `best_val_loss`) practically never stops.
# Hold out 10% of the training samples for validation.
if len(train_dataset) < 2:
    raise ValueError("Need at least 2 training samples to hold out a validation split.")

batch_size = train_loader.batch_size  # reuse the batch size of the existing loader
val_size = max(1, len(train_dataset) // 10)
train_subset, val_subset = random_split(
    train_dataset,
    [len(train_dataset) - val_size, val_size],
    generator=torch.Generator().manual_seed(42),  # fixed split across runs
)
train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_subset, batch_size=batch_size, shuffle=False)

best_val_loss = float('inf')
best_state = None      # weights of the epoch with the lowest validation loss
patience = 2           # epochs without improvement tolerated before stopping
trigger_times = 0      # consecutive epochs without improvement so far

epochs = 4
for epoch in range(epochs):
    # ---- train for one epoch ----
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{epochs}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1} Loss: {total_loss / len(train_loader):.4f}")

    # ---- measure loss on the validation split (no gradient updates) ----
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            val_loss += criterion(outputs.logits, labels).item()
    val_loss /= len(val_loader)
    print(f"Epoch {epoch+1} Validation Loss: {val_loss:.4f}")

    # ---- early-stopping bookkeeping ----
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # Snapshot the weights so the best epoch can be restored afterwards.
        best_state = copy.deepcopy(model.state_dict())
        trigger_times = 0
    else:
        trigger_times += 1

    if trigger_times >= patience:
        print(f"Early stopping after epoch {epoch+1}")
        break

# Continue with the weights from the best validation epoch rather than the
# last one trained, which may already be overfitting.
if best_state is not None:
    model.load_state_dict(best_state)

# --- Evaluate on the held-out split and save per-row predictions -----------
model.eval()
y_true = []
y_pred = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=-1)
        # Labels are only compared on the host, so they are read straight
        # from the batch instead of being copied to the device and back.
        y_true.extend(batch['label'].tolist())
        y_pred.extend(predictions.cpu().tolist())

# Undo the +1 shift applied to the labels before training so that the
# reported values use the original category encoding.
y_true = [label - 1 for label in y_true]
y_pred = [label - 1 for label in y_pred]

accuracy = accuracy_score(y_true, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

# test_loader is not shuffled, so predictions are in the same order as X_test.
results = pd.DataFrame({
    'Text': X_test.tolist(),
    'Actual': y_true,
    'Predicted': y_pred
})
results.to_csv('/content/drive/My Drive/test3_res.csv', index=False)
# Report the file that was actually written (previously printed 'test3.csv',
# which is the name of a different file read by the inference cell).
print("Predictions saved to 'test3_res.csv'.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1/3: 100%|██████████| 386/386 [01:11<00:00,  5.42it/s]


Epoch 1 Loss: 0.7461


Training Epoch 2/3: 100%|██████████| 386/386 [01:09<00:00,  5.52it/s]


Epoch 2 Loss: 0.3396


Training Epoch 3/3: 100%|██████████| 386/386 [01:09<00:00,  5.53it/s]


Epoch 3 Loss: 0.1818


Evaluating: 100%|██████████| 97/97 [00:05<00:00, 16.17it/s]

Test Accuracy: 0.8749
Predictions saved to 'test3.csv'.





In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from google.colab import drive

# Mount Google Drive at /content/drive; this cell reads test3.csv from it and
# writes test3_predictions.csv back to it.
drive.mount('/content/drive')

class SentimentDatasetTest(Dataset):
    """Label-free map-style dataset used to run inference on raw texts.

    Args:
        texts: Indexable sequence of input texts.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors="pt")``.
            Its result must expose ``'input_ids'`` and ``'attention_mask'``
            tensors with a leading dimension that can be squeezed away.
        max_length: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, tokenizer, max_length):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return how many texts the dataset holds."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize text ``idx``.

        Returns:
            dict with keys ``'input_ids'`` and ``'attention_mask'``, each the
            tokenizer output with dimension 0 squeezed.
        """
        raw_text = self.texts[idx]
        tokenizer_options = dict(
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )
        encoded = self.tokenizer(raw_text, **tokenizer_options)
        # Drop the leading batch dimension added by return_tensors="pt".
        ids = encoded['input_ids'].squeeze(0)
        mask = encoded['attention_mask'].squeeze(0)
        return {'input_ids': ids, 'attention_mask': mask}

# --- Score the unlabelled texts with the fine-tuned model ------------------
# This cell uses `model` and `tokenizer` without creating them, so it relies
# on the training cell having been run earlier in the same kernel.
new_df = pd.read_csv('/content/drive/My Drive/test3.csv')

# Missing texts become empty strings and every value is cast to str, so the
# tokenizer always receives a string.
new_df['Text'] = new_df['Text'].fillna('').astype(str)

test_dataset = SentimentDatasetTest(new_df['Text'].tolist(), tokenizer, max_length=128)
# No shuffling: predictions must stay aligned with the rows of new_df.
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

y_pred = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting"):
        outputs = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )
        # NOTE(review): model and inputs are already on `device`, so this
        # transfer looks redundant; kept to leave behaviour untouched.
        logits = outputs.logits.to(device)
        y_pred.extend(torch.argmax(logits, dim=-1).cpu().tolist())

# Map class indices back by subtracting 1, mirroring the -1 shift the
# evaluation code applies to its predictions.
y_pred = [class_index - 1 for class_index in y_pred]

results = pd.DataFrame({'Text': new_df['Text'], 'Predicted': y_pred})
results.to_csv('/content/drive/My Drive/test3_predictions.csv', index=False)
print("Predictions saved to 'test3_predictions.csv'.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Predicting: 100%|██████████| 812/812 [00:49<00:00, 16.40it/s]

Predictions saved to 'test3_predictions.csv'.



