In [44]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.model_selection import train_test_split

In [45]:
# ------------------------------
# Load data
# ------------------------------
train_df = pd.read_csv("train_split.csv")
val_df   = pd.read_csv("val_split.csv")

# Clean text
def clean_text(text):
    text = text.lower()
    text = ''.join(c for c in text if c.isalnum() or c.isspace())
    text = ' '.join(text.split())
    return text

train_df['clean_text'] = train_df['text'].apply(clean_text)
val_df['clean_text']   = val_df['text'].apply(clean_text)

In [46]:
# ------------------------------
# Aspect multi-label encoding
# ------------------------------
mlb = MultiLabelBinarizer()
y_train_aspect = mlb.fit_transform(train_df.groupby('id')['aspectCategory'].apply(list))
y_val_aspect   = mlb.transform(val_df.groupby('id')['aspectCategory'].apply(list))

# Corresponding texts per id
X_train_aspect = train_df.groupby('id')['clean_text'].first().tolist()
X_val_aspect   = val_df.groupby('id')['clean_text'].first().tolist()

In [47]:
# ------------------------------
# Sentiment multi-class encoding
# ------------------------------
le_sent = LabelEncoder()
y_train_sent = le_sent.fit_transform(train_df['polarity'])
y_val_sent   = le_sent.transform(val_df['polarity'])
X_train_sent = train_df['aspectCategory'] + " : " + train_df['clean_text']
X_val_sent   = val_df['aspectCategory'] + " : " + val_df['clean_text']

In [49]:
# ------------------------------
# Tokenization (simple word2index)
# ------------------------------
# Convert all relevant text to list first
X_train_aspect_list = list(X_train_aspect)  # already grouped by id
X_train_sent_list   = list(X_train_sent)    # full row-wise

# Build vocabulary from training texts only
word2idx = {"<PAD>": 0}  # fixed mapping

for text in X_train_aspect_list + X_train_sent_list:
    for word in text.split():
        if word not in word2idx:
            word2idx[word] = len(word2idx)

vocab_size = len(word2idx)
print("Vocabulary size:", vocab_size)

# Tokenizer function
def tokenize(text):
    # Map unknown words to <PAD> (index 0)
    return [word2idx.get(word, 0) for word in text.split()]


Vocabulary size: 3724


In [56]:
class ABSADataset(Dataset):
    def __init__(self, texts, labels, max_len=50):
        self.texts = texts
        self.labels = labels
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = tokenize(self.texts[idx])
        tokens = tokens[:self.max_len] + [0]*(self.max_len - len(tokens))  # pad
        x = torch.tensor(tokens, dtype=torch.long)
        y = torch.tensor(self.labels[idx], dtype=torch.float if self.labels.ndim>1 else torch.long)
        return x, y

# Max sequence lengths
max_len_aspect = max(len(t.split()) for t in X_train_aspect)
max_len_sent   = max(len(t.split()) for t in X_train_sent)

# Datasets and loaders
train_aspect_ds = ABSADataset(X_train_aspect, y_train_aspect, max_len_aspect)
val_aspect_ds   = ABSADataset(X_val_aspect, y_val_aspect, max_len_aspect)
train_sent_ds   = ABSADataset(X_train_sent, y_train_sent, max_len_sent)
val_sent_ds     = ABSADataset(X_val_sent, y_val_sent, max_len_sent)

batch_size = 16
train_aspect_loader = DataLoader(train_aspect_ds, batch_size=batch_size, shuffle=True)
val_aspect_loader   = DataLoader(val_aspect_ds, batch_size=batch_size)
train_sent_loader   = DataLoader(train_sent_ds, batch_size=batch_size, shuffle=True)
val_sent_loader     = DataLoader(val_sent_ds, batch_size=batch_size)


In [75]:
class DAN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, task='multi-label'):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.fc1 = nn.Linear(embed_dim, hidden_dim)
        self.dropout = nn.Dropout(0.4)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.task = task

    def forward(self, x):
        emb = self.embedding(x)           # (batch, seq_len, embed_dim)
        avg_emb = emb.mean(dim=1)         # average over words
        out = F.relu(self.fc1(avg_emb))
        out = self.dropout(out)
        if self.task == 'multi-label':
            return torch.sigmoid(self.fc2(out))
        else:
            return self.fc2(out)


In [76]:
def train_model(model, loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        y_pred = model(x)
        loss = criterion(y_pred, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * x.size(0)
    return total_loss / len(loader.dataset)

def eval_model(model, loader, criterion, device, task='multi-label'):
    model.eval()
    total_loss = 0
    all_preds, all_labels = [], []
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            y_pred = model(x)
            loss = criterion(y_pred, y)
            total_loss += loss.item() * x.size(0)
            all_preds.append(y_pred.cpu())
            all_labels.append(y.cpu())
    all_preds = torch.cat(all_preds)
    all_labels = torch.cat(all_labels)
    if task == 'multi-label':
        all_preds_bin = (all_preds > 0.5).int()
        return total_loss / len(loader.dataset), all_preds_bin, all_labels
    else:
        all_preds_class = torch.argmax(all_preds, dim=1)
        return total_loss / len(loader.dataset), all_preds_class, all_labels


In [78]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Aspect classifier
dan_aspect = DAN(vocab_size, embed_dim=300, hidden_dim=256, output_dim=y_train_aspect.shape[1], task='multi-label').to(device)
optimizer_aspect = torch.optim.Adam(dan_aspect.parameters(), lr=3e-5)
criterion_aspect = nn.BCELoss()

for epoch in range(40):
    train_loss = train_model(dan_aspect, train_aspect_loader, optimizer_aspect, criterion_aspect, device)
    val_loss, val_preds_aspect, val_labels_aspect = eval_model(dan_aspect, val_aspect_loader, criterion_aspect, device)
    print(f"Epoch {epoch+1}: Train Loss={train_loss:.4f}, Val Loss={val_loss:.4f}")



Epoch 1: Train Loss=0.6929, Val Loss=0.6852
Epoch 2: Train Loss=0.6803, Val Loss=0.6706
Epoch 3: Train Loss=0.6659, Val Loss=0.6530
Epoch 4: Train Loss=0.6483, Val Loss=0.6319
Epoch 5: Train Loss=0.6279, Val Loss=0.6082
Epoch 6: Train Loss=0.6071, Val Loss=0.5849
Epoch 7: Train Loss=0.5873, Val Loss=0.5644
Epoch 8: Train Loss=0.5710, Val Loss=0.5480
Epoch 9: Train Loss=0.5577, Val Loss=0.5356
Epoch 10: Train Loss=0.5478, Val Loss=0.5265
Epoch 11: Train Loss=0.5396, Val Loss=0.5196
Epoch 12: Train Loss=0.5335, Val Loss=0.5142
Epoch 13: Train Loss=0.5285, Val Loss=0.5098
Epoch 14: Train Loss=0.5236, Val Loss=0.5060
Epoch 15: Train Loss=0.5187, Val Loss=0.5027
Epoch 16: Train Loss=0.5139, Val Loss=0.4994
Epoch 17: Train Loss=0.5102, Val Loss=0.4964
Epoch 18: Train Loss=0.5057, Val Loss=0.4935
Epoch 19: Train Loss=0.5026, Val Loss=0.4908
Epoch 20: Train Loss=0.4979, Val Loss=0.4882
Epoch 21: Train Loss=0.4952, Val Loss=0.4856
Epoch 22: Train Loss=0.4902, Val Loss=0.4831
Epoch 23: Train Los

In [82]:
# Sentiment classifier
dan_sent = DAN(vocab_size, embed_dim=300, hidden_dim=256, output_dim=len(le_sent.classes_), task='multi-class').to(device)
optimizer_sent = torch.optim.Adam(dan_sent.parameters(), lr=1e-4)
criterion_sent = nn.CrossEntropyLoss()

for epoch in range(30):
    train_loss = train_model(dan_sent, train_sent_loader, optimizer_sent, criterion_sent, device)
    val_loss, val_preds_sent, val_labels_sent = eval_model(dan_sent, val_sent_loader, criterion_sent, device, task='multi-class')
    print(f"Epoch {epoch+1}: Train Loss={train_loss:.4f}, Val Loss={val_loss:.4f}")


Epoch 1: Train Loss=1.3176, Val Loss=1.2156
Epoch 2: Train Loss=1.1361, Val Loss=1.0786
Epoch 3: Train Loss=1.0603, Val Loss=1.0475
Epoch 4: Train Loss=1.0369, Val Loss=1.0297
Epoch 5: Train Loss=1.0132, Val Loss=1.0128
Epoch 6: Train Loss=0.9931, Val Loss=0.9956
Epoch 7: Train Loss=0.9719, Val Loss=0.9795
Epoch 8: Train Loss=0.9465, Val Loss=0.9618
Epoch 9: Train Loss=0.9276, Val Loss=0.9463
Epoch 10: Train Loss=0.9018, Val Loss=0.9316
Epoch 11: Train Loss=0.8811, Val Loss=0.9189
Epoch 12: Train Loss=0.8634, Val Loss=0.9059
Epoch 13: Train Loss=0.8430, Val Loss=0.8950
Epoch 14: Train Loss=0.8252, Val Loss=0.8848
Epoch 15: Train Loss=0.8012, Val Loss=0.8763
Epoch 16: Train Loss=0.7922, Val Loss=0.8679
Epoch 17: Train Loss=0.7692, Val Loss=0.8602
Epoch 18: Train Loss=0.7515, Val Loss=0.8537
Epoch 19: Train Loss=0.7359, Val Loss=0.8481
Epoch 20: Train Loss=0.7170, Val Loss=0.8432
Epoch 21: Train Loss=0.7029, Val Loss=0.8383
Epoch 22: Train Loss=0.6875, Val Loss=0.8339
Epoch 23: Train Los

In [None]:
pred_rows = []
aspect_labels = mlb.classes_

dan_aspect.eval()
dan_sent.eval()

with torch.no_grad():
    for i, text in enumerate(X_val_aspect):
        tokens = tokenize(text)
        tokens = tokens[:max_len_aspect] + [0]*(max_len_aspect - len(tokens))
        x_tensor = torch.tensor([tokens], dtype=torch.long).to(device)
        aspect_probs = dan_aspect(x_tensor)
        aspect_pred = (aspect_probs > 0.5)[0].cpu().numpy()
        for j, val in enumerate(aspect_pred):
            if val == 1:
                aspect_name = aspect_labels[j]
                input_text = aspect_name + " : " + text
                tokens_sent = tokenize(input_text)
                tokens_sent = tokens_sent[:max_len_sent] + [0]*(max_len_sent - len(tokens_sent))
                x_sent = torch.tensor([tokens_sent], dtype=torch.long).to(device)
                pred_sent = torch.argmax(dan_sent(x_sent), dim=1).item()
                pred_sent_label = le_sent.inverse_transform([pred_sent])[0]
                pred_rows.append({"id": val_df.iloc[i]["id"], "aspectCategory": aspect_name, "polarity": pred_sent_label})

val_pred = pd.DataFrame(pred_rows)
val_pred.to_csv("val_pred_DAN_PyTorch.csv", index=False)
val_df[['id', 'aspectCategory', 'polarity']].to_csv("val_truth.csv", index=False)
