Loading the datasets to a dataframe

In [1]:
#Load the datasets
import json
import pandas as pd

# Load JSON data into a Python dictionary
with open('train.json', 'r') as f:
    data = json.load(f)

# Extract data from the dictionary
columns, index, data_values = data["columns"], data["index"], data["data"]

# Create a DataFrame using the extracted data
train_df = pd.DataFrame(data_values, columns=columns, index=index)

# Load JSON data into a Python dictionary for val.json
with open('val.json', 'r') as f:
    val_data = json.load(f)

# Extract data from the val.json dictionary
val_columns, val_index, val_data_values = val_data["columns"], val_data["index"], val_data["data"]

# Create a DataFrame for val.json using the extracted data
val_df = pd.DataFrame(val_data_values, columns=val_columns, index=val_index)

# Load JSON data into a Python dictionary for test.json
with open('test.json', 'r') as f:
    test_data = json.load(f)

# Extract data from the test.json dictionary
test_columns, test_index, test_data_values = test_data["columns"], test_data["index"], test_data["data"]

# Create a DataFrame for test.json using the extracted data
test_df = pd.DataFrame(test_data_values, columns=test_columns, index=test_index)

#check whether the data is loaded successfully
print(train_df.head())
print(val_df.head())
print(test_df.head())

                                            sentence         aspect  polarity
0  It might be the best sit down food I've had in...           food  positive
1  It might be the best sit down food I've had in...          place   neutral
2  Hostess was extremely accommodating when we ar...          staff  positive
3  Hostess was extremely accommodating when we ar...  miscellaneous   neutral
4  We were a couple of minutes late for our reser...  miscellaneous   neutral
                                            sentence         aspect  polarity
0  I would wait for a table next time, the food w...  miscellaneous   neutral
1  I would wait for a table next time, the food w...           food  positive
2  We did complain to the manager, but she just s...          staff  negative
3  We did complain to the manager, but she just s...           food   neutral
4  the service was inattentive (didn't bring us w...        service  negative
                                            sentence   aspect  p

Data Preprocessing

In [2]:
import spacy
import re
import nltk
from nltk.corpus import stopwords as sw
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import inflect

# Function to remove punctuation using regular expressions
def remove_punctuation_re(x):
    x = re.sub(r'[^\w\s]', '', x)
    return x
# Apply punctuation removal to the 'sentence' column in train_df, val_df, and test_df
train_df['preprocessed_sentence'] = train_df['sentence'].apply(remove_punctuation_re)
val_df['preprocessed_sentence'] = val_df['sentence'].apply(remove_punctuation_re)
test_df['preprocessed_sentence'] = test_df['sentence'].apply(remove_punctuation_re)

# Download NLTK resources
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger')

# Initialize WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Function for tokenization and stopwords removal
def preprocess_text(text):
    tokens = word_tokenize(text.lower())  # Tokenize and convert to lowercase
    stop_words = set(sw.words())  # Get English stopwords
    filtered_tokens = [w for w in tokens if not w in stop_words]  # Remove stopwords
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]  # Lemmatize tokens
    preprocessed_text = ' '.join(lemmatized_tokens)  # Join tokens back into a string
    return preprocessed_text
# Apply preprocessing to the 'preprocessed_sentence' column in train_df, val_df, and test_df
train_df['preprocessed_sentence'] = train_df['preprocessed_sentence'].apply(preprocess_text)
val_df['preprocessed_sentence'] = val_df['preprocessed_sentence'].apply(preprocess_text)
test_df['preprocessed_sentence'] = test_df['preprocessed_sentence'].apply(preprocess_text)

# Initialize the inflect engine
p = inflect.engine()

# Function for text normalization
def normalize_text(text):
    # Normalize numbers: replace digits with their word forms
    words = text.split()
    normalized_words = []
    for word in words:
        if word.isdigit():
            normalized_word = p.number_to_words(word)
            normalized_words.append(normalized_word)
        else:
            normalized_words.append(word)
    normalized_text = ' '.join(normalized_words)
    
    # Normalize dates: replace dates with a placeholder token
    normalized_text = re.sub(r'\b\d{1,2}/\d{1,2}/\d{2,4}\b', 'DATE', normalized_text)  # Example: 1/1/21, 10/31/2022, etc.
    
    # Add more normalization rules as needed
    
    return normalized_text

# Apply text normalization to the 'preprocessed_sentence' column in train_df, val_df, and test_df
train_df['preprocessed_sentence'] = train_df['preprocessed_sentence'].apply(normalize_text)
val_df['preprocessed_sentence'] = val_df['preprocessed_sentence'].apply(normalize_text)
test_df['preprocessed_sentence'] = test_df['preprocessed_sentence'].apply(normalize_text)

In [3]:
# Tokenize and tag the text data using NLTK
def pos_tagging(text):
    tokens = nltk.word_tokenize(text)  # Tokenize the text
    pos_tags = nltk.pos_tag(tokens)  # Perform POS tagging
    return pos_tags

# Apply POS tagging to the 'preprocessed_sentence' column in train_df, val_df, and test_df
train_df['pos_tags'] = train_df['preprocessed_sentence'].apply(pos_tagging)
val_df['pos_tags'] = val_df['preprocessed_sentence'].apply(pos_tagging)
test_df['pos_tags'] = test_df['preprocessed_sentence'].apply(pos_tagging)

In [4]:
print(train_df.columns)
print(train_df.preprocessed_sentence.head())
print(train_df.pos_tags.head())

Index(['sentence', 'aspect', 'polarity', 'preprocessed_sentence', 'pos_tags'], dtype='object')
0    food ive area upright citizen brigade garden p...
1    food ive area upright citizen brigade garden p...
2    hostess extremely accommodating arrived hour e...
3    hostess extremely accommodating arrived hour e...
4    couple minute late reservation minus guest des...
Name: preprocessed_sentence, dtype: object
0    [(food, NN), (ive, JJ), (area, NN), (upright, ...
1    [(food, NN), (ive, JJ), (area, NN), (upright, ...
2    [(hostess, NN), (extremely, RB), (accommodatin...
3    [(hostess, NN), (extremely, RB), (accommodatin...
4    [(couple, JJ), (minute, NN), (late, JJ), (rese...
Name: pos_tags, dtype: object


In [5]:
print(val_df.columns)
print(val_df.preprocessed_sentence.head())
print(val_df.pos_tags.head())

Index(['sentence', 'aspect', 'polarity', 'preprocessed_sentence', 'pos_tags'], dtype='object')
0                                 wait table time food
1                                 wait table time food
2          complain manager problem kitchen drink bill
3          complain manager problem kitchen drink bill
4    service inattentive bring wine course served o...
Name: preprocessed_sentence, dtype: object
0    [(wait, NN), (table, JJ), (time, NN), (food, NN)]
1    [(wait, NN), (table, JJ), (time, NN), (food, NN)]
2    [(complain, NN), (manager, NN), (problem, NN),...
3    [(complain, NN), (manager, NN), (problem, NN),...
4    [(service, NN), (inattentive, JJ), (bring, JJ)...
Name: pos_tags, dtype: object


In [6]:
print(test_df.columns)
print(test_df.preprocessed_sentence.head())
print(test_df.pos_tags.head())

Index(['sentence', 'aspect', 'polarity', 'preprocessed_sentence', 'pos_tags'], dtype='object')
0    sat bar time five pint guinness buyback ordere...
1    sat bar time five pint guinness buyback ordere...
2                      food worth waitor lousy service
3                      food worth waitor lousy service
4                    waiter drink order fifteen minute
Name: preprocessed_sentence, dtype: object
0    [(sat, JJ), (bar, NN), (time, NN), (five, CD),...
1    [(sat, JJ), (bar, NN), (time, NN), (five, CD),...
2    [(food, NN), (worth, NN), (waitor, NN), (lousy...
3    [(food, NN), (worth, NN), (waitor, NN), (lousy...
4    [(waiter, NN), (drink, NN), (order, NN), (fift...
Name: pos_tags, dtype: object


In [26]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

# Define constants
embedding_dim = 100  # Example embedding dimension
aspect_size = 8      # Number of unique aspects
output_size = 3      # Number of sentiment classes
pos_embedding_dim = 50  # Embedding dimension for POS tags
hidden_size = 64

# Build vocabulary
word_to_idx = {}
idx_to_word = {}
word_idx = 0

for dataframe in [train_df, val_df, test_df]:
    for sentence in dataframe['preprocessed_sentence']:
        for word in sentence.split():
            if word not in word_to_idx:
                word_to_idx[word] = word_idx
                idx_to_word[word_idx] = word
                word_idx += 1

# Add padding token to vocabulary
word_to_idx['<PAD>'] = word_idx
idx_to_word[word_idx] = '<PAD>'

# Define the size of the vocabulary
vocab_size = len(word_to_idx)

# Define a mapping from sentiment labels to indices
label_to_idx = {'negative': 0, 'neutral': 1, 'positive': 2}

# Create a mapping from pos tags to indices
all_pos_tags = set()
for dataframe in [train_df, val_df, test_df]:
    dataframe['pos_tags'].apply(lambda tags: all_pos_tags.update(tag for _, tag in tags))
pos_to_idx = {tag: idx for idx, tag in enumerate(all_pos_tags)}
pos_to_idx['<UNK>'] = len(pos_to_idx)  # Add unknown token for any tags not seen in training
pos_vocab_size = len(pos_to_idx)

In [27]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, word_to_idx, label_to_idx, aspect_to_idx, pos_to_idx):
        self.dataframe = dataframe
        self.word_to_idx = word_to_idx
        self.label_to_idx = label_to_idx
        self.aspect_to_idx = aspect_to_idx
        self.pos_to_idx = pos_to_idx

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        input_sentence = self.dataframe['preprocessed_sentence'].iloc[idx]
        aspect = self.dataframe['aspect'].iloc[idx]
        label = self.dataframe['polarity'].iloc[idx]
        pos_tags = self.dataframe['pos_tags'].iloc[idx]

        # Convert input sentence to numerical format (example conversion to indices)
        input_indices = [self.word_to_idx.get(word, self.word_to_idx['<PAD>']) for word in input_sentence.split()]
        input_tensor = torch.tensor(input_indices, dtype=torch.long)

        # Convert aspect to numerical format (example conversion to index)
        aspect_tensor = torch.tensor(self.aspect_to_idx[aspect], dtype=torch.long)

        # Convert label to tensor (convert string label to index)
        label_tensor = torch.tensor(self.label_to_idx[label], dtype=torch.long)

        # Convert pos tags to numerical format
        pos_indices = [self.pos_to_idx.get(pos, self.pos_to_idx['<UNK>']) for word, pos in pos_tags]
        pos_tensor = torch.tensor(pos_indices, dtype=torch.long)

        sample = {
            'input': input_tensor,
            'aspect': aspect_tensor,
            'label': label_tensor,
            'pos_tags': pos_tensor
        }
        return sample

        
#define padding function
def collate_fn(batch):
    inputs = [item['input'] for item in batch]
    aspects = [item['aspect'] for item in batch]
    labels = [item['label'] for item in batch]
    pos_tags = [item['pos_tags'] for item in batch]  # Add pos_tags to the collate function

    inputs_padded = torch.nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=word_to_idx['<PAD>'])
    aspects_padded = torch.stack(aspects)
    labels_padded = torch.stack(labels)
    pos_tags_padded = torch.nn.utils.rnn.pad_sequence(pos_tags, batch_first=True, padding_value=pos_to_idx['<UNK>'])  # Add padding for pos_tags

    return {'input': inputs_padded, 'aspect': aspects_padded, 'label': labels_padded, 'pos_tags': pos_tags_padded}


In [28]:
# Model Variant 1: LSTM with Aspect Concatenation
class LSTMWithAspectConcatenation(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size, aspect_size):
        super(LSTMWithAspectConcatenation, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.aspect_embedding = nn.Embedding(aspect_size, hidden_size)  # Aspect embedding
        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size * 2, output_size)

    def forward(self, input_tensor, aspect_tensor):
        embedded = self.embedding(input_tensor)
        lstm_output, _ = self.lstm(embedded)
        aspect_embed = self.aspect_embedding(aspect_tensor)
        combined = torch.cat((lstm_output[:, -1, :], aspect_embed), dim=1)
        output = self.fc(combined)
        return output

# Model Variant 2: LSTM with Aspect Attention
class LSTMWithAspectAttention(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size, aspect_size):
        super(LSTMWithAspectAttention, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.aspect_embedding = nn.Embedding(aspect_size, hidden_size)  # Aspect embedding
        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
        self.attn = nn.Linear(hidden_size * 2, 1)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input_tensor, aspect_tensor):
        embedded = self.embedding(input_tensor)
        lstm_output, _ = self.lstm(embedded)
        aspect_embed = self.aspect_embedding(aspect_tensor).unsqueeze(1).repeat(1, lstm_output.size(1), 1)
        combined = torch.cat((lstm_output, aspect_embed), dim=2)
        attn_weights = torch.softmax(self.attn(combined), dim=1)
        attn_output = torch.sum(attn_weights * lstm_output, dim=1)
        output = self.fc(attn_output)
        return output

# Model Variant 3: LSTM with Aspect Fusion using POS Tagging
class LSTMWithAspectFusionPOS(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size, pos_vocab_size, pos_embedding_dim, aspect_size):
        super(LSTMWithAspectFusionPOS, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.pos_embedding = nn.Embedding(pos_vocab_size, pos_embedding_dim)
        self.aspect_embedding = nn.Embedding(aspect_size, hidden_size)  # Aspect embedding
        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input_tensor, pos_tensor, aspect_tensor):
        embedded = self.embedding(input_tensor)
        pos_embed = self.pos_embedding(pos_tensor)
        lstm_output, _ = self.lstm(embedded)
        aspect_embed = self.aspect_embedding(aspect_tensor).unsqueeze(1).repeat(1, lstm_output.size(1), 1)
        fused_output = lstm_output + aspect_embed
        output = self.fc(fused_output[:, -1, :])
        return output

In [29]:
# Create a mapping from aspect labels to indices
aspect_to_idx = {aspect: idx for idx, aspect in enumerate(train_df['aspect'].unique())}

# Create datasets and dataloaders
train_dataset = CustomDataset(train_df, word_to_idx, label_to_idx, aspect_to_idx, pos_to_idx)
val_dataset = CustomDataset(val_df, word_to_idx, label_to_idx, aspect_to_idx, pos_to_idx)
test_dataset = CustomDataset(test_df, word_to_idx, label_to_idx, aspect_to_idx, pos_to_idx)

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10, pos_tensor_required=False):
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for batch in train_loader:
            inputs, aspects, labels = batch['input'], batch['aspect'], batch['label']
            if pos_tensor_required:
                pos_tensor = batch['pos_tags']  # Ensure `pos_tags` is included in the batch
                optimizer.zero_grad()
                outputs = model(inputs, pos_tensor, aspects)
            else:
                optimizer.zero_grad()
                outputs = model(inputs, aspects)
            
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * inputs.size(0)

        train_loss = train_loss / len(train_loader.dataset)

        # Validation
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch in val_loader:
                inputs, aspects, labels = batch['input'], batch['aspect'], batch['label']
                if pos_tensor_required:
                    pos_tensor = batch['pos_tags']
                    outputs = model(inputs, pos_tensor, aspects)
                else:
                    outputs = model(inputs, aspects)
                
                loss = criterion(outputs, labels)
                val_loss += loss.item() * inputs.size(0)

        val_loss = val_loss / len(val_loader.dataset)

        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

# Define your models
model1 = LSTMWithAspectConcatenation(vocab_size, embedding_dim, hidden_size, output_size, len(aspect_to_idx))
model2 = LSTMWithAspectAttention(vocab_size, embedding_dim, hidden_size, output_size, len(aspect_to_idx))
model3 = LSTMWithAspectFusionPOS(vocab_size, embedding_dim, hidden_size, output_size, pos_vocab_size, pos_embedding_dim, len(aspect_to_idx))

# Define your loss function
criterion = nn.CrossEntropyLoss()

# Define your optimizer and specify model parameters and learning rate
optimizer1 = optim.Adam(model1.parameters(), lr=0.001)
optimizer2 = optim.Adam(model2.parameters(), lr=0.001)
optimizer3 = optim.Adam(model3.parameters(), lr=0.001)

# Train models
print(f'\nTraining LSTMWithAspectConcatenation(Model1) with hidden size {hidden_size}')
train_model(model1, train_loader, val_loader, criterion, optimizer1, num_epochs=10)
print(f'\nTraining LSTMWithAspectAttention(Model2) with hidden size {hidden_size}')
train_model(model2, train_loader, val_loader, criterion, optimizer2, num_epochs=10)
print(f'\nTraining LSTMWithAspectFusionPOS(Model3) with hidden size {hidden_size}')
train_model(model3, train_loader, val_loader, criterion, optimizer3, num_epochs=10, pos_tensor_required=True)


Training LSTMWithAspectConcatenation(Model1) with hidden size 64
Epoch 1/10, Train Loss: 0.9474, Val Loss: 0.9022
Epoch 2/10, Train Loss: 0.9229, Val Loss: 0.9024
Epoch 3/10, Train Loss: 0.9214, Val Loss: 0.9061
Epoch 4/10, Train Loss: 0.9045, Val Loss: 0.8680
Epoch 5/10, Train Loss: 0.8406, Val Loss: 0.8485
Epoch 6/10, Train Loss: 0.7782, Val Loss: 0.8329
Epoch 7/10, Train Loss: 0.7313, Val Loss: 0.8311
Epoch 8/10, Train Loss: 0.6879, Val Loss: 0.8461
Epoch 9/10, Train Loss: 0.6581, Val Loss: 0.9014
Epoch 10/10, Train Loss: 0.6320, Val Loss: 0.8839

Training LSTMWithAspectAttention(Model2) with hidden size 64
Epoch 1/10, Train Loss: 1.0645, Val Loss: 1.0160
Epoch 2/10, Train Loss: 1.0050, Val Loss: 0.9979
Epoch 3/10, Train Loss: 0.9599, Val Loss: 0.9958
Epoch 4/10, Train Loss: 0.9165, Val Loss: 0.9922
Epoch 5/10, Train Loss: 0.8764, Val Loss: 1.0111
Epoch 6/10, Train Loss: 0.8413, Val Loss: 1.0321
Epoch 7/10, Train Loss: 0.8128, Val Loss: 1.0818
Epoch 8/10, Train Loss: 0.7921, Val Lo