In [29]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from collections import Counter
import string
import re
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

# Check if CUDA is available
is_cuda = torch.cuda.is_available()

# If a GPU is available, set the device to GPU, otherwise, use CPU
if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

# Load your dataset
df = pd.read_csv("cleaned_village-hotel-changi-by-far-east-hospitality.csv")

# Filter out rows with NaN values in the rating column
valid_df = df.dropna(subset=['label'])

# Define a function to assign labels based on rating values
def assign_labels(rating):
    if rating == "Positive":
        return 1  # Positive
    elif rating == "Negative":
        return -1  # Negative
    else:
        return 0  # Neutral

# Apply the function to create a new 'sentiment' column
valid_df['sentiment'] = valid_df['label'].apply(assign_labels)

# Split the data into training and test sets
X = valid_df['combined_review'].values
y = valid_df['sentiment'].values

# Tokenization and Preprocessing (modify as needed)
def preprocess_string(s):
    s = re.sub(r"[^\w\s]", '', s)
    s = re.sub(r"\s+", ' ', s)
    s = re.sub(r"\d", '', s)
    return s

def tokenize(x_data):
    word_list = []
    for sent in x_data:
        for word in sent.lower().split():
            word = preprocess_string(word)
            if word != '':
                word_list.append(word)

    corpus = Counter(word_list)
    corpus_ = sorted(corpus, key=corpus.get, reverse=True)[:1000]
    onehot_dict = {w: i + 1 for i, w in enumerate(corpus_)}

    final_list = []
    for sent in x_data:
        final_list.append([onehot_dict[preprocess_string(word)] for word in sent.lower().split()
                           if preprocess_string(word) in onehot_dict.keys()])

    return np.array(final_list, dtype=object), onehot_dict

x_data, vocab = tokenize(X)

# Hyperparameters (modify as needed)
no_layers = 2
vocab_size = len(vocab) + 1  # Extra 1 for padding
embedding_dim = 64
output_dim = 1
hidden_dim = 256
lr = 0.001
batch_size = 50

# Padding
def padding_(sentences, seq_len):
    features = np.zeros((len(sentences), seq_len), dtype=int)
    for ii, review in enumerate(sentences):
        if len(review) != 0:
            features[ii, -len(review):] = np.array(review)[:seq_len]
    return features

x_data_pad = padding_(x_data, 500)
y_data = y

# Create Tensor datasets
data = TensorDataset(torch.from_numpy(x_data_pad), torch.from_numpy(y_data))

# Split the data into training and test sets
train_size = int(0.8 * len(data))
test_size = len(data) - train_size
train_data, test_data = torch.utils.data.random_split(data, [train_size, test_size])

# Dataloaders
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

# Model structure
class SentimentRNN(nn.Module):
    def __init__(self, no_layers, vocab_size, hidden_dim, embedding_dim):
        super(SentimentRNN, self).__init__()

        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.no_layers = no_layers
        self.vocab_size = vocab_size

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=self.hidden_dim, num_layers=no_layers, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(self.hidden_dim, output_dim)
        self.sig = nn.Sigmoid()

    def forward(self, x, hidden):
        batch_size = x.size(0)  # Calculate the actual batch size
        # Initialize the hidden state based on the current batch size
        h0 = torch.zeros(self.no_layers, batch_size, self.hidden_dim).to(device)
        c0 = torch.zeros(self.no_layers, batch_size, self.hidden_dim).to(device)
        hidden = (h0, c0)

        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        out = self.dropout(lstm_out)
        out = self.fc(out)
        sig_out = self.sig(out)
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1]
        return sig_out, hidden

    def init_hidden(self, batch_size):
        h0 = torch.zeros((self.no_layers, batch_size, self.hidden_dim)).to(device)
        c0 = torch.zeros((self.no_layers, batch_size, self.hidden_dim)).to(device)
        hidden = (h0, c0)
        return hidden

# Model and optimizer
model = SentimentRNN(no_layers, vocab_size, hidden_dim, embedding_dim)
model.to(device)

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Function to predict accuracy
def acc(pred, label):
    pred = torch.round(pred.squeeze())
    return torch.sum(pred == label.squeeze()).item()

# Training and Evaluation
clip = 5
epochs = 20
valid_loss_min = np.Inf

epoch_tr_loss, epoch_vl_loss = [], []
epoch_tr_acc, epoch_vl_acc = [], []

for epoch in range(epochs):
    train_losses = []
    train_acc = 0.0
    model.train()
    h = model.init_hidden(batch_size)
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        h = tuple([each.data for each in h])
        model.zero_grad()
        output, h = model(inputs, h)
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        train_losses.append(loss.item())
        accuracy = acc(output, labels)
        train_acc += accuracy
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

    model.eval()
    val_h = model.init_hidden(batch_size)
    val_losses = []
    val_acc = 0.0
    for inputs, labels in test_loader:
        val_h = tuple([each.data for each in val_h])
        inputs, labels = inputs.to(device), labels.to(device)
        output, val_h = model(inputs, val_h)
        val_loss = criterion(output.squeeze(), labels.float())
        val_losses.append(val_loss.item())
        accuracy = acc(output, labels)
        val_acc += accuracy

    epoch_train_loss = np.mean(train_losses)
    epoch_val_loss = np.mean(val_losses)
    epoch_train_acc = train_acc / len(train_loader.dataset)
    epoch_val_acc = val_acc / len(test_loader.dataset)
    epoch_tr_loss.append(epoch_train_loss)
    epoch_vl_loss.append(epoch_val_loss)
    epoch_tr_acc.append(epoch_train_acc)
    epoch_vl_acc.append(epoch_val_acc)
    print(f'Epoch {epoch + 1}')
    print(f'train_loss: {epoch_train_loss} val_loss: {epoch_val_loss}')
    print(f'train_accuracy: {epoch_train_acc * 100} val_accuracy: {epoch_val_acc * 100}')
    print(25 * '==')


GPU not available, CPU used


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df['sentiment'] = valid_df['label'].apply(assign_labels)


Epoch 1
train_loss: 0.6145607233047485 val_loss: 0.7047645449638367
train_accuracy: 82.01058201058201 val_accuracy: 80.0
Epoch 2
train_loss: 0.5367457158863544 val_loss: 0.557709738612175
train_accuracy: 82.27513227513228 val_accuracy: 80.0
Epoch 3
train_loss: 0.4736872501671314 val_loss: 0.8566180169582367
train_accuracy: 82.27513227513228 val_accuracy: 80.0
Epoch 4
train_loss: 0.46337663009762764 val_loss: 0.5234310477972031
train_accuracy: 81.74603174603175 val_accuracy: 82.10526315789474
Epoch 5
train_loss: 0.34272936172783375 val_loss: 0.5522213876247406
train_accuracy: 79.36507936507937 val_accuracy: 81.05263157894737
Epoch 6
train_loss: 0.28280456084758043 val_loss: 0.6185223460197449
train_accuracy: 75.92592592592592 val_accuracy: 82.10526315789474
Epoch 7
train_loss: 0.20807126304134727 val_loss: 0.5651834607124329
train_accuracy: 82.8042328042328 val_accuracy: 74.73684210526315
Epoch 8
train_loss: 0.03989373636431992 val_loss: 0.7888500988483429
train_accuracy: 83.59788359788

In [18]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from collections import Counter
import string
import re
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from gensim import corpora, models
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from gensim import corpora
from gensim.models import LdaModel

# Check if CUDA is available
is_cuda = torch.cuda.is_available()

# If a GPU is available, set the device to GPU, otherwise, use CPU
if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

# Load your dataset
df = pd.read_csv("cleaned_village-hotel-changi-by-far-east-hospitality.csv")

# Filter out rows with NaN values in the rating column
valid_df = df.dropna(subset=['label'])

# Define a function to assign labels based on rating values
def assign_labels(rating):
    if rating == "Positive":
        return 1  # Positive
    elif rating == "Negative":
        return -1  # Negative
    else:
        return 0  # Neutral

# Apply the function to create a new 'sentiment' column
valid_df['sentiment'] = valid_df['label'].apply(assign_labels)

# Split the data into training and test sets
X = valid_df['cleaned_review'].values
y = valid_df['sentiment'].values

# Tokenization and Preprocessing (modify as needed)
def preprocess_string(s):
    s = re.sub(r"[^\w\s]", '', s)
    s = re.sub(r"\s+", ' ', s)
    s = re.sub(r"\d", '', s)
    return s

def tokenize(x_data):
    word_list = []
    for sent in x_data:
        for word in sent.lower().split():
            word = preprocess_string(word)
            if word != '':
                word_list.append(word)

    corpus = Counter(word_list)
    corpus_ = sorted(corpus, key=corpus.get, reverse=True)[:1000]
    onehot_dict = {w: i + 1 for i, w in enumerate(corpus_)}

    final_list = []
    for sent in x_data:
        final_list.append([onehot_dict[preprocess_string(word)] for word in sent.lower().split()
                           if preprocess_string(word) in onehot_dict.keys()])

    return np.array(final_list, dtype=object), onehot_dict

x_data, vocab = tokenize(X)

# Function to train the LDA model
def train_lda(x_data, num_topics=5):
    processed_data = [' '.join(map(str, doc)) for doc in x_data]  # Convert tokenized data to strings
    texts = [doc.split() for doc in processed_data]
    
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary)
    return lda_model

lda_model = train_lda(x_data, num_topics=5)

# Function to extract LDA features
def extract_lda_features(lda_model, x_data, num_topics):
    lda_features = []
    for doc in x_data:
        doc_str = ' '.join(map(str, doc))
        bow = lda_model.id2word.doc2bow(doc_str.split())
        topic_distribution = lda_model.get_document_topics(bow)
        topic_distribution = [score for _, score in topic_distribution]

        # Pad the topic distribution to have a fixed length (num_topics)
        topic_distribution += [0.0] * (num_topics - len(topic_distribution))

        lda_features.append(topic_distribution)

    return np.array(lda_features)

# Usage: Pass the 'num_topics' as an argument
lda_features = extract_lda_features(lda_model, x_data, num_topics=5)

# Hyperparameters (modify as needed)
no_layers = 2
vocab_size = len(vocab) + 1  # Extra 1 for padding
embedding_dim = 64
output_dim = 1
hidden_dim = 256
lr = 0.001
batch_size = 50

# Padding
def padding_(sentences, seq_len):
    features = np.zeros((len(sentences), seq_len), dtype=int)
    for ii, review in enumerate(sentences):
        if len(review) != 0:
            features[ii, -len(review):] = np.array(review)[:seq_len]
    return features

x_data_pad = padding_(x_data, 500)
lda_data_pad = padding_(lda_features, 5)  # Assuming LDA vectors have a length of 5

# Combine the padded LDA data and tokenized text data
combined_data = [list(lda_data_pad[i]) + list(x_data_pad[i]) for i in range(len(lda_data_pad))]

# Create Tensor datasets
y_data = y
data = TensorDataset(torch.from_numpy(np.array(combined_data)), torch.from_numpy(y_data))

# Split the data into training and test sets
train_size = int(0.8 * len(data))
test_size = len(data) - train_size
train_data, test_data = torch.utils.data.random_split(data, [train_size, test_size])

# Dataloaders
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

class SentimentRNN(nn.Module):
    def __init__(self, no_layers, input_dim, hidden_dim, embedding_dim):
        super(SentimentRNN, self).__init__()
        self.output_dim = 1
        self.hidden_dim = hidden_dim
        self.no_layers = no_layers
        self.vocab_size = input_dim

        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=self.hidden_dim, num_layers=no_layers, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(self.hidden_dim, self.output_dim)
        self.sig = nn.Sigmoid()

    def forward(self, x):
        batch_size = x.size(0)  # Calculate the actual batch size
        h0 = torch.zeros(self.no_layers, batch_size, self.hidden_dim).to(device)
        c0 = torch.zeros(self.no_layers, batch_size, self.hidden_dim).to(device)
        hidden = (h0, c0)

        embeds = self.embedding(x)
        lstm_out, _ = self.lstm(embeds, hidden)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        out = self.dropout(lstm_out)
        out = self.fc(out)
        sig_out = self.sig(out)
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1]
        return sig_out

model = SentimentRNN(no_layers, vocab_size, hidden_dim, embedding_dim)
model.to(device)


criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Function to predict accuracy
def acc(pred, label):
    pred = torch.round(pred.squeeze())
    return torch.sum(pred == label.squeeze()).item()

# Training and Evaluation
clip = 5
epochs = 6
valid_loss_min = np.Inf

epoch_tr_loss, epoch_vl_loss = [], []
epoch_tr_acc, epoch_vl_acc = [], []

for epoch in range(epochs):
    train_losses = []
    train_acc = 0.0
    model.train()
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        output = model(inputs)
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        train_losses.append(loss.item())
        accuracy = acc(output, labels)
        train_acc += accuracy
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

    model.eval()
    val_losses = []
    val_acc = 0.0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            output = model(inputs)
            val_loss = criterion(output.squeeze(), labels.float())
            val_losses.append(val_loss.item())
            accuracy = acc(output, labels)
            val_acc += accuracy

    epoch_train_loss = np.mean(train_losses)
    epoch_val_loss = np.mean(val_losses)
    epoch_train_acc = train_acc / len(train_loader.dataset)
    epoch_val_acc = val_acc / len(test_loader.dataset)
    epoch_tr_loss.append(epoch_train_loss)
    epoch_vl_loss.append(epoch_val_loss)
    epoch_tr_acc.append(epoch_train_acc)
    epoch_vl_acc.append(epoch_val_acc)
    print(f'Epoch {epoch + 1}')
    print(f'train_loss: {epoch_train_loss} val_loss: {epoch_val_loss}')
    print(f'train_accuracy: {epoch_train_acc * 100} val_accuracy: {epoch_val_acc * 100}')
    print(25 * '==')


GPU not available, CPU used


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df['sentiment'] = valid_df['label'].apply(assign_labels)


Epoch 1
train_loss: 0.6493656486272812 val_loss: 0.4715089201927185
train_accuracy: 73.28042328042328 val_accuracy: 85.26315789473684
Epoch 2
train_loss: 0.5472150258719921 val_loss: 0.5086666643619537
train_accuracy: 80.95238095238095 val_accuracy: 85.26315789473684
Epoch 3
train_loss: 0.47524965554475784 val_loss: 0.4235513359308243
train_accuracy: 81.48148148148148 val_accuracy: 84.21052631578947
Epoch 4
train_loss: 0.5058940704911947 val_loss: 0.49164576828479767
train_accuracy: 80.15873015873017 val_accuracy: 80.0
Epoch 5
train_loss: 0.3480592295527458 val_loss: 0.4467100650072098
train_accuracy: 78.83597883597884 val_accuracy: 80.0
Epoch 6
train_loss: 0.2473376113921404 val_loss: 0.5056129395961761
train_accuracy: 78.83597883597884 val_accuracy: 75.78947368421053
