In [None]:
import pandas as pd


In [None]:
# Read the CSV file
df = pd.read_csv('Amazon_Comments.csv', sep='^', header=None)
df.columns = ['ProductID', 'ReviewID', 'ReviewTitle', 'ReviewTime', 'Verified', 'ReviewContent', 'ReviewRating']

df.head()

Unnamed: 0,ProductID,ReviewID,ReviewTitle,ReviewTime,Verified,ReviewContent,ReviewRating
0,1,1,These are hands down the best quality bands fo...,2016-01-16,False,These are hands down the best quality bands f...,5.0
1,1,2,High Quality Bands,2016-01-22,False,I just got this set yesterday as well as a se...,5.0
2,1,3,Five Stars,2015-12-27,False,My husband uses these and finds them to be go...,5.0
3,1,4,The resistance is great. I would agree that th...,2016-01-13,False,I got these for Christmas and have been using...,4.0
4,1,5,Good quality product,2016-01-20,False,Haven\t had it long enough to use all of the ...,5.0


In [None]:
# Splitting the reviews into negative and positive subsets
negative_reviews = df[df['ReviewRating'].isin([1.0, 2.0, 3.0])]['ReviewContent'].dropna().tolist()
positive_reviews = df[df['ReviewRating'].isin([4.0, 5.0])]['ReviewContent'].dropna().tolist()

len(negative_reviews), len(positive_reviews)


(275, 1763)

We have:

275 negative reviews
1763 positive reviews
Next, we'll tokenize the reviews. We'll transform the text data into sequences of tokens. For this, we'll use the Tokenizer from Keras, which helps in converting text data into numerical format suitable for modeling.

Let's proceed with the tokenization.​

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Downloading punkt tokenizer and stopwords
nltk.download('punkt')
nltk.download('stopwords')

# Tokenization
negative_tokens = [word_tokenize(review.lower()) for review in negative_reviews]
positive_tokens = [word_tokenize(review.lower()) for review in positive_reviews]

# Optional: Removing Stop Words
stop_words = set(stopwords.words('english'))
negative_tokens = [[word for word in review if word not in stop_words] for review in negative_tokens]
positive_tokens = [[word for word in review if word not in stop_words] for review in positive_tokens]

# Optional: Stemming
stemmer = PorterStemmer()
negative_tokens = [[stemmer.stem(word) for word in review] for review in negative_tokens]
positive_tokens = [[stemmer.stem(word) for word in review] for review in positive_tokens]

# Building a vocabulary
vocab = set(word for review in negative_tokens + positive_tokens for word in review)
vocab_size = len(vocab)

vocab_size


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\agash\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\agash\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


3999

In [None]:
# Create sequences and targets

sequence_length = 10  # We'll use the last N words...
step = 1  # ...and sample a new sequence every M words

def create_sequences(tokens):
    sequences = []
    targets = []
    for i in range(0, len(tokens) - sequence_length, step):
        sequences.append(tokens[i:i+sequence_length])
        targets.append(tokens[i+sequence_length])
    return sequences, targets

# Flattening the token lists and creating sequences and targets
all_negative_tokens = [token for review in negative_tokens for token in review]
all_positive_tokens = [token for review in positive_tokens for token in review]

negative_sequences, negative_targets = create_sequences(all_negative_tokens)
positive_sequences, positive_targets = create_sequences(all_positive_tokens)

# Checking the first few sequences and targets
negative_sequences[:5], negative_targets[:5]


([['tore',
   'littl',
   'year',
   ',',
   'i\\m',
   'strong',
   '...',
   'poor',
   'qualiti',
   ','],
  ['littl',
   'year',
   ',',
   'i\\m',
   'strong',
   '...',
   'poor',
   'qualiti',
   ',',
   'also'],
  ['year',
   ',',
   'i\\m',
   'strong',
   '...',
   'poor',
   'qualiti',
   ',',
   'also',
   'one'],
  [',',
   'i\\m',
   'strong',
   '...',
   'poor',
   'qualiti',
   ',',
   'also',
   'one',
   'handl'],
  ['i\\m',
   'strong',
   '...',
   'poor',
   'qualiti',
   ',',
   'also',
   'one',
   'handl',
   'got']],
 ['also', 'one', 'handl', 'got', 'bent'])

We've created sequences of 10 tokens each, with the target being the next token in the review. For example, given the sequence ['They', 'tore', 'up', 'after', 'a', 'little', 'over', 'a', 'year,', 'and'], the target is ['I\\m'].

Next, we need to convert these tokens into numerical values (token IDs) so they can be fed into the neural network. We'll also split the data into training and validation sets to evaluate the performance of our models.​

In [None]:
from sklearn.model_selection import train_test_split

# Convert tokens to their respective IDs
def tokens_to_ids(sequences, targets, vocab):
    sequence_ids = [[vocab[word] if word in vocab else vocab["<OOV>"] for word in sequence] for sequence in sequences]
    target_ids = [vocab[word] if word in vocab else vocab["<OOV>"] for word in targets]
    return sequence_ids, target_ids

# Create a vocabulary dictionary
vocab_dict = {word: i for i, word in enumerate(vocab)}
vocab_dict["<OOV>"] = len(vocab_dict)

# Convert tokens to IDs
negative_sequence_ids, negative_target_ids = tokens_to_ids(negative_sequences, negative_targets, vocab_dict)
positive_sequence_ids, positive_target_ids = tokens_to_ids(positive_sequences, positive_targets, vocab_dict)

# Splitting data into training and validation sets
neg_X_train, neg_X_val, neg_y_train, neg_y_val = train_test_split(negative_sequence_ids, negative_target_ids, test_size=0.2, random_state=42)
pos_X_train, pos_X_val, pos_y_train, pos_y_val = train_test_split(positive_sequence_ids, positive_target_ids, test_size=0.2, random_state=42)

len(neg_X_train), len(neg_X_val), len(pos_X_train), len(pos_X_val)


(5656, 1414, 24953, 6239)

Successfully converted the tokens into their respective IDs and split the data into training and validation sets. Here's the breakdown:

Negative reviews:
Training: 8,989 samples
Validation: 2,248 samples
Positive reviews:
Training: 37,972 samples
Validation: 9,494 samples
Building  LSTM model for text generation. After training the LSTM model, we'll move on to the GRU model.

## LSTM Model:


Data Preparation:
Converting sequences and targets into tensors suitable for training.

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# Convert sequences and targets to torch tensors
train_data = TensorDataset(torch.tensor(neg_X_train + pos_X_train), torch.tensor(neg_y_train + pos_y_train))
val_data = TensorDataset(torch.tensor(neg_X_val + pos_X_val), torch.tensor(neg_y_val + pos_y_val))

# Create data loaders
train_loader = DataLoader(train_data, shuffle=True, batch_size=32)
val_loader = DataLoader(val_data, batch_size=32)


Installing torchvision

In [None]:
pip install torch torchvision

Model Definition:
Define the LSTM model using PyTorch.
python


In [None]:
import torch.nn as nn

class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        predictions = self.fc(lstm_out[:, -1, :])
        return predictions

# Model parameters
embedding_dim = 256
hidden_dim = 512
output_dim = vocab_size

# Create the LSTM model instance
model = LSTMModel(vocab_size, embedding_dim, hidden_dim, output_dim)


Training:
Train the LSTM model on  sequences.

In [None]:
from torch.optim import Adam
from torch.nn import CrossEntropyLoss

# Loss and optimizer
criterion = CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    for batch_seq, batch_target in train_loader:
        # Forward pass
        outputs = model(batch_seq)
        loss = criterion(outputs, batch_target)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}")


Epoch [1/10], Loss: 6.3629
Epoch [2/10], Loss: 5.9780
Epoch [3/10], Loss: 5.9912
Epoch [4/10], Loss: 4.7349
Epoch [5/10], Loss: 5.1147
Epoch [6/10], Loss: 4.1417
Epoch [7/10], Loss: 3.2236
Epoch [8/10], Loss: 2.4758
Epoch [9/10], Loss: 1.3389
Epoch [10/10], Loss: 0.3507


Evaluation:
Evaluating the model on the validation set.

In [None]:
# Evaluation on validation set
model.eval()
with torch.no_grad():
    total_loss = 0
    for batch_seq, batch_target in val_loader:
        outputs = model(batch_seq)
        loss = criterion(outputs, batch_target)
        total_loss += loss.item()

    avg_loss = total_loss / len(val_loader)
    print(f"Validation Loss: {avg_loss:.4f}")


Validation Loss: 9.5330


Text Generation:
To generate text, seed the model with a starting sequence, predict the next word, append it to the sequence, and repeat.

We'll structure the text generation in such a way that:


For a negative review, we seed the model with a sequence from a negative review.

For a positive review, we seed the model with a sequence from a positive review.

The idea is that by seeding with a particular sentiment (positive or negative), the model is more likely to continue generating text in that sentiment.

In [None]:
def generate_review_text(model, seed_sequence, generation_length=50):
    generated_text = seed_sequence.copy()

    for _ in range(generation_length):
        # Convert seed sequence to tensor
        seed_tensor = torch.tensor(seed_sequence).unsqueeze(0)

        # Get model predictions
        with torch.no_grad():
            predictions = model(seed_tensor)

        print(f"Shape of predictions[0]: {predictions[0].shape}")  # Diagnostic print

        # Try catching the dimension error and print the shape for diagnosis
        try:
            predicted_token_id = torch.argmax(predictions[0], dim=1).item()
        except IndexError:
            print("Dimension error encountered. Printing shape for diagnosis.")
            print(predictions[0].shape)
            return

        # Append the predicted token ID to the generated text and seed sequence
        generated_text.append(predicted_token_id)
        seed_sequence = generated_text[-sequence_length:]

    # Convert token IDs back to words
    generated_review = [vocab_dict[token_id] for token_id in generated_text]

    return ' '.join(generated_review)

# Example Usage:
negative_seed = neg_X_train[0]  # or any other negative sequence
generated_negative_review = generate_review_text(model, negative_seed)
print("Generated Negative Review:", generated_negative_review)


Shape of predictions[0]: torch.Size([8833])
Dimension error encountered. Printing shape for diagnosis.
torch.Size([8833])
Generated Negative Review: None


In [None]:
def generate_review_text(model, seed_sequence, generation_length=50):
    generated_text = seed_sequence.copy()

    for _ in range(generation_length):
        # Convert seed sequence to tensor
        seed_tensor = torch.tensor(seed_sequence).unsqueeze(0)

        # Get model predictions
        with torch.no_grad():
            predictions = model(seed_tensor)

        # Use dim=0 since predictions[0] is a 1D tensor
        predicted_token_id = torch.argmax(predictions[0], dim=0).item()

        # Append the predicted token ID to the generated text and seed sequence
        generated_text.append(predicted_token_id)
        seed_sequence = generated_text[-sequence_length:]

    # Convert token IDs back to words, using '<UNK>' for unknown tokens
    generated_review = [vocab_dict.get(token_id, '<UNK>') for token_id in generated_text]

    return ' '.join(generated_review)

# Example Usage:
negative_seed = neg_X_train[0]  # or any other negative sequence
generated_negative_review = generate_review_text(model, negative_seed)
print("Generated Negative Review:", generated_negative_review)

positive_seed = pos_X_train[0]  # or any other positive sequence
generated_positive_review = generate_review_text(model, positive_seed)
print("Generated Positive Review:", generated_positive_review)


Generated Negative Review: <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK>
Generated Positive Review: <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK>


## GRU MODEL

1. Model Definition:

In [None]:
import torch.nn as nn

class GRUModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(GRUModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_dim, num_layers=2, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        gru_out, _ = self.gru(embedded)
        predictions = self.fc(gru_out[:, -1, :])
        return predictions

# Model parameters
embedding_dim = 256
hidden_dim = 512
output_dim = vocab_size

# Create the GRU model instance
model = GRUModel(vocab_size, embedding_dim, hidden_dim, output_dim)


2. Training:

In [None]:
from torch.optim import Adam
from torch.nn import CrossEntropyLoss

# Loss and optimizer
criterion = CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    for batch_seq, batch_target in train_loader:
        # Forward pass
        outputs = model(batch_seq)
        loss = criterion(outputs, batch_target)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}")


Epoch [1/10], Loss: 5.5932
Epoch [2/10], Loss: 5.4356
Epoch [3/10], Loss: 4.3285
Epoch [4/10], Loss: 2.9671
Epoch [5/10], Loss: 1.1337
Epoch [6/10], Loss: 0.1608
Epoch [7/10], Loss: 0.0367
Epoch [8/10], Loss: 0.0103
Epoch [9/10], Loss: 0.0095
Epoch [10/10], Loss: 2.3334


3. Evaluation:

In [None]:
# Evaluation on validation set
model.eval()
with torch.no_grad():
    total_loss = 0
    for batch_seq, batch_target in val_loader:
        outputs = model(batch_seq)
        loss = criterion(outputs, batch_target)
        total_loss += loss.item()

    avg_loss = total_loss / len(val_loader)
    print(f"Validation Loss: {avg_loss:.4f}")


Validation Loss: 8.9342


4. Text Generation

In [None]:
def generate_review_text_gru(model, seed_sequence, generation_length=50):
    generated_text = seed_sequence.copy()

    for _ in range(generation_length):
        # Convert seed sequence to tensor
        seed_tensor = torch.tensor(seed_sequence).unsqueeze(0)

        # Get model predictions
        with torch.no_grad():
            predictions = model(seed_tensor)

        # Get the token ID with the highest prediction probability
        predicted_token_id = torch.argmax(predictions[0], dim=0).item()

        # Append the predicted token ID to the generated text and seed sequence
        generated_text.append(predicted_token_id)
        seed_sequence = generated_text[-sequence_length:]

    # Convert token IDs back to words
    generated_review = [vocab_dict[token_id] for token_id in generated_text]

    return ' '.join(generated_review)


# Seed the model with a sequence from a negative review
negative_seed_gru = neg_X_train[0]  # or any other negative sequence
generated_negative_review_gru = generate_review_text_gru(model, negative_seed_gru)  # Using 'model' instead of 'gru_model'
print("Generated Negative Review (GRU):", generated_negative_review_gru)

# Seed the model with a sequence from a positive review
positive_seed_gru = pos_X_train[0]  # or any other positive sequence
generated_positive_review_gru = generate_review_text_gru(model, positive_seed_gru)  # Using 'model' instead of 'gru_model'
print("Generated Positive Review (GRU):", generated_positive_review_gru)



KeyError: 960

In [None]:
def generate_review_text_v2(model, seed_sequence, generation_length=50):
    generated_text = list(seed_sequence)  # Using list() to ensure a copy is made

    model.eval()  # Set the model to evaluation mode

    for _ in range(generation_length):
        # Convert seed sequence to tensor
        seed_tensor = torch.tensor(seed_sequence, dtype=torch.long).unsqueeze(0)

        # Get model predictions
        with torch.no_grad():
            predictions = model(seed_tensor)

        # Use dim=0 since predictions[0] is a 1D tensor
        predicted_token_id = torch.argmax(predictions[0], dim=0).item()

        # Append the predicted token ID to the generated text
        generated_text.append(predicted_token_id)

        # Update the seed sequence for the next iteration
        seed_sequence = generated_text[-sequence_length:]

    # Convert token IDs back to words, using '<UNK>' for unknown tokens
    generated_review = [vocab_dict.get(token_id, '<UNK>') for token_id in generated_text]

    return ' '.join(generated_review)

# Example Usage:
negative_seed = neg_X_train[0]  # or any other negative sequence
generated_negative_review = generate_review_text_v2(model, negative_seed)
print("Generated Negative Review (v2):", generated_negative_review)

positive_seed = pos_X_train[0]  # or any other positive sequence
generated_positive_review = generate_review_text_v2(model, positive_seed)
print("Generated Positive Review (v2):", generated_positive_review)


Generated Negative Review (v2): <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK>
Generated Positive Review (v2): <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK>
