In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'amazon-fine-food-reviews:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F18%2F2157%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240721%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240721T093332Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Da776bb3997a3f5675f0bb0625bb3fe0b0ea9d892791a37cb0f6965e2e69294d54872a5bf238e8da961c25bb342032de647d651ee25a946854738e7088eba719292dfa661f8f0175fccfe10d539968f57bc133d79690e4d2e7b6cb693975e0df2d923fcff17c435dd1be175b143433b3338b8fa1cf269742922de2f4b3cd097fbccb66cb8ec4e7fe521fb2e2f7d577a5d33b254a464313be6a2f572d5978b3096d0f5316dfc2c83515fdf3f36f94c5ca5ed5be56bcc429642137fb260af976f9b63d2b007598eb0497acc83d0d8c55b358077cb2b97fb6da522c08c4081258f21c2dc0762d9a7bb1ac8f0046782c348f4655af4694e366d5f9c8ea5ddfca661e3'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading amazon-fine-food-reviews, 253873708 bytes compressed
Downloaded and uncompressed: amazon-fine-food-reviews
Data source import complete.


In [2]:
import pandas as pd
import re
import nltk
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [3]:
df=pd.read_csv("/kaggle/input/amazon-fine-food-reviews/Reviews.csv")
df

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...
...,...,...,...,...,...,...,...,...,...,...
568449,568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,5,1299628800,Will not do without,Great for sesame chicken..this is a good if no...
568450,568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,2,1331251200,disappointed,I'm disappointed with the flavor. The chocolat...
568451,568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,5,1329782400,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o..."
568452,568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,5,1331596800,Favorite Training and reward treat,These are the BEST treats for training and rew...


In [4]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
df = df.sample(frac=0.2, random_state=42)
df

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
323275,323276,B000OQ2DL4,AZGVG04V4ROGS,Tamatha Williams,4,4,5,1250553600,Calm magnesium suppliment,"An outstanding product, excellent value and th..."
456796,456797,B0009ETA6W,A23Z8OUW18498V,Angela T,3,3,1,1332892800,Didn't Work,"Too bad, they didn't work for me; I took 3 pil..."
126739,126740,B003IHO8LE,A3FH37O1JWFKR2,Quang Ngo,2,3,4,1314921600,"Didn't know exactly what to expect, turned out...",The first reason why I had even purchased this...
40817,40818,B001IZEJ76,A16BCJPTH1VSMU,pdolan477208,0,0,4,1344556800,Tasty and Fast,Product tastes great and is in good shape. Onl...
309305,309306,B001IA3RV0,A288XUYT5OXPBQ,"Omar Diab ""Sebastian""",0,0,5,1290556800,Awesome jerky!,"This jerky is just great. It was expensive, b..."
...,...,...,...,...,...,...,...,...,...,...
442919,442920,B0006345PW,A3IE3TQ79W0YOR,Ryan Johnson,9,13,1,1280793600,Please avoid!!,Please do your research on pet foods and what ...
410291,410292,B000084EZ4,A308U387DKC1LR,Harley Miller,2,2,5,1297296000,Great!,Switched over my British Shorthair over to Wel...
309092,309093,B000I6NO0Y,A11E1CWGQKJ7QE,J. Cole,1,1,5,1337644800,Best Tea Ever!,This is the best tea ever. It is a smaller py...
437903,437904,B000YPIL2K,A329U9VZ7EH9SH,OnlineShopperFan,2,2,5,1264896000,Good grits!,Had this delivered to me in 2 days flat with S...


In [9]:
def preprocess_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.lower()  # Convert to lowercase
    words = text.split()  # Tokenize the text
    words = [stemmer.stem(word) for word in words if word not in stop_words]  # Remove stop words and stem
    return ' '.join(words)


In [10]:
df['Text'] = df['Text'].apply(preprocess_text)

In [11]:
df['Score'] = df['Score'].apply(lambda x: 1 if x > 3 else 0)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(df['Text'], df['Score'], test_size=0.2, random_state=42)

In [13]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        inputs = self.tokenizer(text, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length)
        inputs = {key: val.squeeze(0) for key, val in inputs.items()}
        return inputs, torch.tensor(label, dtype=torch.float)

In [14]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [15]:
max_length = 100  # Maximum length of the sequences
train_dataset = TextDataset(X_train.tolist(), y_train.tolist(), tokenizer, max_length)
test_dataset = TextDataset(X_test.tolist(), y_test.tolist(), tokenizer, max_length)

In [16]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=32, num_workers=4)



In [17]:
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, n_layers, dropout):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(input_dim, hidden_dim)
        self.lstm = nn.LSTM(hidden_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        embedded = self.dropout(self.embedding(text))
        lstm_out, (hidden, cell) = self.lstm(embedded)
        output = self.fc(hidden[-1])
        return output

In [18]:
class GRUClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, n_layers, dropout):
        super(GRUClassifier, self).__init__()
        self.embedding = nn.Embedding(input_dim, hidden_dim)
        self.gru = nn.GRU(hidden_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        embedded = self.dropout(self.embedding(text))
        gru_out, hidden = self.gru(embedded)
        output = self.fc(hidden[-1])
        return output

In [19]:
def train_model(model, train_loader, criterion, optimizer, n_epochs):
    for epoch in range(n_epochs):
        model.train()
        total_loss = 0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs['input_ids'])
            # Reshape labels to match the output shape [batch_size, 1]
            labels = labels.view(-1, 1).float()
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}')

In [20]:
def evaluate_model(model, test_loader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs['input_ids'])
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.numpy())
            all_labels.extend(labels.numpy())
    print(classification_report(all_labels, all_preds))

In [21]:
input_dim = len(tokenizer)
hidden_dim = 128
output_dim = 1
n_layers = 2
dropout = 0.5
n_epochs = 5


lstm_model = LSTMClassifier(input_dim, hidden_dim, output_dim, n_layers, dropout)
gru_model = GRUClassifier(input_dim, hidden_dim, output_dim, n_layers, dropout)

In [22]:
criterion = nn.BCEWithLogitsLoss()
lstm_optimizer = optim.Adam(lstm_model.parameters())
gru_optimizer = optim.Adam(gru_model.parameters())

In [23]:
train_model(lstm_model, train_loader, criterion, lstm_optimizer, n_epochs)



Epoch 1, Loss: 0.529472876988312
Epoch 2, Loss: 0.5257448645174398
Epoch 3, Loss: 0.517093826000217
Epoch 4, Loss: 0.48921596616349444
Epoch 5, Loss: 0.5204812415035831


In [24]:
evaluate_model(lstm_model, test_loader)

              precision    recall  f1-score   support

         0.0       0.22      1.00      0.36       995
         1.0       0.00      0.00      0.00      3553

    accuracy                           0.22      4548
   macro avg       0.11      0.50      0.18      4548
weighted avg       0.05      0.22      0.08      4548



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
train_model(gru_model, train_loader, criterion, gru_optimizer, n_epochs)



Epoch 1, Loss: 0.5277196461161326
Epoch 2, Loss: 0.44283742925089986
Epoch 3, Loss: 0.3716870669426105
Epoch 4, Loss: 0.3358906517050807
Epoch 5, Loss: 0.3147575352439143


In [26]:
evaluate_model(gru_model, test_loader)

              precision    recall  f1-score   support

         0.0       0.22      1.00      0.36       995
         1.0       0.00      0.00      0.00      3553

    accuracy                           0.22      4548
   macro avg       0.11      0.50      0.18      4548
weighted avg       0.05      0.22      0.08      4548



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [37]:
sample_reviews = [
    "This product is fantastic! I love it.",
    "Terrible product. Would not recommend.",
    "It's okay, not the best but not the worst.",
    "Absolutely wonderful! Exceeded my expectations.",
    "I hated this item. Complete waste of money."
]

# Preprocess reviews
sample_reviews_cleaned = [preprocess_text(review) for review in sample_reviews]

# Create dataset and dataloader
sample_dataset = TextDataset(sample_reviews_cleaned, [0]*len(sample_reviews), tokenizer, max_length)
sample_loader = DataLoader(sample_dataset, batch_size=1)

# Define dummy models for demonstration
class DummyModel(torch.nn.Module):
    def __init__(self):
        super(DummyModel, self).__init__()
        self.linear = torch.nn.Linear(max_length, 2)

    def forward(self, input_ids):
        return self.linear(input_ids.float()) # Convert input_ids to float

lstm_model = DummyModel()
gru_model = DummyModel()


print("Sample Predictions (LSTM):")
lstm_model.eval()
with torch.no_grad():
    for i, (inputs, _) in enumerate(sample_loader):
        outputs = lstm_model(inputs['input_ids'])
        preds = torch.argmax(outputs, dim=1)
        review = sample_reviews[i]
        sentiment = preds.item()
        print(f"Review: {review}\nPredicted Sentiment: {'Positive' if sentiment == 1 else 'Negative'}\n")

print("Sample Predictions (GRU):")
gru_model.eval()
with torch.no_grad():
    for i, (inputs, _) in enumerate(sample_loader):
        outputs = gru_model(inputs['input_ids'])
        preds = torch.argmax(outputs, dim=1)
        review = sample_reviews[i]
        sentiment = preds.item()
        print(f"Review: {review}\nPredicted Sentiment: {'Positive' if sentiment == 1 else 'Negative'}\n")

Sample Predictions (LSTM):
Review: This product is fantastic! I love it.
Predicted Sentiment: Positive

Review: Terrible product. Would not recommend.
Predicted Sentiment: Positive

Review: It's okay, not the best but not the worst.
Predicted Sentiment: Positive

Review: Absolutely wonderful! Exceeded my expectations.
Predicted Sentiment: Positive

Review: I hated this item. Complete waste of money.
Predicted Sentiment: Positive

Sample Predictions (GRU):
Review: This product is fantastic! I love it.
Predicted Sentiment: Negative

Review: Terrible product. Would not recommend.
Predicted Sentiment: Negative

Review: It's okay, not the best but not the worst.
Predicted Sentiment: Negative

Review: Absolutely wonderful! Exceeded my expectations.
Predicted Sentiment: Negative

Review: I hated this item. Complete waste of money.
Predicted Sentiment: Negative

