In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.nn.utils.rnn import pad_sequence
from gensim.models import Word2Vec


In [10]:
def preprocess_data(data_path):
    """Load the review CSV and split it into training and validation sets.

    The ``content`` column is used as the review text and the ``score``
    column as the target. Scores are integer-encoded with a
    ``LabelEncoder`` and the data is split 80/20 with a fixed seed.

    Args:
        data_path: Location of the CSV file (anything ``pd.read_csv``
            accepts). The file must contain ``content`` and ``score`` columns.

    Returns:
        A tuple ``(train_reviews, val_reviews, train_labels, val_labels,
        label_encoder)``. The first four are arrays produced by
        ``train_test_split``; ``label_encoder`` is the fitted encoder, so
        predictions can be mapped back with ``inverse_transform``.

    Raises:
        KeyError: If ``content`` or ``score`` is missing from the file.
    """
    df = pd.read_csv(data_path)

    # Only the two columns consumed below decide whether a row is usable.
    # A blanket dropna() would also discard rows whose *other* columns are
    # empty, silently shrinking the dataset.
    df = df.dropna(subset=['content', 'score'])

    reviews = df['content'].values
    labels = df['score'].values

    # Encode the labels as consecutive integers starting at 0
    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(labels)

    # Split the data into training and validation sets (fixed seed for reproducibility)
    train_reviews, val_reviews, train_labels, val_labels = train_test_split(
        reviews, labels, test_size=0.2, random_state=42
    )

    return train_reviews, val_reviews, train_labels, val_labels, label_encoder


In [11]:
class LSTMModel(nn.Module):
    """Single-layer LSTM classifier over batches of token-id sequences.

    Token ids are embedded, run through an LSTM, and the final hidden state
    is projected to ``output_dim`` logits.

    Args:
        vocab_size: Number of rows in the embedding table; every input id
            must be smaller than this.
        embedding_dim: Size of each embedding vector (the LSTM input size).
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of logits produced per sequence.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # batch_first=True: a DataLoader stacks samples along dim 0, so the
        # input is (batch, seq_len). With the default (seq-first) layout the
        # LSTM would iterate over the batch axis as if it were time and
        # return one hidden state per token position instead of per sample.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return logits of shape ``(batch, output_dim)`` for ids ``x`` of shape ``(batch, seq_len)``."""
        embedded = self.embedding(x)
        _, (hidden, _) = self.lstm(embedded)
        # hidden is (num_layers, batch, hidden_dim); take the last layer's state.
        return self.fc(hidden[-1])

In [12]:
class AppReviewsDataset(Dataset):
    """Dataset pairing each tokenized, fixed-length review with its label.

    Args:
        reviews: Indexable collection of raw reviews.
        labels: Indexable collection of labels aligned with ``reviews``.
        tokenizer: Callable applied to a single review. It may return a
            list-like of values (e.g. token ids) or a ``torch.Tensor`` whose
            first dimension is the token axis.
        max_length: Length every returned sequence is padded or truncated to.
    """

    def __init__(self, reviews, labels, tokenizer, max_length):
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    def __getitem__(self, idx):
        """Return ``(padded_review, label)`` for the sample at ``idx``."""
        encoded_review = self.tokenizer(self.reviews[idx])
        return self.pad_sequence(encoded_review), self.labels[idx]

    def pad_sequence(self, sequence):
        """Pad with zeros, or truncate, ``sequence`` to exactly ``max_length`` items.

        The input is never modified; a new tensor is always returned.

        Args:
            sequence: List-like of values, or a tensor with tokens along dim 0.

        Returns:
            A tensor whose first dimension has size ``max_length``. For tensor
            input the trailing dimensions and dtype are preserved.
        """
        if isinstance(sequence, torch.Tensor):
            clipped = sequence[:self.max_length]
            missing = self.max_length - clipped.shape[0]
            # Zero rows with the same trailing shape/dtype/device as the input.
            padding = clipped.new_zeros((missing, *clipped.shape[1:]))
            return torch.cat([clipped, padding], dim=0)

        # Copy before extending so the tokenizer's own list is left untouched
        # (an in-place `+=` would leak the padding back into the caller's object).
        padded = list(sequence[:self.max_length])
        padded.extend([0] * (self.max_length - len(padded)))
        return torch.tensor(padded)


In [1]:
import gensim.downloader as api

# Load the pre-trained "word2vec-google-news-300" vectors through gensim's
# downloader API; `word2vec_model` is the lookup used by tokenize_review().
# NOTE(review): the first call presumably downloads a large archive and later
# calls reuse a local copy — confirm before running on a metered connection.
word2vec_model = api.load("word2vec-google-news-300")


def tokenize_review(review):
    """Convert a review into a tensor of pre-trained word vectors.

    The review is split on whitespace; tokens missing from
    ``word2vec_model`` are skipped.

    Args:
        review: Review text (a ``str``).

    Returns:
        A float tensor of shape ``(num_known_tokens, vector_size)``. When no
        token is known the result is an empty ``(0, vector_size)`` tensor, so
        callers can always rely on a 2-D shape.

    NOTE(review): this returns embedding vectors, while ``LSTMModel`` in this
    notebook starts with an ``nn.Embedding`` layer, which expects integer
    token ids — confirm which of the two representations is intended.
    """
    # api.load() returns the vectors object itself rather than a full
    # Word2Vec model, so `.wv` may not exist (recent gensim versions dropped
    # it from KeyedVectors). Fall back to the object when there is no `.wv`.
    vectors = getattr(word2vec_model, 'wv', word2vec_model)

    # Split the review into words
    tokens = review.split()

    # One tensor per known token; stacking tensors avoids the slow
    # list-of-ndarrays path in torch.tensor().
    embeddings = [torch.tensor(vectors[token]) for token in tokens if token in vectors]
    if not embeddings:
        return torch.empty((0, vectors.vector_size))
    return torch.stack(embeddings)
    

In [None]:
vocab_size = 10000  # rows in the model's nn.Embedding table (input ids must be below this)
embedding_dim = 100  # embedding vector size, which is also the LSTM input size
hidden_dim = 128  # LSTM hidden state size
output_dim = 5  # logits per review; presumably one per distinct score value — TODO confirm against the data
batch_size = 32  # samples per DataLoader batch
learning_rate = 0.001  # Adam step size
num_epochs = 10  # full passes over the training set
max_length = 100  # every tokenized review is padded/truncated to this many tokens

In [None]:
# Read 'review_data.csv' and obtain the train/validation reviews and encoded
# labels, plus the fitted label encoder.
train_reviews, val_reviews, train_labels, val_labels, label_encoder = preprocess_data('review_data.csv')

In [3]:
# Callable that turns one raw review into model input
tokenizer = tokenize_review

# Training split: dataset plus a loader that shuffles the samples
train_dataset = AppReviewsDataset(
    reviews=train_reviews,
    labels=train_labels,
    tokenizer=tokenizer,
    max_length=max_length,
)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Validation split: same preprocessing, samples served in a fixed order
val_dataset = AppReviewsDataset(
    reviews=val_reviews,
    labels=val_labels,
    tokenizer=tokenizer,
    max_length=max_length,
)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


In [15]:
# Run on CUDA when PyTorch reports a usable GPU, otherwise on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [16]:
# Build the classifier from the configured sizes, then move it to the chosen device
model = LSTMModel(
    vocab_size,
    embedding_dim,
    hidden_dim,
    output_dim,
)
model = model.to(device)

# Cross-entropy objective, optimised with Adam over all model parameters
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

In [6]:
def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader`` and return ``(mean_loss, accuracy)``.

    Args:
        model: Module mapping a batch of inputs to per-class logits.
        loader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        device: Device the batches are moved to before the forward pass.
        optimizer: When given, the model is put in training mode and updated
            after every batch. When ``None`` the model is put in eval mode
            and gradient tracking is disabled.

    Returns:
        ``mean_loss`` is the average of the per-batch losses and ``accuracy``
        is the fraction of correctly classified samples. Both are 0 for an
        empty loader.
    """
    training = optimizer is not None
    model.train(training)

    loss_sum = 0.0
    correct = 0
    samples = 0

    with torch.set_grad_enabled(training):
        for inputs, labels in loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Backward and optimize (training only)
            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # Predicted class = index of the largest logit
            predicted = outputs.argmax(dim=1)
            samples += labels.size(0)
            correct += (predicted == labels).sum().item()
            loss_sum += loss.item()

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    return loss_sum / max(len(loader), 1), correct / max(samples, 1)


for epoch in range(num_epochs):
    # One optimisation pass over the training data, then a gradient-free
    # evaluation pass over the validation data.
    train_loss, train_accuracy = _run_epoch(model, train_loader, criterion, device, optimizer)
    val_loss, val_accuracy = _run_epoch(model, val_loader, criterion, device)

    print(f'Epoch [{epoch+1}/{num_epochs}], '
          f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, '
          f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')