In [117]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import numpy as np
import nltk
from transformers import BertTokenizer, BertModel
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/alexy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [118]:
# Sample data
texts = ["This is a positive sentence.", "Negative sentiment here.", "Another positive example.", "Not a good experience."]
labels = [1, 0, 1, 0]  # 1 for positive, 0 for negative
texts_tokenized = []
for text in texts:
    tokens = nltk.word_tokenize(text)
    texts_tokenized.append(tokens)
print(texts_tokenized)
# sentences = [' '.join(tokens) for tokens in texts_tokenized]
# print(sentences)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
encoded_inputs = tokenizer(texts_tokenized, return_tensors='pt', padding=True, truncation=True)
print(encoded_inputs)


# Forward pass to get BERT embeddings
with torch.no_grad():
    outputs = model(**encoded_inputs)

# Extract the embeddings for the [CLS] token (representing the entire sequence)
bert_embeddings = outputs.last_hidden_state[:, 0, :].numpy()
print(bert_embeddings)



# # Split the data into training and testing sets
# texts_train, texts_test, labels_train, labels_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# # Tokenize and vectorize the text data
# vectorizer = CountVectorizer()
# X_train = vectorizer.fit_transform(texts_tokenized).toarray()
# X_test = vectorizer.transform(texts_test).toarray()

[['This', 'is', 'a', 'positive', 'sentence', '.'], ['Negative', 'sentiment', 'here', '.'], ['Another', 'positive', 'example', '.'], ['Not', 'a', 'good', 'experience', '.']]


ValueError: too many values to unpack (expected 2)

In [None]:
# LSTM model
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        _, (h_n, _) = self.lstm(x)
        x = self.fc(h_n[-1, :, :])
        x = self.sigmoid(x)
        return x

# Optionally, you can use TF-IDF to represent the BERT embeddings
# vectorizer = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS)
# tfidf_embeddings = vectorizer.fit_transform(sentences).toarray()

# Combine BERT embeddings and TF-IDF embeddings
combined_embeddings = np.concatenate([bert_embeddings], axis=1)

# Convert to PyTorch tensor
combined_embeddings = torch.tensor(combined_embeddings, dtype=torch.float32)
print(combined_embeddings)

# Instantiate the LSTM model
input_size = combined_embeddings.shape[1]
hidden_size = 64
output_size = 1
lstm_model = LSTMModel(input_size, hidden_size, output_size)

# Forward pass through the LSTM model
with torch.no_grad():
    lstm_outputs = lstm_model(combined_embeddings.unsqueeze(0))  # Unsqueezing to add batch dimension

print("LSTM Model Output:")
print(lstm_outputs.item())  # Print the predicted output

tensor([[-0.1128, -0.2216, -0.1149,  ..., -0.4784,  0.1028,  0.8216],
        [-0.0950,  0.0054, -0.5397,  ..., -0.1823,  0.4932,  0.3447],
        [-0.0960, -0.2762, -0.3418,  ..., -0.1201,  0.4011,  0.5749],
        [-0.3945, -0.1763, -0.2095,  ..., -0.3725,  0.2388,  0.4599]])
LSTM Model Output:
0.4680742025375366


In [None]:
texts_train


['Not a good experience.',
 'This is a positive sentence.',
 'Another positive example.']

In [None]:
X_train

tensor([[0, 0, 1, 1, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 1, 1, 1],
        [1, 1, 0, 0, 0, 0, 1, 0, 0]])

In [None]:
# Convert to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.long)
y_train = torch.tensor(labels_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.long)
y_test = torch.tensor(labels_test, dtype=torch.float32)

  X_train = torch.tensor(X_train, dtype=torch.long)
  X_test = torch.tensor(X_test, dtype=torch.long)


In [None]:
X_train

tensor([[0, 0, 1, 1, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 1, 1, 1],
        [1, 1, 0, 0, 0, 0, 1, 0, 0]])

In [None]:
y_train

tensor([0., 1., 1.])

In [None]:
import torch
import torch.nn as nn

# Assume a vocabulary size of 10 and embedding dimension of 16
embedding_layer = nn.EmbeddingBag(num_embeddings=10, embedding_dim=16, sparse=True)

# Input indices for three sequences
indices = torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

# Input offsets indicating the start index of each sequence
offsets = torch.tensor([0, 3, 7, 10])

# Forward pass through nn.EmbeddingBag
output = embedding_layer(indices, offsets)
print(output)

tensor([[ 0.0342, -0.2594,  0.1962, -0.3074,  0.2669,  0.7561, -0.5733,  0.1650,
         -0.0077, -0.3989,  1.0017, -0.1417,  0.0359,  0.3365, -0.5682, -0.4657],
        [ 0.4523, -0.2164, -0.8530, -0.0618, -0.3579, -0.6868, -0.4623,  0.8977,
          0.6311, -0.1142,  0.8225,  0.5853, -0.1996, -1.1757, -0.7082,  0.2216],
        [-0.2351, -1.0484,  0.9080,  0.2761, -0.3338,  0.5125, -0.2036,  0.7210,
         -0.5958,  0.6286,  0.5356,  0.4384, -0.6335, -0.1539,  0.3250, -0.5446],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000]],
       grad_fn=<EmbeddingBagBackward0>)


In [None]:
# Define the LSTM model
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMModel, self).__init__()
        self.embedding = nn.EmbeddingBag(input_size, embedding_dim=16, sparse=True)
        self.lstm = nn.LSTM(16, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        print(x)
        x = self.embedding(x)
        print(x)
        x, _ = self.lstm(x)
        x = x[:, -1, :]  # Take the output from the last time step
        x = self.fc(x)
        x = self.sigmoid(x)
        return x

In [None]:
# Instantiate the model, define the loss function and optimizer
input_size = X_train[0].shape[0]
model = LSTMModel(input_size=len(vectorizer.get_feature_names_out()), hidden_size=64, output_size=1)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

NotFittedError: Vocabulary not fitted or provided

In [None]:
# Training loop
epochs = 10
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(X_train)
    loss = criterion(outputs.squeeze(), y_train)
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 1 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')


tensor([[0, 0, 1, 1, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 1, 1, 1],
        [1, 1, 0, 0, 0, 0, 1, 0, 0]])
tensor([[ 0.1194, -1.1045,  0.0045, -0.4782,  0.0090,  0.2128, -0.0097, -0.0811,
          0.1878,  0.7453, -0.7448,  0.4030,  0.4179,  0.2216, -1.6911, -0.6077],
        [ 0.2047, -0.9988, -0.0239, -0.2940, -0.0648,  0.2179,  0.2170,  0.0626,
          0.1829,  0.6213, -0.8103,  0.3868,  0.2803,  0.2267, -1.5653, -0.4719],
        [ 0.1194, -1.1045,  0.0045, -0.4782,  0.0090,  0.2128, -0.0097, -0.0811,
          0.1878,  0.7453, -0.7448,  0.4030,  0.4179,  0.2216, -1.6911, -0.6077]],
       grad_fn=<EmbeddingBagBackward0>)


IndexError: too many indices for tensor of dimension 2

In [None]:
# Evaluation
with torch.no_grad():
    model.eval()
    predictions = model(X_test).squeeze().numpy()
    binary_predictions = (predictions > 0.5).astype(int)
    accuracy = accuracy_score(labels_test, binary_predictions)
    print(f'Test Accuracy: {accuracy:.4f}')