In [1]:
# Our tiny dataset (imagine these are movie reviews)
raw_reviews = [
    "I love this movie!",
    "This film is great.",
    "The acting was terrible.",
    "I hate this film.",
    "Great movie!",
    "It was good.",
]

# Step 1: Simple Tokenization (just splitting by space and removing basic punctuation)
tokenized_reviews = []
for review in raw_reviews:
    # Convert to lowercase and remove common punctuation for simplicity
    clean_review = review.lower().replace("!", "").replace(".", "").replace(",", "")
    tokens = clean_review.split()
    tokenized_reviews.append(tokens)

print("Tokenized Reviews:")
for tokens in tokenized_reviews:
    print(tokens)

# Step 2: Vocabulary Creation
# We'll use a set to easily get unique words
vocabulary_set = set()
for tokens in tokenized_reviews:
    for token in tokens:
        vocabulary_set.add(token)

# Add special tokens
special_tokens = ["<UNK>", "<PAD>"]
for token in special_tokens:
    vocabulary_set.add(token)

# Convert the set to a sorted list to ensure consistent ordering (important for assigning indices)
vocabulary = sorted(list(vocabulary_set))

# Create a mapping from word to index (word_to_idx)
word_to_idx = {word: idx for idx, word in enumerate(vocabulary)}

# Create a mapping from index to word (idx_to_word) for reverse lookup
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

print("\nVocabulary:")
print(vocabulary)
print(f"Vocabulary size: {len(vocabulary)}")

print("\nWord to Index Mapping (first few):")
# Print first 5 items to keep it concise
for word, idx in list(word_to_idx.items())[:5]:
    print(f"'{word}': {idx}")

print("\nIndex to Word Mapping (first few):")
# Print first 5 items
for idx, word in list(idx_to_word.items())[:5]:
    print(f"{idx}: '{word}'")

Tokenized Reviews:
['i', 'love', 'this', 'movie']
['this', 'film', 'is', 'great']
['the', 'acting', 'was', 'terrible']
['i', 'hate', 'this', 'film']
['great', 'movie']
['it', 'was', 'good']

Vocabulary:
['<PAD>', '<UNK>', 'acting', 'film', 'good', 'great', 'hate', 'i', 'is', 'it', 'love', 'movie', 'terrible', 'the', 'this', 'was']
Vocabulary size: 16

Word to Index Mapping (first few):
'<PAD>': 0
'<UNK>': 1
'acting': 2
'film': 3
'good': 4

Index to Word Mapping (first few):
0: '<PAD>'
1: '<UNK>'
2: 'acting'
3: 'film'
4: 'good'


In [3]:
import torch
# Assuming 'tokenized_reviews', 'word_to_idx', and 'idx_to_word' from your previous code

# Define a maximum sequence length (choose something reasonable for your data)
# For our very short sentences, let's pick 5.
max_sequence_length = 5
pad_idx = word_to_idx["<PAD>"] # Get the index for our padding token

numericalized_reviews = []
for tokens in tokenized_reviews:
    # 1. Numericalize: Convert words to indices
    # Use word_to_idx.get(token, word_to_idx["<UNK>"]) to handle potential unknown words
    numericalized_tokens = [word_to_idx.get(token, word_to_idx["<UNK>"]) for token in tokens]

    # 2. Pad or Truncate: Make all sequences max_sequence_length
    if len(numericalized_tokens) < max_sequence_length:
        # Pad: Add pad_idx until it reaches max_sequence_length
        padded_tokens = numericalized_tokens + [pad_idx] * (max_sequence_length - len(numericalized_tokens))
    else:
        # Truncate: If a review is longer than max_sequence_length, cut it short
        padded_tokens = numericalized_tokens[:max_sequence_length]

    numericalized_reviews.append(padded_tokens)

print("\nNumericalized and Padded Reviews:")
for review_indices in numericalized_reviews:
    print(review_indices)

# You can convert this to a PyTorch tensor now
numericalized_reviews_tensor = torch.tensor(numericalized_reviews, dtype=torch.long)
print("\nNumericalized Reviews as PyTorch Tensor:")
print(numericalized_reviews_tensor)
print("Tensor shape:", numericalized_reviews_tensor.shape)

# Let's also define some dummy labels for our reviews
# 1 for positive, 0 for negative
# Matches raw_reviews: "I love this movie!", "This film is great.", "The acting was terrible.", "I hate this film.", "Great movie!", "It was good.",
dummy_labels = torch.tensor([1, 1, 0, 0, 1, 1], dtype=torch.float).unsqueeze(1) # unsqueeze(1) makes it a column vector
print("\nDummy Labels Tensor:")
print(dummy_labels)
print("Labels shape:", dummy_labels.shape)


Numericalized and Padded Reviews:
[7, 10, 14, 11, 0]
[14, 3, 8, 5, 0]
[13, 2, 15, 12, 0]
[7, 6, 14, 3, 0]
[5, 11, 0, 0, 0]
[9, 15, 4, 0, 0]

Numericalized Reviews as PyTorch Tensor:
tensor([[ 7, 10, 14, 11,  0],
        [14,  3,  8,  5,  0],
        [13,  2, 15, 12,  0],
        [ 7,  6, 14,  3,  0],
        [ 5, 11,  0,  0,  0],
        [ 9, 15,  4,  0,  0]])
Tensor shape: torch.Size([6, 5])

Dummy Labels Tensor:
tensor([[1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [1.]])
Labels shape: torch.Size([6, 1])


In [4]:
import torch
from torch.utils.data import Dataset, DataLoader

# Assuming numericalized_reviews_tensor and dummy_labels are available from the previous cell's output

class TextSentimentDataset(Dataset):
    def __init__(self, data_tensor, labels_tensor):
        # Store the preprocessed data and labels
        self.data = data_tensor
        self.labels = labels_tensor

    def __len__(self):
        # Return the total number of samples
        return len(self.data)

    def __getitem__(self, idx):
        # Return a single data sample (numericalized text and its label)
        return self.data[idx], self.labels[idx]

# Instantiate our custom dataset
sentiment_dataset = TextSentimentDataset(numericalized_reviews_tensor, dummy_labels)
print(f"Total samples in sentiment dataset: {len(sentiment_dataset)}")

# Create a DataLoader
# We'll use a small batch size for our tiny dataset
batch_size = 2 # Let's process 2 reviews at a time
shuffle = True # Important for training to randomize batches

sentiment_dataloader = DataLoader(sentiment_dataset, batch_size=batch_size, shuffle=shuffle)

print(f"\nIterating through DataLoader with batch_size={batch_size}:")
for i, (text_batch, label_batch) in enumerate(sentiment_dataloader):
    print(f"Batch {i+1}: Text batch shape {text_batch.shape}, Label batch shape {label_batch.shape}")
    print(f"Text batch content:\n{text_batch}")
    print(f"Label batch content:\n{label_batch}\n")

Total samples in sentiment dataset: 6

Iterating through DataLoader with batch_size=2:
Batch 1: Text batch shape torch.Size([2, 5]), Label batch shape torch.Size([2, 1])
Text batch content:
tensor([[14,  3,  8,  5,  0],
        [ 7, 10, 14, 11,  0]])
Label batch content:
tensor([[1.],
        [1.]])

Batch 2: Text batch shape torch.Size([2, 5]), Label batch shape torch.Size([2, 1])
Text batch content:
tensor([[ 9, 15,  4,  0,  0],
        [ 7,  6, 14,  3,  0]])
Label batch content:
tensor([[1.],
        [0.]])

Batch 3: Text batch shape torch.Size([2, 5]), Label batch shape torch.Size([2, 1])
Text batch content:
tensor([[13,  2, 15, 12,  0],
        [ 5, 11,  0,  0,  0]])
Label batch content:
tensor([[0.],
        [1.]])



In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# --- Re-define our Dataset (if you're running this in a new session or fresh notebook) ---
# If you ran the previous cells in this session, these variables should already be in memory.
raw_reviews = [
    "I love this movie!", "This film is great.", "The acting was terrible.",
    "I hate this film.", "Great movie!", "It was good."
]
tokenized_reviews = []
for review in raw_reviews:
    clean_review = review.lower().replace("!", "").replace(".", "").replace(",", "")
    tokens = clean_review.split()
    tokenized_reviews.append(tokens)

vocabulary_set = set()
for tokens in tokenized_reviews:
    for token in tokens:
        vocabulary_set.add(token)
special_tokens = ["<UNK>", "<PAD>"]
for token in special_tokens:
    vocabulary_set.add(token)
vocabulary = sorted(list(vocabulary_set))
word_to_idx = {word: idx for idx, word in enumerate(vocabulary)}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

max_sequence_length = 5
pad_idx = word_to_idx["<PAD>"]
numericalized_reviews = []
for tokens in tokenized_reviews:
    numericalized_tokens = [word_to_idx.get(token, word_to_idx["<UNK>"]) for token in tokens]
    if len(numericalized_tokens) < max_sequence_length:
        padded_tokens = numericalized_tokens + [pad_idx] * (max_sequence_length - len(numericalized_tokens))
    else:
        padded_tokens = numericalized_tokens[:max_sequence_length]
    numericalized_reviews.append(padded_tokens)
numericalized_reviews_tensor = torch.tensor(numericalized_reviews, dtype=torch.long)
dummy_labels = torch.tensor([1, 1, 0, 0, 1, 1], dtype=torch.float).unsqueeze(1)


# --- Redefine our TextSentimentDataset ---
class TextSentimentDataset(Dataset):
    def __init__(self, data_tensor, labels_tensor):
        self.data = data_tensor
        self.labels = labels_tensor
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

# --- Redefine our Classifier Model (adapted from MySimpleRegressor) ---
class SimpleSentimentClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size):
        super(SimpleSentimentClassifier, self).__init__()
        # NEW: Embedding layer for text data!
        # It takes word indices and converts them into dense vectors (embeddings).
        # vocab_size: total number of unique words in our vocabulary
        # embedding_dim: the size of the vector representation for each word (e.g., 50, 100, 300)
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # Our input to the first linear layer will be the flattened embedding of the sequence.
        # So, input_dim for fc1 is embedding_dim * max_sequence_length
        self.fc1 = nn.Linear(embedding_dim * max_sequence_length, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size) # output_size is 1 for binary classification

    def forward(self, x):
        # x comes in as (batch_size, max_sequence_length) with word indices

        # 1. Pass through embedding layer
        # Output shape: (batch_size, max_sequence_length, embedding_dim)
        x = self.embedding(x)

        # 2. Flatten the embeddings for the feedforward layer
        # Output shape: (batch_size, max_sequence_length * embedding_dim)
        x = x.view(x.size(0), -1) # x.size(0) is batch_size, -1 infers remaining dimensions

        # 3. Pass through linear layers with ReLU activation
        x = self.fc1(x)
        x = F.relu(x)

        x = self.fc2(x)
        # For binary classification with BCEWithLogitsLoss, we typically don't apply
        # Sigmoid here. The loss function handles it internally for numerical stability.
        return x

# --- Configuration for our training ---
vocab_size = len(vocabulary)
embedding_dim = 10 # Each word will be represented by a 10-dimensional vector
hidden_dim = 20    # Number of neurons in the hidden layer
output_dim = 1     # 1 output neuron for binary classification
max_seq_len = max_sequence_length # From our preprocessing
learning_rate = 0.005
epochs = 100 # Increased epochs for small dataset to see convergence
batch_size = 2 # Small batch size for our tiny dataset

# 1. Instantiate the Dataset and DataLoader
sentiment_dataset = TextSentimentDataset(numericalized_reviews_tensor, dummy_labels)
sentiment_dataloader = DataLoader(sentiment_dataset, batch_size=batch_size, shuffle=True)

# 2. Instantiate the Model
model = SimpleSentimentClassifier(vocab_size, embedding_dim, hidden_dim, output_dim)

# 3. Define Loss Function (Binary Cross-Entropy with Logits for stability)
# BCEWithLogitsLoss combines Sigmoid and Binary Cross Entropy,
# which is more numerically stable than applying Sigmoid then BCE separately.
loss_fn = nn.BCEWithLogitsLoss()

# 4. Define Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Check if GPU is available and move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Using device: {device}")

# --- The Training Loop ---
print("\nStarting Training for Sentiment Classifier...")
for epoch in range(epochs):
    model.train() # Set model to training mode
    train_loss = 0.0

    for batch_idx, (inputs, targets) in enumerate(sentiment_dataloader):
        inputs, targets = inputs.to(device), targets.to(device)

        # Forward pass
        predictions = model(inputs)

        # Calculate loss
        loss = loss_fn(predictions, targets) # predictions are logits, targets are 0s/1s
        train_loss += loss.item()

        # Zero gradients
        optimizer.zero_grad()

        # Backward pass
        loss.backward()

        # Optimizer step
        optimizer.step()

    avg_train_loss = train_loss / len(sentiment_dataloader)

    # Simple evaluation on training data (just for quick check)
    model.eval() # Set model to evaluation mode
    with torch.no_grad(): # No need to calculate gradients for evaluation
        total_correct = 0
        total_samples = 0
        for inputs, targets in sentiment_dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            predictions = model(inputs)
            # Apply sigmoid to convert logits to probabilities, then round to 0 or 1
            predicted_classes = torch.round(torch.sigmoid(predictions))
            total_correct += (predicted_classes == targets).sum().item()
            total_samples += targets.numel()
        accuracy = total_correct / total_samples

    if (epoch + 1) % 10 == 0 or epoch == 1: # Print every 10 epochs and first epoch
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {avg_train_loss:.4f}, Accuracy: {accuracy:.4f}")

print("\nTraining Finished!")

# --- Test with a new (unknown) sentence ---
# This part won't work perfectly as the model is tiny and dataset is small,
# but it shows the inference process.
print("\n--- Testing Inference ---")
def predict_sentiment(text, model, word_to_idx, max_seq_len, device):
    model.eval() # Set to evaluation mode
    clean_text = text.lower().replace("!", "").replace(".", "").replace(",", "")
    tokens = clean_text.split()
    numericalized_tokens = [word_to_idx.get(token, word_to_idx["<UNK>"]) for token in tokens]

    if len(numericalized_tokens) < max_seq_len:
        padded_tokens = numericalized_tokens + [word_to_idx["<PAD>"]] * (max_seq_len - len(numericalized_tokens))
    else:
        padded_tokens = numericalized_tokens[:max_seq_len]

    input_tensor = torch.tensor(padded_tokens, dtype=torch.long).unsqueeze(0).to(device) # Add batch dimension

    with torch.no_grad():
        output_logits = model(input_tensor)
        prediction_prob = torch.sigmoid(output_logits).item() # Convert logits to probability
        predicted_class = "Positive" if prediction_prob >= 0.5 else "Negative"
        return predicted_class, prediction_prob

# Example new sentences
test_sentence1 = "This is a great film!"
test_sentence2 = "This movie was awful."
test_sentence3 = "It was ok." # Might be tricky for tiny model

print(f"'{test_sentence1}' -> {predict_sentiment(test_sentence1, model, word_to_idx, max_sequence_length, device)}")
print(f"'{test_sentence2}' -> {predict_sentiment(test_sentence2, model, word_to_idx, max_sequence_length, device)}")
print(f"'{test_sentence3}' -> {predict_sentiment(test_sentence3, model, word_to_idx, max_sequence_length, device)}")

Using device: cuda

Starting Training for Sentiment Classifier...
Epoch [2/100], Loss: 0.5832, Accuracy: 1.0000
Epoch [10/100], Loss: 0.1798, Accuracy: 1.0000
Epoch [20/100], Loss: 0.0313, Accuracy: 1.0000
Epoch [30/100], Loss: 0.0095, Accuracy: 1.0000
Epoch [40/100], Loss: 0.0048, Accuracy: 1.0000
Epoch [50/100], Loss: 0.0030, Accuracy: 1.0000
Epoch [60/100], Loss: 0.0021, Accuracy: 1.0000
Epoch [70/100], Loss: 0.0016, Accuracy: 1.0000
Epoch [80/100], Loss: 0.0012, Accuracy: 1.0000
Epoch [90/100], Loss: 0.0010, Accuracy: 1.0000
Epoch [100/100], Loss: 0.0008, Accuracy: 1.0000

Training Finished!

--- Testing Inference ---
'This is a great film!' -> ('Positive', 0.9802723526954651)
'This movie was awful.' -> ('Positive', 0.9640846848487854)
'It was ok.' -> ('Positive', 0.9994801878929138)


In [4]:
import torch


movie_reviews=["I love this movie!","This film is great.","The acting was terrible.","I hate this film.","Great movie!","It was good."]
moview_labels=[1,1,0,0,1,1]

tokenized_reviews= [review.lower().replace("!","").replace(".","").split() for review in movie_reviews]
print(tokenized_reviews)

all_words = [word for words in tokenized_reviews for word in words]
unique_words = set(all_words)

unique_words.add("<UNK>")
unique_words.add("<PAD>")

vocabulary=[]
vocabulary=sorted(unique_words)

word_to_idx={word : indx for indx,word in enumerate(vocabulary)}
idx_to_word={indx:word for indx, word in enumerate(vocabulary)}

max_sequence_length=5
pad_idx = word_to_idx["<PAD>"]
numericalized_padded_reviews_list=[]



for tokens in tokenized_reviews:
    
    current_numerical_sequence=[word_to_idx.get(word,word_to_idx["<UNK>"]) for word in tokens]
    
    if len(current_numerical_sequence) < max_sequence_length:
        
        padding_needed = max_sequence_length - len(current_numerical_sequence)
        final_sequence_for_nn = current_numerical_sequence + [pad_idx] * padding_needed
        
    else:
        
        final_sequence_for_nn = current_numerical_sequence[:max_sequence_length]
    
    numericalized_padded_reviews_list.append(final_sequence_for_nn)

    
numericalized_reviews_tensor = torch.tensor(numericalized_padded_reviews_list, dtype=torch.long)
labels_tensor = torch.tensor(moview_labels, dtype=torch.float).unsqueeze(1)

print("\nNumericalized Reviews as PyTorch Tensor:")
print(numericalized_reviews_tensor)
print("Tensor shape:", numericalized_reviews_tensor.shape)

print("\nLabels Tensor:")
print(labels_tensor)
print("Labels shape:", labels_tensor.shape)

[['i', 'love', 'this', 'movie'], ['this', 'film', 'is', 'great'], ['the', 'acting', 'was', 'terrible'], ['i', 'hate', 'this', 'film'], ['great', 'movie'], ['it', 'was', 'good']]

Numericalized Reviews as PyTorch Tensor:
tensor([[ 7, 10, 14, 11,  0],
        [14,  3,  8,  5,  0],
        [13,  2, 15, 12,  0],
        [ 7,  6, 14,  3,  0],
        [ 5, 11,  0,  0,  0],
        [ 9, 15,  4,  0,  0]])
Tensor shape: torch.Size([6, 5])

Labels Tensor:
tensor([[1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [1.]])
Labels shape: torch.Size([6, 1])


In [2]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader


    

class TextSentimentDataset(Dataset):
    def __init__(self,data_tensor,label_tensor):
        super().__init__()
        self.data = data_tensor
        self.labels = label_tensor
        
    def __len__(self):
        return len(self.data)
        
    def __getitem__(self,x):
        return self.data[x],self.labels[x]


sentiment_dataset = TextSentimentDataset(numericalized_reviews_tensor,labels_tensor)

sentiment_dataloader = DataLoader(sentiment_dataset,batch_size=32,shuffle=True)


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

class SimpleSentimentClassifier(nn.Module):
    def __init__(self, vocab_size,max_sequence_length , embedding_dim, hidden_size, num_classes=2):
        super().__init__()
        # We need to define our layers here
        self.embedding_layer = nn.Embedding(vocab_size,embedding_dim)

        self.fc1 = nn.Linear(max_sequence_length *embedding_dim,hidden_size)

        self.fc2 = nn.Linear(hidden_size,num_classes)

    def forward(self,x):
        x=self.embedding_layer(x)
        x=x.view(x.size(0), -1)
        x=self.fc1(x)
        x=F.relu(x)
        return self.fc2(x)