In [2]:
import torch


movie_reviews=["I love this movie!","This film is great.","The acting was terrible.","I hate this film.","Great movie!","It was good."]
moview_labels=[1,1,0,0,1,1]

tokenized_reviews= [review.lower().replace("!","").replace(".","").split() for review in movie_reviews]
print(tokenized_reviews)

all_words = [word for words in tokenized_reviews for word in words]
unique_words = set(all_words)

unique_words.add("<UNK>")
unique_words.add("<PAD>")

vocabulary=[]
vocabulary=sorted(unique_words)

word_to_idx={word : indx for indx,word in enumerate(vocabulary)}
idx_to_word={indx:word for indx, word in enumerate(vocabulary)}

max_sequence_length=5
pad_idx = word_to_idx["<PAD>"]
numericalized_padded_reviews_list=[]



for tokens in tokenized_reviews:
    
    current_numerical_sequence=[word_to_idx.get(word,word_to_idx["<UNK>"]) for word in tokens]
    
    if len(current_numerical_sequence) < max_sequence_length:
        
        padding_needed = max_sequence_length - len(current_numerical_sequence)
        final_sequence_for_nn = current_numerical_sequence + [pad_idx] * padding_needed
        
    else:
        
        final_sequence_for_nn = current_numerical_sequence[:max_sequence_length]
    
    numericalized_padded_reviews_list.append(final_sequence_for_nn)

    
numericalized_reviews_tensor = torch.tensor(numericalized_padded_reviews_list, dtype=torch.long)
labels_tensor = torch.tensor(moview_labels, dtype=torch.float).unsqueeze(1)

print("\nNumericalized Reviews as PyTorch Tensor:")
print(numericalized_reviews_tensor)
print("Tensor shape:", numericalized_reviews_tensor.shape)

print("\nLabels Tensor:")
print(labels_tensor)
print("Labels shape:", labels_tensor.shape)

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader


    

class TextSentimentDataset(Dataset):
    def __init__(self,data_tensor,label_tensor):
        super().__init__()
        self.data = data_tensor
        self.labels = label_tensor
        
    def __len__(self):
        return len(self.data)
        
    def __getitem__(self,x):
        return self.data[x],self.labels[x]


sentiment_dataset = TextSentimentDataset(numericalized_reviews_tensor,labels_tensor)

sentiment_dataloader = DataLoader(sentiment_dataset,batch_size=32,shuffle=True)


import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

class SimpleSentimentClassifier(nn.Module):
    def __init__(self, vocab_size,max_sequence_length , embedding_dim, hidden_size, num_classes=1):
        super().__init__()
        # We need to define our layers here
        self.embedding_layer = nn.Embedding(vocab_size,embedding_dim)

        self.fc1 = nn.Linear(max_sequence_length *embedding_dim,hidden_size)

        self.fc2 = nn.Linear(hidden_size,num_classes)

    def forward(self,x):
        x=self.embedding_layer(x)
        x=x.view(x.size(0), -1)
        x=self.fc1(x)
        x=F.relu(x)
        return self.fc2(x)

import torch


#Initializing all the obj variable to feed to sentimentCsass
vocab_size = len(vocabulary) #or maybe batch_size?
max_sequence_length = numericalized_reviews_tensor.size(1)
embedding_dim = 10
hidden_size = max_sequence_length*embedding_dim
num_classes = 1

#Object of sentimentClass
model = SimpleSentimentClassifier(vocab_size,max_sequence_length,embedding_dim,hidden_size,num_classes)

#Defining Loss function
loss_fn = nn.BCEWithLogitsLoss()

#Optimizers
param = model.parameters()
learning_rate = 0.1
optimizer = torch.optim.Adam(param,lr=learning_rate)

#Device Selection
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

    
model.to(device)


epochs=100
print("\nStarting Training for Sentiment Classifier...")

for epoch in range(epochs):
    model.train()
    train_loss = 0.0
    for inputs , target in sentiment_dataloader:
        
        inputs = inputs.to(device)
        target = target.to(device)
        
        predictions = model(inputs)
        
        loss = loss_fn(predictions,target)
        train_loss += loss.item()
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    avg_train_loss = train_loss / len(sentiment_dataloader)


    model.eval()


    with torch.no_grad():
        total_correct = 0
        total_samples = 0
        
        # Loop through the dataloader again (or use a separate validation set if available)
        for inputs, target in sentiment_dataloader:
            inputs = inputs.to(device)
            target = target.to(device)
            
            predictions = model(inputs) # Get raw logits
            
            # Apply sigmoid to convert logits to probabilities, then round to get binary class (0 or 1)
            predicted_classes = torch.round(torch.sigmoid(predictions))
            
            # Count how many predictions match the true target
            total_correct += (predicted_classes == target).sum().item()
            total_samples += target.numel() # Count total number of elements (labels)
            
        accuracy = total_correct / total_samples

    # Set the model back to training mode for the next epoch
    model.train() # Important!

    # Print progress (e.g., every 10 epochs or on the first epoch)
    if (epoch + 1) % 10 == 0 or epoch == 0: # Print on epoch 1, 10, 20, ...
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {avg_train_loss:.4f}, Accuracy: {accuracy:.4f}")

print("\nTraining Finished!")

# --- (Next, we'd add the final inference/test section) ---


def predict_sentiment(text, model, word_to_idx, max_sequence_length,device):
    model.eval()
    # 1. Text Preprocessing (similar to what we did for training data)
    # Convert to lowercase and remove common punctuation
    clean_text = text.lower().replace("!", "").replace(".", "").replace(",", "")
    
    # Tokenize the sentence
    tokens = clean_text.split()
    
    # Numericalize: Convert words to indices using our word_to_idx map
    # Handle unknown words by mapping them to the <UNK> token's index
    numericalized_tokens = [word_to_idx.get(token, word_to_idx["<UNK>"]) for token in tokens]

    # Pad or Truncate the sequence to max_sequence_length
    if len(numericalized_tokens) < max_sequence_length:
        padding_needed = max_sequence_length - len(numericalized_tokens)
        padded_tokens = numericalized_tokens + [word_to_idx["<PAD>"]] * padding_needed
    else:
        padded_tokens = numericalized_tokens[:max_sequence_length]

    # Convert the processed list to a PyTorch tensor
    # .unsqueeze(0) adds a batch dimension (since our model expects batches, even for a single sample)
    input_tensor = torch.tensor(padded_tokens, dtype=torch.long).unsqueeze(0).to(device)

    # 2. Model Inference
    # Disable gradient calculations (important for inference/evaluation)
    with torch.no_grad():
        # Get raw logits from the model
        output_logits = model(input_tensor)
        
        # Apply sigmoid to convert logits to a probability between 0 and 1
        prediction_prob = torch.sigmoid(output_logits).item() # .item() gets the Python number from a 1-element tensor

    # 3. Determine the final class prediction
    predicted_class = "Positive" if prediction_prob >= 0.5 else "Negative"
    
    return predicted_class, prediction_prob

# --- Testing with new sentences (Add this after the function definition) ---
print("\n--- Testing Inference on New Sentences ---")

test_sentence1 = "This is a great film! I really enjoyed it."
test_sentence2 = "This movie was absolutely awful and boring."
test_sentence3 = "It was just okay." # A tricky one, might be difficult for a tiny model trained on tiny data

# Call the function to predict sentiment for each sentence
sentiment1, prob1 = predict_sentiment(test_sentence1, model, word_to_idx, max_sequence_length, device)
print(f"'{test_sentence1}' -> Predicted: {sentiment1} (Probability: {prob1:.4f})")

sentiment2, prob2 = predict_sentiment(test_sentence2, model, word_to_idx, max_sequence_length, device)
print(f"'{test_sentence2}' -> Predicted: {sentiment2} (Probability: {prob2:.4f})")

sentiment3, prob3 = predict_sentiment(test_sentence3, model, word_to_idx, max_sequence_length, device)
print(f"'{test_sentence3}' -> Predicted: {sentiment3} (Probability: {prob3:.4f})")

[['i', 'love', 'this', 'movie'], ['this', 'film', 'is', 'great'], ['the', 'acting', 'was', 'terrible'], ['i', 'hate', 'this', 'film'], ['great', 'movie'], ['it', 'was', 'good']]

Numericalized Reviews as PyTorch Tensor:
tensor([[ 7, 10, 14, 11,  0],
        [14,  3,  8,  5,  0],
        [13,  2, 15, 12,  0],
        [ 7,  6, 14,  3,  0],
        [ 5, 11,  0,  0,  0],
        [ 9, 15,  4,  0,  0]])
Tensor shape: torch.Size([6, 5])

Labels Tensor:
tensor([[1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [1.]])
Labels shape: torch.Size([6, 1])

Starting Training for Sentiment Classifier...
Epoch [1/100], Loss: 0.7170, Accuracy: 1.0000
Epoch [10/100], Loss: 0.0000, Accuracy: 1.0000
Epoch [20/100], Loss: 0.0000, Accuracy: 1.0000
Epoch [30/100], Loss: 0.0000, Accuracy: 1.0000
Epoch [40/100], Loss: 0.0000, Accuracy: 1.0000
Epoch [50/100], Loss: 0.0000, Accuracy: 1.0000
Epoch [60/100], Loss: 0.0000, Accuracy: 1.0000
Epoch [70/100], Loss: 0.0000, Accuracy: 1.0000
Epoch [80/

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import time # To track training time
import numpy as np # Often used for numerical operations

# --- 1. Data Definition ---
movie_reviews = [
    "I love this movie!",
    "This film is great.",
    "The acting was terrible.",
    "I hate this film.",
    "Great movie!",
    "It was good.",
]
movie_labels = [1, 1, 0, 0, 1, 1] # 1 for positive, 0 for negative

# --- 2. Text Preprocessing ---
def preprocess_text(reviews, max_seq_len, word_to_idx, pad_idx, unk_idx):
    """
    Tokenizes, numericalizes, and pads/truncates a list of raw text reviews.
    """
    tokenized_reviews = [
        review.lower().replace("!", "").replace(".", "").replace(",", "").split()
        for review in reviews
    ]

    numericalized_padded_reviews_list = []
    for tokens in tokenized_reviews:
        # Numericalize: Convert words to indices, handling unknown words
        current_numerical_sequence = [
            word_to_idx.get(word, unk_idx) for word in tokens
        ]

        # Pad or Truncate: Make all sequences max_sequence_length
        if len(current_numerical_sequence) < max_seq_len:
            padding_needed = max_seq_len - len(current_numerical_sequence)
            final_sequence_for_nn = current_numerical_sequence + [pad_idx] * padding_needed
        else:
            final_sequence_for_nn = current_numerical_sequence[:max_seq_len]

        numericalized_padded_reviews_list.append(final_sequence_for_nn)

    return numericalized_padded_reviews_list

# --- Build Vocabulary (needs to be done once from all data) ---
all_words = [word for review_tokens in (
    r.lower().replace("!", "").replace(".", "").replace(",", "").split() for r in movie_reviews
) for word in review_tokens]

unique_words = set(all_words)
unique_words.add("<UNK>") # For unknown words
unique_words.add("<PAD>") # For padding shorter sequences

vocabulary = sorted(list(unique_words)) # Ensure consistent order
word_to_idx = {word: idx for idx, word in enumerate(vocabulary)}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

# Define necessary preprocessing constants
MAX_SEQUENCE_LENGTH = 5 # Max length of a review sequence
PAD_IDX = word_to_idx["<PAD>"]
UNK_IDX = word_to_idx["<UNK>"]
VOCAB_SIZE = len(vocabulary)

# Preprocess the data using the defined function and constants
numericalized_reviews_list = preprocess_text(
    movie_reviews, MAX_SEQUENCE_LENGTH, word_to_idx, PAD_IDX, UNK_IDX
)

# Convert preprocessed data and labels to PyTorch tensors
numericalized_reviews_tensor = torch.tensor(
    numericalized_reviews_list, dtype=torch.long
)
labels_tensor = torch.tensor(movie_labels, dtype=torch.float).unsqueeze(1) # Unsqueeze for BCEWithLogitsLoss

# --- 3. Custom Dataset and DataLoader ---
class TextSentimentDataset(Dataset):
    def __init__(self, data_tensor, label_tensor):
        self.data = data_tensor
        self.labels = label_tensor

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

# --- 4. Define the Model (SimpleSentimentClassifier) ---
class SimpleSentimentClassifier(nn.Module):
    def __init__(
        self, vocab_size, max_sequence_length, embedding_dim, hidden_size, num_classes=1
    ):
        super().__init__()
        # Embedding layer: converts word indices to dense vectors
        self.embedding_layer = nn.Embedding(vocab_size, embedding_dim)

        # First fully connected layer (input to hidden)
        # Input size is flattened sequence length * embedding dimension
        self.fc1 = nn.Linear(max_sequence_length * embedding_dim, hidden_size)

        # Second fully connected layer (hidden to output)
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # x is (batch_size, max_sequence_length) with word indices
        
        # 1. Pass through embedding layer
        # Output shape: (batch_size, max_sequence_length, embedding_dim)
        x = self.embedding_layer(x)
        
        # 2. Flatten the embeddings for the feedforward layer
        # Output shape: (batch_size, max_sequence_length * embedding_dim)
        x = x.view(x.size(0), -1) # -1 infers the dimension based on total elements

        # 3. Pass through first linear layer with ReLU activation
        x = self.fc1(x)
        x = F.relu(x)

        # 4. Pass through final linear layer (outputs raw logits)
        return self.fc2(x)

# --- 5. Inference Function ---
def predict_sentiment(text, model, word_to_idx, max_seq_len, device, unk_idx, pad_idx):
    """
    Predicts the sentiment of a single raw text string using the trained model.
    """
    model.eval() # Set model to evaluation mode
    
    # Preprocess the input text (same logic as training data)
    clean_text = text.lower().replace("!", "").replace(".", "").replace(",", "")
    tokens = clean_text.split()
    numericalized_tokens = [word_to_idx.get(token, unk_idx) for token in tokens]

    if len(numericalized_tokens) < max_seq_len:
        padded_tokens = numericalized_tokens + [pad_idx] * (max_seq_len - len(numericalized_tokens))
    else:
        padded_tokens = numericalized_tokens[:max_seq_len]

    # Convert to PyTorch tensor and add batch dimension, move to device
    input_tensor = torch.tensor(padded_tokens, dtype=torch.long).unsqueeze(0).to(device)

    # Perform inference (no gradient calculation)
    with torch.no_grad():
        output_logits = model(input_tensor)
        prediction_prob = torch.sigmoid(output_logits).item() # Convert logits to probability

    predicted_class = "Positive" if prediction_prob >= 0.5 else "Negative"
    
    return predicted_class, prediction_prob


# --- Main Training and Testing Execution Block ---
def main():
    # --- Training Configuration ---
    EMBEDDING_DIM = 10
    HIDDEN_SIZE = MAX_SEQUENCE_LENGTH * EMBEDDING_DIM # Can also be an independent choice
    NUM_CLASSES = 2
    LEARNING_RATE = 0.005
    EPOCHS = 100
    BATCH_SIZE = 2 # Small for tiny dataset

    # --- Data Loading ---
    sentiment_dataset = TextSentimentDataset(numericalized_reviews_tensor, labels_tensor)
    sentiment_dataloader = DataLoader(sentiment_dataset, batch_size=BATCH_SIZE, shuffle=True)

    # --- Model Instantiation ---
    model = SimpleSentimentClassifier(
        VOCAB_SIZE, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM, HIDDEN_SIZE, NUM_CLASSES
    )

    # --- Loss Function and Optimizer Definition ---
    loss_fn = nn.BCEWithLogitsLoss() # Combines Sigmoid and Binary Cross-Entropy for stability
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    # --- Device Setup (CPU vs GPU) ---
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    print(f"Using device for training: {device}")

    # --- Training Loop ---
    print("\nStarting Training for Sentiment Classifier...")
    start_time = time.time()

    for epoch in range(EPOCHS):
        model.train() # Set model to training mode
        train_loss = 0.0
        
        for inputs, targets in sentiment_dataloader:
            inputs, targets = inputs.to(device), targets.to(device) # Move data to device

            # Forward pass, loss calculation, backpropagation, and optimization step
            predictions = model(inputs)
            loss = loss_fn(predictions, targets)
            train_loss += loss.item()

            optimizer.zero_grad() # Zero gradients
            loss.backward()       # Backpropagation
            optimizer.step()      # Update weights

        avg_train_loss = train_loss / len(sentiment_dataloader)

        # --- Evaluation on training data (simple check, not for true performance) ---
        model.eval() # Set model to evaluation mode
        with torch.no_grad(): # Disable gradient calculations
            total_correct = 0
            total_samples = 0
            for inputs, targets in sentiment_dataloader:
                inputs, targets = inputs.to(device), targets.to(device)
                predictions = model(inputs)
                predicted_classes = torch.round(torch.sigmoid(predictions)) # Probabilities to 0/1 classes
                total_correct += (predicted_classes == targets).sum().item()
                total_samples += targets.numel()
            accuracy = total_correct / total_samples

        # Print progress report
        if (epoch + 1) % 10 == 0 or epoch == 1:
            print(f"Epoch [{epoch+1}/{EPOCHS}], Loss: {avg_train_loss:.4f}, Accuracy: {accuracy:.4f}")

    end_time = time.time()
    print(f"\nTraining Finished! Total time: {end_time - start_time:.2f} seconds")

    # --- Testing Inference on New Sentences ---
    print("\n--- Testing Inference on New Sentences ---")
    test_sentence1 = "This is a great film! I really enjoyed it."
    test_sentence2 = "This movie was absolutely awful and boring."
    test_sentence3 = "It was just okay." # A tricky one, might be difficult for a tiny model trained on tiny data

    sentiment1, prob1 = predict_sentiment(test_sentence1, model, word_to_idx, MAX_SEQUENCE_LENGTH, device, UNK_IDX, PAD_IDX)
    print(f"'{test_sentence1}' -> Predicted: {sentiment1} (Probability: {prob1:.4f})")

    sentiment2, prob2 = predict_sentiment(test_sentence2, model, word_to_idx, MAX_SEQUENCE_LENGTH, device, UNK_IDX, PAD_IDX)
    print(f"'{test_sentence2}' -> Predicted: {sentiment2} (Probability: {prob2:.4f})")

    sentiment3, prob3 = predict_sentiment(test_sentence3, model, word_to_idx, MAX_SEQUENCE_LENGTH, device, UNK_IDX, PAD_IDX)
    print(f"'{test_sentence3}' -> Predicted: {sentiment3} (Probability: {prob3:.4f})")

# This ensures main() runs when the script is executed
if __name__ == "__main__":
    main()

Using device for training: cuda

Starting Training for Sentiment Classifier...


ValueError: Target size (torch.Size([2, 1])) must be the same as input size (torch.Size([2, 2]))

# 💼 Interview Preparation Notes: Applied NLP & LLMs

This section compiles key concepts, common questions, and strategic points for technical and behavioral interviews for Applied NLP Engineer roles, drawing directly from the learning roadmap and projects.

---

## Module 1: Deep Learning Fundamentals (Interview Focus)

### 1. PyTorch Tensors

* **Concise Definition:** PyTorch Tensors are multi-dimensional arrays, similar to NumPy arrays, but optimized for deep learning computations, especially on GPUs, and are the fundamental data structure for all operations in PyTorch.
* **"Why it Matters" / Importance:**
    * **GPU Acceleration:** Tensors can be easily moved to a GPU (`.to('cuda')`), which performs parallel computations vastly faster than a CPU, crucial for training large neural networks and LLMs.
    * **Automatic Differentiation (Autograd):** Tensors are the foundation for Autograd, enabling automatic gradient computation, which is vital for model learning.
    * **Universal Data Format:** All inputs, outputs, and model parameters (weights, biases) in PyTorch are Tensors.
* **Key PyTorch API:**
    * `torch.tensor([...])`: Create tensor from Python list.
    * `torch.zeros()`, `torch.ones()`, `torch.rand()`, `torch.randn()`: Create tensors with specific initial values.
    * `.shape`, `.size()`: Get the dimensions of a tensor.
    * `.dtype`: Get the data type of a tensor.
    * `.to('cuda')` / `.to('cpu')`: Move tensor between GPU and CPU.
    * `.view()`, `.reshape()`: Reshape tensors (e.g., for "flattening").
* **Common Pitfalls/Considerations:**
    * **CPU vs. GPU:** Ensure both model and input data are on the same device.
    * **Data Types:** Be mindful of `torch.float32` (common for calculations) vs. `torch.long` (common for indices/integers).
    * **"Why Tensors over Python lists or NumPy arrays?":** For mathematical operations, NumPy is fine, but Tensors offer *built-in GPU acceleration* and *gradient tracking (Autograd)*, which NumPy does not. Python lists are too slow for numerical computations.

### 2. Autograd and Calculus for ML

* **Concise Definition:** Autograd is PyTorch's automatic differentiation engine that calculates gradients for all operations on tensors, enabling neural networks to learn by knowing how to adjust their parameters to minimize loss.
* **"Why it Matters" / Importance:**
    * **Enables Learning:** It's the core mechanism for backpropagation and gradient descent, allowing models to adjust weights iteratively.
    * **Eliminates Manual Derivatives:** Automates the extremely complex and error-prone process of calculating derivatives for millions/billions of parameters.
    * **Flexibility (Dynamic Graphs):** Autograd builds the computation graph on-the-fly, allowing for dynamic network structures (e.g., conditional logic, variable sequence lengths).
* **Key PyTorch API:**
    * `tensor.requires_grad = True`: Flag a tensor to track operations for gradient computation.
    * `loss.backward()`: Triggers the backward pass, computing gradients of the loss with respect to all `requires_grad=True` tensors.
    * `optimizer.zero_grad()`: Clears accumulated gradients before a new backward pass (crucial!).
    * `with torch.no_grad():`: Context manager to temporarily disable gradient tracking (for inference/evaluation).
* **Connecting Concepts (Calculus):**
    * **Gradient:** A vector indicating the direction of steepest ascent of a function (we move in the opposite direction to minimize loss).
    * **Partial Derivatives:** Used when a function has multiple inputs, measuring change with respect to one variable while holding others constant.
    * **Chain Rule:** The fundamental calculus rule Autograd uses to efficiently compute gradients across multiple layers of a network (derivative of composite functions).
* **Common Pitfalls/Debugging:**
    * **Forgetting `optimizer.zero_grad()`:** Leads to gradients accumulating from previous steps, causing incorrect updates and convergence issues.
    * **Accidental `requires_grad=True` in evaluation:** Wastes memory and computation. Use `torch.no_grad()`.
    * **Detaching tensors:** Sometimes you need to `.detach()` a tensor to remove it from the computation graph if you don't need its gradients tracked.

### 3. Neural Network Building Blocks (Neurons, Layers, Activation Functions)

* **Concise Definition:** Neural networks are compositions of interconnected "neurons" organized into "layers," which process data through weighted sums, biases, and non-linear activation functions to learn complex patterns.
* **"Why it Matters" / Importance:** These are the fundamental components that allow deep learning models to approximate complex real-world functions and learn from data.
* **Key Concepts:**
    * **Neuron (Node):** Basic unit. Takes inputs, multiplies by weights, adds bias, applies activation.
    * **Weights & Biases:** Learnable parameters adjusted during training to capture patterns and shift outputs.
    * **Activation Functions:** Introduce non-linearity. Without them, a deep network would just be a linear model.
        * **ReLU (`F.relu`):** `max(0, x)`. Efficient, common in hidden layers.
        * **Sigmoid (`torch.sigmoid`):** Squashes to (0,1). Good for binary classification output probabilities.
        * **Softmax (`F.softmax`):** Converts raw scores to a probability distribution (sum to 1). Good for multi-class classification output.
    * **Layers:**
        * **Input Layer:** Receives raw features.
        * **Hidden Layer(s):** Perform intermediate computations, learn complex features.
        * **Output Layer:** Produces final prediction, tailored to the task.
    * **Forward Pass:** Data flowing sequentially through layers from input to output to generate predictions.
* **`nn.Module` (Recap):** The base class for all network layers and models in PyTorch. It tracks parameters and handles GPU/CPU moves.
* **`nn.Linear`:** A "fully connected" layer. Performs `Output = (Input * Weights) + Bias`. Used for general feature combination and transformation.
* **`nn.Embedding` (Crucial for Text):**
    * **What it is:** A specialized layer that takes **integer word indices** as input and outputs **dense, continuous vectors (embeddings)** for each index. These vectors are learnable.
    * **"Why more vectors (embeddings) instead of just indices?":** Integer IDs are arbitrary and carry no semantic meaning or relationship (e.g., word "apple" is 1, "banana" is 2 - no inherent similarity). Embeddings learn to place semantically similar words close together in a multi-dimensional space, capturing meaning and context (e.g., vector for "king" is close to "queen").
    * **"Why `nn.Embedding` as the first layer?":** Because the network needs a meaningful, numerical representation of words *before* it can perform any "thinking" (linear transformations) about sentiment or other text patterns. `nn.Embedding` is the "translator" from meaningless ID to meaningful profile. Higher `embedding_dim` allows for richer profiles.
* **Common Pitfalls/Considerations:**
    * **Non-linearity:** Forgetting activation functions means your network can only learn linear relationships.
    * **Output Layer Activation:** Choosing the correct final activation based on task (Sigmoid for binary prob, Softmax for multi-class prob, none for regression).

### 4. Loss Functions and Optimizers

* **Concise Definition:** Loss functions quantify a model's prediction error, while optimizers use gradients to adjust model parameters to minimize that error.
* **"Why it Matters" / Importance:** They form the core feedback loop that allows neural networks to "learn" and improve their performance over time.
* **Key Concepts:**
    * **Loss Function:** (Also Cost/Objective Function) A single number representing "how wrong" the model's prediction is. Goal: minimize.
        * **Mean Squared Error (MSE) Loss (`nn.MSELoss`):** For **regression** (predicting continuous numbers). Penalizes larger errors more.
        * **Binary Cross-Entropy (BCE) Loss:**
            * **What it is:** Measures dissimilarity between predicted probabilities and true binary labels. Heavily penalizes confident wrong predictions.
            * **`nn.BCEWithLogitsLoss`:** **Preferred for binary classification in PyTorch.** It takes raw model outputs (logits), *internally applies a Sigmoid activation*, and then calculates BCE loss. This combination is **numerically more stable** than applying Sigmoid and `nn.BCELoss` separately.
    * **Optimizer:** Algorithm that updates model weights based on gradients.
        * **Learning Rate (`lr`):** The step size for parameter updates. Critical hyperparameter. Too small = slow learning, stuck. Too large = overshoot, oscillate, diverge.
        * **Stochastic Gradient Descent (SGD):** Basic optimizer, moves weights opposite to gradient.
        * **Adam (`torch.optim.Adam`):** Popular, adaptive optimizer. Adjusts learning rate for each parameter based on historical gradients, often leading to faster and more stable convergence.
* **Common Pitfalls/Debugging:**
    * **Choosing Wrong Loss:** Using MSE for classification or BCE for multi-class.
    * **Learning Rate Tuning:** Model not learning (too low) or diverging (too high).
    * **Zeroing Gradients:** Forgetting `optimizer.zero_grad()` leads to accumulated gradients and incorrect updates.

### 5. Data Preparation (`Dataset` & `DataLoader`)

* **Concise Definition:** `Dataset` provides an interface to individual data samples, while `DataLoader` efficiently batches, shuffles, and loads these samples for training.
* **"Why it Matters" / Importance:** Essential for handling large datasets efficiently and preparing them for neural network training.
* **Key Concepts:**
    * **`torch.utils.data.Dataset`:** "The Librarian." Represents the collection of samples. You create a custom class implementing:
        * `__len__(self)`: Returns total number of samples.
        * `__getitem__(self, idx)`: Returns a single `(input_data, label)` pair for a given index.
    * **`torch.utils.data.DataLoader`:** "The Delivery Service." Wraps a `Dataset` and provides:
        * **Batching:** Grouping samples into mini-batches for efficient processing by GPUs.
        * **Shuffling:** Randomizing data order for better generalization.
        * **`num_workers`:** (Optional) Parallel data loading using multiple CPU processes to prevent GPU idle time.
* **Common Pitfalls/Debugging:**
    * **`__len__` or `__getitem__` errors:** Incorrect implementation leads to indexing issues.
    * **`num_workers > 0` on Windows:** Can sometimes cause issues (related to multiprocessing), `num_workers=0` (main process) is safer on Windows if problems arise.
    * **Data not on device:** Forgetting to `inputs.to(device)` and `targets.to(device)` inside the training loop.

### 6. The Complete Training Loop

* **Concise Definition:** The iterative cycle where a model learns from data by making predictions, measuring error, computing adjustments, and updating its parameters over many epochs and batches.
* **"Why it Matters" / Importance:** This is the heart of deep learning; it's how models are actually "trained" to perform their task.
* **Key Steps (The 5-Step Cycle per Batch):**
    1.  **Forward Pass:** `predictions = model(inputs)` (Model makes a guess).
    2.  **Calculate Loss:** `loss = loss_fn(predictions, targets)` (Measures "wrongness").
    3.  **Zero Gradients:** `optimizer.zero_grad()` (Clears previous adjustment instructions).
    4.  **Backward Pass:** `loss.backward()` (Computes *new* adjustments/gradients).
    5.  **Optimizer Step:** `optimizer.step()` (Applies the adjustments to weights).
* **Monitoring & Evaluation:**
    * `model.train()`: Sets model to training mode (e.g., enables dropout). Call at start of epoch.
    * `model.eval()`: Sets model to evaluation/inference mode (e.g., disables dropout). Call before evaluation.
    * `with torch.no_grad():`: Disables gradient tracking during evaluation/inference to save memory and speed.
    * **Accuracy Calculation:** `torch.round(torch.sigmoid(predictions)) == targets` then `.sum().item()` and divide by `total_samples`.
* **Common Pitfalls/Debugging:**
    * Forgetting any of the 5 core steps.
    * Not switching `model.train()` / `model.eval()`.
    * Not using `with torch.no_grad()` for evaluation.
    * `Target size` / `Input size` mismatches (often due to `num_classes` or `unsqueeze` issues).
    * Overfitting (low training loss/high training accuracy, but poor test performance).

### 7. Text Preprocessing (Mini-Project Specific)

* **Concise Definition:** Converting raw human language into a numerical, fixed-length format that a neural network can process.
* **Key Steps:**
    * **Tokenization:** Breaking text into words (e.g., `sentence.lower().replace().split()`).
    * **Vocabulary Creation:** Building a dictionary of unique words and assigning them integer IDs, including `<UNK>` (for unknown words) and `<PAD>` (for padding).
    * **Numericalization:** Replacing words with their corresponding integer IDs.
    * **Padding/Truncation:** Making all sequences the same length by adding `PAD` tokens or cutting longer sequences.
* **"Why not code from scratch in real world?":**
    * While we built it from scratch for *conceptual understanding*, in production, you use highly optimized libraries (e.g., Hugging Face `tokenizers`, spaCy, NLTK).
    * These libraries handle complex edge cases, are faster (often in C++/Rust), and are battle-tested.
    * **Your Value:** Understanding the *concepts* from scratch allows you to effectively *use, debug, and customize* these powerful libraries.

### 8. Debugging Overfitting (from Mini-Project Experience)

* **What it is:** When a model performs very well on the training data (memorization) but poorly on new, unseen data (fails to generalize).
* **Symptoms:** Low training loss + high training accuracy, but bad performance on a separate validation/test set. (As seen in your mini-project with 100% accuracy on 6 sentences, but wrong predictions on new ones).
* **Common Causes (as seen in mini-project):**
    * **Extremely Small Dataset:** Not enough diverse examples to learn general rules.
    * **Too Many Epochs on Small Data:** Reinforces memorization.
    * **Overly Complex Model for Data Size:** Even simple models can overfit tiny datasets.
* **Solutions (Conceptually):**
    * **More Data (Best Solution):** Gather more diverse training examples.
    * **Regularization:** Techniques like Dropout (randomly turning off neurons) or L1/L2 regularization (penalizing large weights).
    * **Early Stopping:** Stop training when performance on a *separate validation set* starts to degrade, even if training loss still goes down.
    * **Cross-Validation:** For very small datasets, using techniques like k-fold cross-validation can give more robust evaluation.

---