In [3]:
import torch
import numpy as np
import torch.functional as F 


In [None]:
# This is a vector

a = torch.tensor([
    1,2,3,4,5
])
print(a.ndim)
print(a.shape)


1
torch.Size([5])


In [6]:
# This is a MATRIX

b = torch.tensor([
    [1,2,3,4],
    [3,2,4,5]
])
print(a.ndim)
print( a.shape)

1
torch.Size([5])


In [8]:
# THIS IS A TENSOR

c = torch.tensor([
    [
        [1,2,3],
        [4,5,6],
        [7,8,9]
    ]
])
print(c.ndim)
print(c.shape)

3
torch.Size([1, 3, 3])


| **Name**  | **What is it?**                                                                 | **Number of dimensions** | **Lower or upper (usually/example)** |
|-----------|----------------------------------------------------------------------------------|---------------------------|----------------------------------------|
| **scalar**| a single number                                                                  | 0                         | Lower ( *a* )                          |
| **vector**| a number with direction (e.g. wind speed with direction) but can also have many other numbers | 1                         | Lower ( *y* )                          |
| **matrix**| a 2-dimensional array of numbers                                                 | 2                         | Upper ( *Q* )                          |
| **tensor**| an n-dimensional array of numbers. Can be any number. A 0-dim tensor is a scalar, a 1-dim tensor is a vector | n                         | Upper ( *X* )                          |


In [None]:
# 🔹 1. torch.rand()

# ➤ What it does:
# Returns a tensor filled with random numbers from a uniform distribution over [0, 1).

d = torch.rand
d

# ➤ Use when:
# You need values between 0 and 1 (e.g., initializing weights, dropout masks).

tensor([[0.1417, 0.8812, 0.7229],
        [0.8477, 0.3155, 0.9697]])

In [None]:
# 🔹 2. torch.randn()

# ➤ What it does:
# Returns a tensor filled with random numbers from a normal (Gaussian) distribution, with mean = 0 and std = 1.

d = torch.randn(2,3)
d

# ➤ Use when:
# You want random noise or weight initialization centered around 0.

tensor([[ 1.9868,  0.9877, -1.1945],
        [ 0.5119, -0.2236,  0.7186]])

| Function             | Distribution    | Range   | Shape Source | Typical Use Case                 |
| -------------------- | --------------- | ------- | ------------ | -------------------------------- |
| `torch.rand()`       | Uniform         | \[0, 1) | Manual       | Weights, probabilities           |
| `torch.randn()`      | Normal (mean=0) | (-∞, ∞) | Manual       | Noise, weight init               |
| `torch.rand_like()`  | Uniform         | \[0, 1) | Like input   | Dropout, masks like input tensor |
| `torch.randn_like()` | Normal (mean=0) | (-∞, ∞) | Like input   | Add noise to activations, etc.   |


| Term | Meaning                        | Effect on Distribution     |
| ---- | ------------------------------ | -------------------------- |
| Mean | Central value (μ)              | Shifts curve left/right    |
| Std  | Standard deviation (σ)         | Stretches/squeezes curve   |
| 0, 1 | "Standard normal" distribution | Default in `torch.randn()` |


In [15]:
# zeros and ones 


a = torch.zeros(3,4)
print(a)

b = torch.randn(4,4)
c = torch.zeros_like(b)
print(c)

tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]])
tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]])


In [17]:
d = torch.ones(2,3)
e = torch.ones_like(b)
print(d)
print(e)

tensor([[1., 1., 1.],
        [1., 1., 1.]])
tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]])


In [18]:
# CREATING A RANGE OF TENSORS

a = torch.arange(start=0, end=1, step=0.1)
a

tensor([0.0000, 0.1000, 0.2000, 0.3000, 0.4000, 0.5000, 0.6000, 0.7000, 0.8000,
        0.9000])

In [21]:
# CREATING A RANGE OF TENSORS

a = torch.arange(start=0, end=1, step=0.25)
a

tensor([0.0000, 0.2500, 0.5000, 0.7500])


## 🔸 Basic Tensor Arithmetic

### ➤ Common Operators:

| Operation      | Symbol | Function      | Example       |
| -------------- | ------ | ------------- | ------------- |
| Addition       | `+`    | `torch.add()` | `tensor + 10` |
| Subtraction    | `-`    | `torch.sub()` | `tensor - 10` |
| Multiplication | `*`    | `torch.mul()` | `tensor * 10` |
| Division       | `/`    | `torch.div()` | `tensor / 10` |

### ➤ Element-wise Example:

```python
tensor = torch.tensor([1, 2, 3])
tensor + 10           # tensor([11, 12, 13])
tensor * 10           # tensor([10, 20, 30])
tensor * tensor       # tensor([1, 4, 9])
```

**Important:** Tensors do not mutate unless reassigned.

---

## 🔸 Matrix Multiplication (The Core of Deep Learning)

### ➤ Core Function:

```python
torch.matmul(A, B)  # or simply A @ B
```

### ➤ Rules:

* **Inner dimensions must match**: `(m x n) @ (n x p)` ✅
* **Output shape**: `(m x p)`

### ➤ Example:

```python
A = torch.tensor([1, 2, 3])
A @ A         # 1*1 + 2*2 + 3*3 = 14
```

### ➤ Tensor Matrix Shapes:

```python
tensor_A: (3 x 2)
tensor_B: (3 x 2) ❌
tensor_B.T: (2 x 3) ✅
torch.matmul(tensor_A, tensor_B.T)  # (3 x 2) @ (2 x 3) -> (3 x 3)
```

---

## 🔸 Shape Errors & Transpose

* Most common PyTorch error: **shape mismatch** in matrix operations.
* Fix using:

```python
torch.transpose(tensor, 0, 1)
tensor.T  # shorthand
```

---


In [37]:
# 🔸 1. torch.reshape() — Change the shape
# Changes the shape of a tensor (as long as the total number of elements stays the same).

x = torch.arange(6)         # tensor([0, 1, 2, 3, 4, 5])
x_reshaped = torch.reshape(x, (2, 3))      # or we can also do x.reshape(2,3)
print(x_reshaped)

tensor([[0, 1, 2],
        [3, 4, 5]])


In [38]:
# 🔸 2. tensor.view() — Like reshape but faster
# Creates a view (shared memory) of a tensor in a new shape.

x = torch.tensor([[1, 2, 3], [4, 5, 6]])
viewed = x.view(3, 2)
print(viewed)

tensor([[1, 2],
        [3, 4],
        [5, 6]])


In [39]:
# 🔸 3. torch.stack() — Stack tensors along a new dim
# Used in multi-head attention, batch creation, etc.

a = torch.tensor([1, 2, 3])
b = torch.tensor([4, 5, 6])
stacked = torch.stack([a, b], dim=0)  # New 0-th dimension
print(stacked)

tensor([[1, 2, 3],
        [4, 5, 6]])


In [40]:
# 🔸 4. torch.squeeze() — Remove dims of size 1
# Cleans up shape. Often used after models.

x = torch.tensor([[[1], [2], [3]]])
print(x.shape)             # (1, 3, 1)
print(torch.squeeze(x).shape)  # (3,)
print(torch.squeeze(x))

torch.Size([1, 3, 1])
torch.Size([3])
tensor([1, 2, 3])


In [None]:
# 🔸 5. torch.unsqueeze() — Add a 1-dim
# Prepares data for broadcasting or adding batch/time/head dims.

x = torch.tensor([1, 2, 3])
x_unsq = torch.unsqueeze(x, 0)
print(x_unsq.shape)  # torch.Size([1, 3])
print(x_unsq)

# 🧠 Used in attention like:
# (batch, seq_len) → (batch, 1, seq_len) to match matrix shapes.

torch.Size([1, 3])
tensor([[1, 2, 3]])


In [None]:
# 🔸 6. torch.permute() — Reorder dimensions
# Absolutely crucial for attention where dimension order matters.

x = torch.rand(2, 3, 4)  # (batch, seq_len, features)
print(x)
x_perm = x.permute(0, 2, 1)
print(x_perm.shape)  # torch.Size([2, 4, 3])
print("")
print("")
print(x_perm)


# How the above example worked
# when we permute the 2nd and 3rd dimension , we just make the rows into columns


# 🧠 Why care?
# Transformers expect inputs in (batch, heads, seq_len, head_dim)
# Attention scores require precise shape alignment

tensor([[[0.8863, 0.7747, 0.7695, 0.5216],
         [0.4060, 0.1577, 0.3718, 0.7764],
         [0.0636, 0.6644, 0.3799, 0.9512]],

        [[0.6755, 0.0513, 0.0047, 0.6641],
         [0.8687, 0.3457, 0.8124, 0.3000],
         [0.8270, 0.1543, 0.2062, 0.5369]]])
torch.Size([2, 4, 3])


tensor([[[0.8863, 0.4060, 0.0636],
         [0.7747, 0.1577, 0.6644],
         [0.7695, 0.3718, 0.3799],
         [0.5216, 0.7764, 0.9512]],

        [[0.6755, 0.8687, 0.8270],
         [0.0513, 0.3457, 0.1543],
         [0.0047, 0.8124, 0.2062],
         [0.6641, 0.3000, 0.5369]]])


| Function    | Purpose                               | Example Shape Change  |
| ----------- | ------------------------------------- | --------------------- |
| `reshape`   | Change shape                          | (6,) → (2, 3)         |
| `view`      | Same as reshape, faster when possible | (2, 3) → (3, 2)       |
| `unsqueeze` | Add 1 dim                             | (3,) → (1, 3)         |
| `squeeze`   | Remove 1-dims                         | (1, 3, 1) → (3,)      |
| `stack`     | Stack along new dim                   | \[a, b] (3,) → (2, 3) |
| `permute`   | Reorder dims                          | (B, S, H) → (B, H, S) |


In [None]:
# 🔍 Visual Explanation:
# Let’s assume a tensor with shape (2, 3, 4) representing:
# 2 images
# 3 rows (height)
# 4 columns (width)
# Original Shape: (H, W, C) = (224, 224, 3)

# Before:
#   Axis 0: Height → 224 rows
#   Axis 1: Width  → 224 columns
#   Axis 2: Channels → [R, G, B]

# After permute(2, 0, 1):
#   Axis 0: Channels → Now first axis (3)
#   Axis 1: Height   → Now second axis (224)
#   Axis 2: Width    → Now third axis (224)
# You're not changing the pixel values — you're just changing how you're reading them.





# 💡 TL;DR Intuition
# permute() is like rotating the axes of a Rubik's cube — it changes how you look at the data, not the data itself.

| Action        | What Happens                                         |
| ------------- | ---------------------------------------------------- |
| `permute()`   | Changes axis **order** only (no copying)             |
| `reshape()`   | Changes shape (requires same # elements)             |
| `view()`      | Same as reshape but faster (if memory layout allows) |
| `transpose()` | Special case of permute (only 2 dims swapped)        |


## Accessing data from tensors

In [46]:
# Create a tensor 
import torch
x = torch.arange(1, 10).reshape(1, 3, 3)
x, x.shape

(tensor([[[1, 2, 3],
          [4, 5, 6],
          [7, 8, 9]]]),
 torch.Size([1, 3, 3]))

In [47]:
# Let's index bracket by bracket
print(f"First square bracket:\n{x[0]}") 
print(f"Second square bracket: {x[0][0]}") 
print(f"Third square bracket: {x[0][0][0]}")

First square bracket:
tensor([[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]])
Second square bracket: tensor([1, 2, 3])
Third square bracket: 1


# : based indexing 

| Indexing        | What it means                              |
| --------------- | ------------------------------------------ |
| `:`             | All values in that dimension               |
| `0`             | Just the first item                        |
| `:` + `,` combo | Helps you slice across multiple dimensions |
| `x[:, 1]`       | All rows, column 1                         |
| `x[0, :]`       | Row 0, all columns                         |
| `x[:, :, 0]`    | All blocks, all rows, column 0             |


🧠 What : Can Do Alone
The colon : is shorthand for:
start:stop:step
Where:
start is the starting index (default = 0)
stop is the ending index (excluded)
step is how much to jump (default = 1)


x = torch.tensor([10, 20, 30, 40, 50, 60])


| Code      | Meaning                          | Output                     |
| --------- | -------------------------------- | -------------------------- |
| `x[:]`    | All elements                     | `[10, 20, 30, 40, 50, 60]` |
| `x[::2]`  | Every 2nd element (step = 2)     | `[10, 30, 50]`             |
| `x[1:]`   | From index 1 to end              | `[20, 30, 40, 50, 60]`     |
| `x[:4]`   | From start to index 4 (excluded) | `[10, 20, 30, 40]`         |
| `x[::-1]` | Reverse the tensor               | `[60, 50, 40, 30, 20, 10]` |


# Advanced Tensor Operations

## Broadcasting
🔑 What is Broadcasting?

Broadcasting allows tensors with different shapes to be used together in arithmetic operations (like +, -, *, /) without explicitly reshaping them.

| A.shape   | B.shape   | A op B Valid? | Result shape |
| --------- | --------- | ------------- | ------------ |
| (5, 3)    | (1, 3)    | ✅             | (5, 3)       |
| (3,)      | (5, 3)    | ✅             | (5, 3)       |
| (1, 4, 3) | (7, 1, 3) | ✅             | (7, 4, 3)    |
| (5, 3, 2) | (3, 2)    | ✅             | (5, 3, 2)    |
| (3, 1, 4) | (2, 4)    | ❌             | ❌            |

## Rule Summary:
####  Align shapes from the rightmost dimension.
####  Dimensions must be either equal or 1.
####  If one is 1, it is broadcast (repeated) to match the other.


In [50]:
a = torch.rand(4, 3, 2)   # Shape: (4, 3, 2)
b = torch.rand(1, 3, 1)   # Shape: (1, 3, 1)

result = a + b
print(result.shape)  # ➜ (4, 3, 2)

# Why?
# a: (4, 3, 2)
# b: (1, 3, 1) → can stretch to match (4, 3, 2)
# Insight: Singleton dimensions are "stretchable" in broadcasting.

torch.Size([4, 3, 2])


In [54]:
a = torch.rand(4, 3, 2)
b = torch.rand(4, 2, 3)

# a + b  # RuntimeError!


# Why? Shapes from right:
# (2) vs (3) → mismatch
# (3) vs (2) → mismatch
# No rule allows this.

#### 🎯 Example: Apply bias in multi-head attention

In [56]:
# Input from attention
x = torch.rand(32, 8, 128)  # (batch, heads, dim)

# Learnable bias for each head
bias = torch.rand(8, 128)   # (heads, dim)

# Automatically broadcast along batch dimension
x = x + bias  # Result: (32, 8, 128)
x.shape

torch.Size([32, 8, 128])

#### ✨ 🔸 Level 3: System-Level + Manual Broadcasting
🧠 How PyTorch Broadcasting Actually Works (Under the Hood)

Step-by-step rules:

Given two shapes A and B:

- Right-align dimensions.
- For each dim (right to left):
    - If equal, OK.
    - If one is 1, expand it.
    - Else, Error.

PyTorch expands the smaller tensor without copying data.


#### 🔧 Manually Forcing Broadcasting (Power user)

In [None]:
# .unsqueeze() to align dimensions

x = torch.rand(64, 10)     # (64, 10)
w = torch.rand(10)         # (10,) → needs to be (1, 10)

# Manual broadcast
p = x + w.unsqueeze(0)         # (64, 10)

p.shape


torch.Size([64, 10])

### 🔧 expand() vs repeat() — Key Differences

| Feature           | `expand()`                   | `repeat()`                       |
| ----------------- | ---------------------------- | -------------------------------- |
| Copies memory?    | ❌ No (view-based)            | ✅ Yes (real copy)                |
| Speed             | ⚡ Fast                       | 🐢 Slower                        |
| Memory usage      | 🧠 Efficient                 | 💾 Uses more memory              |
| Use case          | For safe broadcasting        | For full tiling (repeat content) |
| Shape restriction | Only works if dimension is 1 | Works on any shape               |


In [60]:
# ✅ expand() — stretch without copying
# Use when:
# You want to simulate broadcasting manually.


x = torch.tensor([[1], [2], [3]])  # Shape: (3, 1)

x_exp = x.expand(3, 4)            # Expand to (3, 4)
print(x_exp)


tensor([[1, 1, 1, 1],
        [2, 2, 2, 2],
        [3, 3, 3, 3]])


In [None]:
# 🧠 Rule:
# Only expands along dimensions where original size is 1.
# The new size must be equal or greater, and the old size must be 1 or same.


x = torch.rand(2, 1, 3)
x.expand(2, 4, 3)  # ✅ works (1 → 4)
x.expand(3, 4, 3)  # ❌ fails (2 ≠ 3)


In [61]:
# ✅ repeat() — full content duplication
# Use when:
# You want to tile the content — actually make a bigger tensor with copied data.

x = torch.tensor([[1], [2], [3]])  # (3, 1)

x_rep = x.repeat(1, 4)             # Repeat 4 times along columns
print(x_rep)


tensor([[1, 1, 1, 1],
        [2, 2, 2, 2],
        [3, 3, 3, 3]])


In [None]:
x = torch.tensor([1, 2, 3])  # Shape: (3,)
x.repeat(2)                  # Repeats entire tensor 2 times → (6,)
x

tensor([1, 2, 3])

In [66]:
x = torch.tensor([[1, 2], [3, 4]])  # (2, 2)
x.repeat(2, 3)                      # Repeat rows 2× and columns 3× → (4, 6)


tensor([[1, 2, 1, 2, 1, 2],
        [3, 4, 3, 4, 3, 4],
        [1, 2, 1, 2, 1, 2],
        [3, 4, 3, 4, 3, 4]])

In [81]:
x = torch.tensor([[1], [2], [3]])  # Shape: (3,1)
# Make it (3, 4, 2) using unsqueeze + expand

y = x.unsqueeze(-1)
y.shape

torch.Size([3, 1, 1])

In [80]:
y = y.expand(3,4,2)
y.shape

torch.Size([3, 4, 2])

In [None]:
x = torch.tensor([[1], [2], [3]])
x.size()
torch.Size([3, 1])
x.expand(3, 4)
# tensor([[ 1,  1,  1,  1],
#         [ 2,  2,  2,  2],
#         [ 3,  3,  3,  3]])
x.expand(-1, 4)   # -1 means not changing the size of that dimension
# tensor([[ 1,  1,  1,  1],
#         [ 2,  2,  2,  2],
#         [ 3,  3,  3,  3]])

### 🔹 repeat_interleave() = element-level repetition

In [82]:
x = torch.tensor([1, 2, 3])
x_repint = x.repeat_interleave(2)
print(x_repint)


tensor([1, 1, 2, 2, 3, 3])



### 🧠 Key Intuition
- expand is like saying: "Hey PyTorch, just pretend it's bigger by repeating along axis — don't actually do it."
- repeat is: "Make actual copies in memory. I need a real big tensor now."
- repeat_interleave is: "Repeat individual elements, not the whole structure."
