# AdaGrad

In [1]:
import numpy as np

In [2]:
class AdaGradOptimizer:
  def __init__(self, learning_rate=0.01, epsilon=1e-8):
    self.learning_rate = learning_rate
    self.epsilon = epsilon
    self.G = None   # sum of squared gradients

  def update(self, params, grads):
    if self.G is None:
      self.G = np.zeros_like(params)

    self.G += grads ** 2
    adjusted_lr = self.learning_rate / (np.sqrt(self.G) + self.epsilon)
    params -= adjusted_lr * grads
    return params

In [3]:
# Example usage:
params = np.array([1.0, 2.0])  # Initial parameters
grads = np.array([0.1, 0.2])  # Example gradients

optimizer = AdaGradOptimizer(learning_rate=0.1)
params = optimizer.update(params, grads)
print("Updated parameters:", params)

Updated parameters: [0.90000001 1.9       ]


# AdaDelta

In [4]:
import numpy as np

class AdaDeltaOptimizer:
  def __init__(self, rho=0.95, epsilon=1e-6):
    self.rho = rho
    self.epsilon = epsilon
    self.E_g2 = None
    self.E_delta_theta2 = None

  def update(self, params, grads):
    if self.E_g2 is None:
      self.E_g2 = np.zeros_like(params)
      self.E_delta_theta2 = np.zeros_like(params)

    self.E_g2 = self.rho * self.E_g2 + (1-self.rho) * params ** 2
    delta_theta = -np.sqrt(self.E_delta_theta2 + self.epsilon)/np.sqrt(self.E_g2 + self.epsilon) * grads

    self.E_delta_theta2 = self.rho * self.E_delta_theta2 + (1-self.rho) * delta_theta ** 2

    params += delta_theta
    return params

In [5]:
# Example usage:
params = np.array([1.0, 2.0])  # Initial parameters
grads = np.array([0.1, 0.2])  # Example gradients

optimizer = AdaDeltaOptimizer(rho=0.95)
params = optimizer.update(params, grads)
print("Updated parameters:", params)

Updated parameters: [0.99955279 1.99955279]


# Exploring the PyTorch library in contrast to TensorFlow

In [6]:
import torch

points = torch.tensor([1.0, 2.0, 4.6, 8.9, 10.12])

In [7]:
points.shape

torch.Size([5])

In [8]:
points.storage()

  points.storage()


 1.0
 2.0
 4.599999904632568
 8.899999618530273
 10.119999885559082
[torch.storage.TypedStorage(dtype=torch.float32, device=cpu) of size 5]

In [9]:
# usage of storage_offset


base_tensor = torch.arange(10)
base_tensor

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [10]:
slice_tensor = base_tensor[3:]

In [11]:
# Check storage offsets
print("Base Tensor Storage Offset:", base_tensor.storage_offset())
print("Slice Tensor Storage Offset:", slice_tensor.storage_offset())

Base Tensor Storage Offset: 0
Slice Tensor Storage Offset: 3


In [12]:
# STRIDE

tensor = torch.tensor([[1, 2, 3],
                       [4, 5, 6]])

In [13]:
tensor.shape

torch.Size([2, 3])

In [14]:
tensor.stride()

(3, 1)

Stride (3, 1) means:
Moving along dimension 0 (rows) requires jumping 3 elements (because it's row-major storage).
Moving along dimension 1 (columns) requires jumping 1 element (adjacent elements in a row).


## PyTorch Modules

In [15]:
# torch.nn

import torch.nn as nn

class SimpleNN(nn.Module):
  def __init__(self):
    super(SimpleNN, self).__init__()
    self.fc1 = nn.Linear(4, 16)
    self.relu = nn.ReLU()
    self.fc2 = nn.Linear(16, 3)

  def forward(self, x):
    x = self.fc1(x)
    x = self.relu(x)
    x = self.fc2(x)
    return x

In [16]:
model = SimpleNN()
print(model)

SimpleNN(
  (fc1): Linear(in_features=4, out_features=16, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=16, out_features=3, bias=True)
)


In [18]:
# torch.optim

import torch.optim as optim

model = nn.Linear(10, 2)

lr = 0.01
opt = optim.SGD(model.parameters(), lr=lr)

criterion = nn.MSELoss()

x = torch.randn(5, 10)   # Batch of 5 samples, 10 features each
y = torch.randn(5, 2)    # Target values

predictions = model(x)

loss = criterion(predictions, y)

opt.zero_grad() # Reset previous gradients
loss.backward()
opt.step()

In [19]:
# torch.utils.data

In [20]:
# dataset
import torch
from torch.utils.data import Dataset

class SampleDataset(Dataset):
  def __init__(self):
    self.data = torch.arange(10)

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    return self.data[idx]


dataset = SampleDataset()

print(len(dataset))

print(dataset[3])

10
tensor(3)


In [22]:
# DataLoader

from torch.utils.data import DataLoader

dataloader = DataLoader(dataset, batch_size=3, shuffle=True)


for batch in dataloader:
  print(batch)

tensor([5, 3, 7])
tensor([8, 6, 2])
tensor([4, 1, 0])
tensor([9])


In [28]:
# TensorDataset

from torch.utils.data import TensorDataset

X = torch.arange(10).float().reshape(-1, 1)
y = (X > 5).float() # Labels

dataset = TensorDataset(X, y)

dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

for batch in dataloader:
  print(batch)

[tensor([[1.],
        [3.]]), tensor([[0.],
        [0.]])]
[tensor([[0.],
        [4.]]), tensor([[0.],
        [0.]])]
[tensor([[2.],
        [9.]]), tensor([[0.],
        [1.]])]
[tensor([[8.],
        [5.]]), tensor([[1.],
        [0.]])]
[tensor([[6.],
        [7.]]), tensor([[1.],
        [1.]])]


In [30]:
# subset

from torch.utils.data import Subset

subset = Subset(dataset, indices=[0, 2, 4])
dl = DataLoader(subset)

In [33]:
for x in dl:
  print(x)

[tensor([[0.]]), tensor([[0.]])]
[tensor([[2.]]), tensor([[0.]])]
[tensor([[4.]]), tensor([[0.]])]


In [35]:
# RandomSampler

from torch.utils.data import RandomSampler

sampler = RandomSampler(dataset, replacement=True, num_samples=5)
dataloader = DataLoader(dataset, batch_size=2, sampler=sampler)

for z in dataloader:
  print(z)

[tensor([[3.],
        [2.]]), tensor([[0.],
        [0.]])]
[tensor([[6.],
        [6.]]), tensor([[1.],
        [1.]])]
[tensor([[0.]]), tensor([[0.]])]
