In [2]:
import torch
from torch.utils.data import Dataset, DataLoader

import torch.nn as nn

In [5]:
size = (2, 3)
x = torch.randn(*size)  # Creates a tensor with normally distributed random values (mean=0, std=1)
print(x)

tensor([[ 0.2809,  0.2855,  0.1491],
        [ 1.6699, -0.0231,  0.1176]])


In [6]:
x_ones = torch.ones(*size)   # Tensor filled with ones
x_zeros = torch.zeros(*size) # Tensor filled with zeros
print("Ones tensor:", x_ones)
print("Zeros tensor:", x_zeros)

Ones tensor: tensor([[1., 1., 1.],
        [1., 1., 1.]])
Zeros tensor: tensor([[0., 0., 0.],
        [0., 0., 0.]])


In [7]:
L = [[1, 2, 3], [4, 5, 6]]
x = torch.tensor(L)  # Converts a nested list to a tensor
print(x)


tensor([[1, 2, 3],
        [4, 5, 6]])


In [8]:
y = x.clone()  # Creates a clone of tensor x
x[0, 0] = 999  # Modify x to show that y is unaffected
print("Original x:", x)
print("Cloned y:", y)

Original x: tensor([[999,   2,   3],
        [  4,   5,   6]])
Cloned y: tensor([[1, 2, 3],
        [4, 5, 6]])


In [10]:
x = torch.randn(3, 3, requires_grad=True)
print(x)
print("Before no_grad block, requires_grad:", x.requires_grad)

with torch.no_grad():
    x = x * 2
    print("Inside no_grad block, requires_grad:", x.requires_grad)

print(x)
print("After no_grad block, requires_grad:", x.requires_grad)

tensor([[-0.8734,  0.0622, -1.0020],
        [ 0.1589, -1.3428,  0.3411],
        [ 0.7378,  3.3918, -0.3597]], requires_grad=True)
Before no_grad block, requires_grad: True
Inside no_grad block, requires_grad: False
tensor([[-1.7468,  0.1244, -2.0041],
        [ 0.3179, -2.6857,  0.6823],
        [ 1.4756,  6.7835, -0.7195]])
After no_grad block, requires_grad: False


In [11]:
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)  # Track computation history
y = x ** 2  # y = [1, 4, 9]
z = y.sum()  # z = 1 + 4 + 9 = 14

# Compute gradients
z.backward()  # Computes dz/dx
print("Gradients of x:", x.grad)

Gradients of x: tensor([2., 4., 6.])


In [12]:
x = torch.randn(2, 3, 4)  # A tensor with shape (2, 3, 4)
print(x.size())           # Returns the shape of the tensor

torch.Size([2, 3, 4])


In [3]:
a = torch.randn(2, 3)
b = torch.randn(2, 3)
c = torch.cat((a, b), dim=0)  # Concatenates along dimension 0 (rows)
print(c.size())               # Output size: (4, 3)

torch.Size([4, 3])


In [5]:
x = torch.randn(2, 3, 4)
print(x)
y = x.transpose(1, 2)  # Swaps dimensions 1 and 2
print(y)
print(y.size())        # Output size: (2, 4, 3)

tensor([[[-0.7281,  1.3693, -0.9329, -1.7440],
         [ 1.9872, -0.1911,  0.1413, -0.6456],
         [-1.2565,  0.2459,  0.5283, -0.6760]],

        [[-1.0694, -0.0727, -1.2120, -1.0388],
         [-0.8967,  0.0956,  0.2473,  2.7595],
         [ 1.3144, -0.5794, -0.4290,  0.3744]]])
tensor([[[-0.7281,  1.9872, -1.2565],
         [ 1.3693, -0.1911,  0.2459],
         [-0.9329,  0.1413,  0.5283],
         [-1.7440, -0.6456, -0.6760]],

        [[-1.0694, -0.8967,  1.3144],
         [-0.0727,  0.0956, -0.5794],
         [-1.2120,  0.2473, -0.4290],
         [-1.0388,  2.7595,  0.3744]]])
torch.Size([2, 4, 3])


In [7]:
x = torch.randn(3, 4)
print(x)
y = x.unsqueeze(1)  # Adds a dimension at index 1
print(y)
print(y.size())     # Output size: (3, 1, 4)

tensor([[-0.0239, -0.4895, -0.9352,  0.2771],
        [-0.2216,  1.3820,  0.8290,  1.5302],
        [ 0.7351,  1.1295,  0.9572, -1.0014]])
tensor([[[-0.0239, -0.4895, -0.9352,  0.2771]],

        [[-0.2216,  1.3820,  0.8290,  1.5302]],

        [[ 0.7351,  1.1295,  0.9572, -1.0014]]])
torch.Size([3, 1, 4])


In [13]:
x = torch.randn(2, 1, 3, 1)
y = x.squeeze(dim=1)  # Removes the dimension at index 1 if it is of size 1
print(y.size())       # Output size: (2, 3, 1)

torch.Size([2, 3, 1])


In [14]:
A = torch.tensor([[1, 2, 3], [4, 5, 6]])   # Shape (2, 3)
B = torch.tensor([[7, 8], [9, 10], [11, 12]])  # Shape (3, 2)
ret = A.mm(B)  # Matrix multiplication, resulting shape is (2, 2)
print(ret)

tensor([[ 58,  64],
        [139, 154]])


In [15]:
A = torch.tensor([[1, 2, 3], [4, 5, 6]])  # Shape (2, 3)
x = torch.tensor([7, 8, 9])              # Shape (3,)
ret = A.mv(x)  # Matrix-vector multiplication, resulting shape is (2,)
print(ret)

tensor([ 50, 122])


In [16]:
x = torch.tensor([[1, 2, 3], [4, 5, 6]])  # Shape (2, 3)
x_t = x.t()  # Transpose of x, resulting shape is (3, 2)
print(x_t)

tensor([[1, 4],
        [2, 5],
        [3, 6]])


In [17]:
cuda_available = torch.cuda.is_available()
print("CUDA Available:", cuda_available)

CUDA Available: False


In [18]:
x = torch.randn(3, 3)  # Tensor on CPU
if torch.cuda.is_available():
    x = x.cuda()  # Moves the tensor to GPU
    print("Tensor on GPU:", x.device)

In [20]:
import argparse

args = argparse.Namespace(disable_cuda=False)
# Use GPU if available and not disabled, else use CPU
if not args.disable_cuda and torch.cuda.is_available():
    args.device = torch.device('cuda')
else:
    args.device = torch.device('cpu')

print("Using device:", args.device)

Using device: cpu


In [23]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Create a tensor and move it to the appropriate device
x = torch.randn(4, 4).to(device)
print("Tensor is on device:", x.device)

# Define a simple model and move it to the appropriate device
model = torch.nn.Linear(4, 2).to(device)
output = model(x)
print("Output:", output)

Tensor is on device: cpu
Output: tensor([[ 0.0555,  0.0866],
        [-0.7701, -0.7741],
        [-0.3680, -0.7874],
        [ 0.6082, -0.1511]], grad_fn=<AddmmBackward0>)


In [24]:
# Define a linear layer from 4 input units to 2 output units
linear_layer = nn.Linear(4, 2)
x = torch.randn(1, 4)  # Input tensor of shape (batch_size, input_features)
output = linear_layer(x)
print("Output:", output)

Output: tensor([[-0.8256, -0.8796]], grad_fn=<AddmmBackward0>)


In [32]:
## commonly used loss functions

# L1Loss 
# L1Loss computes the mean absolute error (L1 norm) between predictions and targets.
criterion = nn.L1Loss()
pred = torch.tensor([0.0, 0.5, 1.0])
target = torch.tensor([0.0, 1.0, 0.0])
loss = criterion(pred, target)
print("L1 Loss:", loss.item())


# MSELoss 
# computes the mean squared error (L2 norm), commonly used for regression tasks
criterion = nn.MSELoss()
pred = torch.tensor([0.0, 0.5, 1.0])
target = torch.tensor([0.0, 1.0, 0.0])
loss = criterion(pred, target)
print("MSE Loss:", loss.item())


# CrossEntropyLoss
# Used for multi-class classification. It combines log_softmax and nll_loss in one function.
criterion = nn.CrossEntropyLoss()
pred = torch.tensor([[2.0, 1.0, 0.1]])  # Raw logits
target = torch.tensor([0])  # Class index
loss = criterion(pred, target)
print("Cross Entropy Loss:", loss.item())


# CTCLoss
# CTC loss is used for sequence-to-sequence problems where the output sequence length is shorter than the input sequence length, like speech to text.


# NLLLoss
# expects log-probabilities as input. It’s often used with log_softmax.
criterion = nn.NLLLoss()
log_probs = torch.log_softmax(torch.tensor([[2.0, 1.0, 0.1]]), dim=1)
target = torch.tensor([0])
loss = criterion(log_probs, target)
print("NLL Loss:", loss.item())

# PoissonNLLLoss
# Poisson Negative Log-Likelihood loss is used when the target is a count, and we want to model the Poisson distribution.
criterion = nn.PoissonNLLLoss(log_input=True)
input = torch.tensor([0.8, 0.9, 1.2, 1.5]).log()  # Log of the predicted rate
target = torch.tensor([1.0, 1.0, 1.0, 1.0])  # Target count
loss = criterion(input, target)
print("Poisson NLL Loss:", loss.item())


# KLDivLoss
# Kullback-Leibler divergence loss is used for measuring the difference between two probability distributions.
criterion = nn.KLDivLoss(reduction='batchmean')
input = torch.log(torch.tensor([0.2, 0.5, 0.3]))  # Log of predicted probabilities
target = torch.tensor([0.1, 0.6, 0.3])  # Target probabilities
loss = criterion(input, target)
print("KL Divergence Loss:", loss.item())


# BCELoss
# Used for binary classification problems. The input should be probabilities between 0 and 1.
criterion = nn.BCELoss()
pred = torch.tensor([0.8, 0.2, 0.1])
target = torch.tensor([1.0, 0.0, 0.0])
loss = criterion(pred, target)
print("BCE Loss:", loss.item())

# BCEWithLogitsLoss
# A numerically stable version of BCELoss that applies a sigmoid function internally.
criterion = nn.BCEWithLogitsLoss()
logits = torch.tensor([1.5, -1.0, -0.5])
target = torch.tensor([1.0, 0.0, 0.0])
loss = criterion(logits, target)
print("BCEWithLogits Loss:", loss.item())


# MarginRankingLoss
# This loss is used for comparing pairs of inputs and measuring the relative ranking.
criterion = nn.MarginRankingLoss(margin=1.0)
input1 = torch.tensor([0.2, 0.5, 1.0])
input2 = torch.tensor([0.3, 0.4, 0.9])
target = torch.tensor([1.0, -1.0, 1.0])  # 1 means input1 > input2, -1 means input1 < input2
loss = criterion(input1, input2, target)
print("Margin Ranking Loss:", loss.item())


# HingeEmbeddingLoss
# This loss is used for metric learning, often in tasks such as similarity-based ranking.
criterion = nn.HingeEmbeddingLoss(margin=1.0)
input = torch.tensor([0.6, -0.8, 1.0])
target = torch.tensor([1.0, -1.0, 1.0])  # 1.0 if similar, -1.0 if dissimilar
loss = criterion(input, target)
print("Hinge Embedding Loss:", loss.item())


# MultiLabelMarginLoss
# Used for multi-class, multi-label classification where labels are unordered.
criterion = nn.MultiLabelMarginLoss()
input = torch.tensor([[0.5, 1.0, -0.2], [0.1, -0.5, 0.3]])
target = torch.tensor([[0, 1, 0], [0, 0, 1]])  # Multi-label target
loss = criterion(input, target)
print("MultiLabel Margin Loss:", loss.item())


# SmoothL1Loss
# A combination of L1 and L2 loss, less sensitive to outliers than MSELoss.
criterion = nn.SmoothL1Loss()
pred = torch.tensor([0.0, 0.5, 1.0])
target = torch.tensor([0.0, 1.0, 0.0])
loss = criterion(pred, target)
print("Smooth L1 Loss:", loss.item())


# SoftMarginLoss
# SoftMarginLoss is used for binary classification with logistic regression.
criterion = nn.SoftMarginLoss()
input = torch.tensor([0.6, -0.4, 0.8])
target = torch.tensor([1.0, -1.0, 1.0])  # Target class labels (1 or -1)
loss = criterion(input, target)
print("Soft Margin Loss:", loss.item())


# MultiLabelSoftMarginLoss
# Similar to SoftMarginLoss but for multi-label classification tasks.
criterion = nn.MultiLabelSoftMarginLoss()
input = torch.tensor([[0.6, -0.4], [0.2, 0.7]])
target = torch.tensor([[1.0, 0.0], [0.0, 1.0]])  # Multi-label targets
loss = criterion(input, target)
print("MultiLabel Soft Margin Loss:", loss.item())


# CosineEmbeddingLoss
# Used to measure the cosine similarity between two tensors.



# MultiMarginLoss
# Used for multi-class classification with a margin to control misclassification.
criterion = nn.MultiMarginLoss()
input = torch.tensor([[0.5, -0.8, 1.0], [0.2, 0.5, -0.4]])
target = torch.tensor([2, 1])  # Class indices
loss = criterion(input, target)
print("Multi Margin Loss:", loss.item())


# TripletMarginLoss
# used for metric learning. It encourages the distance between an anchor and a positive example to be smaller than the distance between the anchor and a negative example by a margin.
criterion = nn.TripletMarginLoss(margin=1.0)
anchor = torch.tensor([1.0, 2.0])
positive = torch.tensor([1.1, 2.1])
negative = torch.tensor([3.0, 4.0])
loss = criterion(anchor, positive, negative)
print("Triplet Margin Loss:", loss.item())

L1 Loss: 0.5
MSE Loss: 0.4166666567325592
Cross Entropy Loss: 0.4170299470424652
NLL Loss: 0.4170299470424652
Poisson NLL Loss: 1.0351793766021729
KL Divergence Loss: 0.013359417207539082
BCE Loss: 0.18388253450393677
BCEWithLogits Loss: 0.329584002494812
Margin Ranking Loss: 1.0333333015441895
Hinge Embedding Loss: 1.1333333253860474
MultiLabel Margin Loss: 0.7999999523162842
Smooth L1 Loss: 0.2083333283662796
Soft Margin Loss: 0.4405346214771271
MultiLabel Soft Margin Loss: 0.5379570722579956
Multi Margin Loss: 0.21666666865348816
Triplet Margin Loss: 0.0


In [33]:
# common activation functions

# ReLU
#The ReLU (Rectified Linear Unit) activation function replaces all negative values in the input tensor with zero, while leaving positive values unchanged:
# f(x) = max(0, x).
relu = nn.ReLU()
x = torch.tensor([-1.0, 0.0, 1.0, 2.0])
output = relu(x)
print("ReLU Output:", output)

# ELU (Exponential Linear Unit) introduces negative values for inputs less than zero, making it smoother than ReLU
elu = nn.ELU(alpha=1.0)
x = torch.tensor([-1.0, 0.0, 1.0])
output = elu(x)
print("ELU Output:", output)

# LeakyReLU allows a small, non-zero gradient when the input is negative:
# f(x) = x if x > 0, negative_slope * x if x <= 0.
leaky_relu = nn.LeakyReLU(negative_slope=0.01)
x = torch.tensor([-1.0, 0.0, 1.0, 2.0])
output = leaky_relu(x)
print("Leaky ReLU Output:", output)

# GELU (Gaussian Error Linear Unit) is a smooth, differentiable version of ReLU, commonly used in transformer models.
gelu = nn.GELU()
x = torch.tensor([-1.0, 0.0, 1.0])
output = gelu(x)
print("GELU Output:", output)

# Sigmoid squashes the input to a range between 0 and 1, making it useful for binary classification.
sigmoid = nn.Sigmoid()
x = torch.tensor([-1.0, 0.0, 1.0])
output = sigmoid(x)
print("Sigmoid Output:", output)

# Tanh squashes the input to a range between -1 and 1, often used in hidden layers of neural networks.
tanh = nn.Tanh()
x = torch.tensor([-1.0, 0.0, 1.0])
output = tanh(x)
print("Tanh Output:", output)

ReLU Output: tensor([0., 0., 1., 2.])
ELU Output: tensor([-0.6321,  0.0000,  1.0000])
Leaky ReLU Output: tensor([-0.0100,  0.0000,  1.0000,  2.0000])
GELU Output: tensor([-0.1587,  0.0000,  0.8413])
Sigmoid Output: tensor([0.2689, 0.5000, 0.7311])
Tanh Output: tensor([-0.7616,  0.0000,  0.7616])


In [36]:
# Optimizers

import torch
import torch.optim as optim

# Create a model (simple example)
model = torch.nn.Linear(10, 2)

# Example with Adam optimizer
opt = optim.Adam(model.parameters(), lr=0.001)  # Adam optimizer with learning rate 0.001
opt.step()  # Update model weights
opt.zero_grad()  # Clear gradients after each step

# Example with SGD (Stochastic Gradient Descent) optimizer
opt = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)  # SGD with momentum
opt.step()  # Update model weights
opt.zero_grad()  # Clear gradients

# SGD: Stochastic Gradient Descent, can have momentum to help accelerate convergence.
# Adam: A combination of momentum and adaptive learning rates. AdamW is a variant that includes weight decay.
# Adagrad, Adadelta: Adagrad adapts the learning rate for each parameter. Adadelta is an improved version that reduces the need to set a learning rate manually.
# RMSprop: RMSprop is an adaptive learning rate optimizer, often used for training RNNs.
# NAdam: A combination of Adam and Nesterov momentum.
# RAdam: A rectified version of Adam that fixes some of Adam's issues with adaptive learning rates.
# LBFGS: Limited-memory Broyden–Fletcher–Goldfarb–Shanno, used for small datasets or when high precision is required.

In [38]:
import torch.optim as optim

# Create a simple model
model = torch.nn.Linear(10, 2)

# Example optimizer (Adam)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# LambdaLR - Custom learning rate schedule based on a lambda function
scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda epoch: 0.95 ** epoch)
scheduler.step()  # Update the learning rate after optimizer step

# LambdaLR: The learning rate is updated according to a lambda function. You can customize this function to create a specific learning rate schedule.
# MultiplicativeLR: The learning rate is updated by multiplying it with a factor defined in lr_lambda.
# StepLR: The learning rate is decayed by a fixed factor (gamma) after a certain number of epochs (step_size).
# MultiStepLR: Similar to StepLR, but allows you to specify exact epoch milestones where the learning rate should decay.
# ExponentialLR: The learning rate is updated by a factor of gamma at each epoch.
# CosineAnnealingLR: The learning rate decreases following a cosine curve, reaching a minimum value (eta_min).
# ReduceLROnPlateau: It reduces the learning rate when a specific metric (like validation loss) stops improving.
# CyclicLR: The learning rate cycles between a base_lr and max_lr over a number of steps, which can help escape local minima.
# OneCycleLR: This scheduler is a variant of CyclicLR, where the learning rate first increases to max_lr and then decreases back down, following a specific "one cycle" pattern.
# CosineAnnealingWarmRestarts: It allows the learning rate to restart after a specified number of epochs, with each restart followed by a cosine annealing pattern.

In [39]:
import torch
from torch.utils.data import TensorDataset

# Create some dummy data and labels
data = torch.randn(100, 3)  # 100 samples, each with 3 features
labels = torch.randint(0, 2, (100,))  # 100 labels (binary classification)

# Create TensorDataset
dataset = TensorDataset(data, labels)

# Access a sample from the dataset
sample = dataset[0]  # Returns a tuple (data[0], labels[0])
print(sample)

(tensor([ 0.2998,  0.1806, -0.0758]), tensor(0))


In [40]:
from torch.utils.data import ConcatDataset, TensorDataset
import torch

# Create two simple datasets
data1 = torch.randn(5, 3)  # 5 samples, 3 features
labels1 = torch.randint(0, 2, (5,))
dataset1 = TensorDataset(data1, labels1)

data2 = torch.randn(3, 3)  # 3 samples, 3 features
labels2 = torch.randint(0, 2, (3,))
dataset2 = TensorDataset(data2, labels2)

# Concatenate the datasets
combined_dataset = ConcatDataset([dataset1, dataset2])

# Access a sample from the concatenated dataset
sample = combined_dataset[6]  # Returns the 7th sample (index 6) from the concatenated dataset
print(sample)

(tensor([-1.2187,  0.8016,  0.8722]), tensor(1))


In [41]:
from torch.utils.data import DataLoader, TensorDataset
import torch

# Create some dummy data
data = torch.randn(10, 3)  # 10 samples, 3 features
labels = torch.randint(0, 2, (10,))  # 10 labels for binary classification
dataset = TensorDataset(data, labels)

# Create a DataLoader
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Iterating over the DataLoader
for batch_data, batch_labels in dataloader:
    print(batch_data, batch_labels)

tensor([[ 0.1131, -1.7908,  0.4184],
        [ 0.3694,  1.7906, -1.6865]]) tensor([0, 1])
tensor([[-2.6673, -1.5842, -0.0741],
        [ 0.5427, -0.3060, -1.6506]]) tensor([0, 0])
tensor([[ 0.2763, -0.2456, -0.5315],
        [ 0.5851, -0.6923,  0.2732]]) tensor([0, 1])
tensor([[ 0.2269,  0.2125, -1.6735],
        [ 0.6308,  0.7755,  0.0099]]) tensor([0, 1])
tensor([[-0.5852, -1.4539,  0.5168],
        [ 0.1615,  0.7091, -0.1030]]) tensor([0, 1])
