# Sklearn

In [39]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import add_dummy_feature

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [49]:
# Data
#
# NOTE: X and y are assigned twice in this cell. They first hold the synthetic
# regression arrays and are then rebound to the iris features/targets, which
# is what they refer to once the cell has finished.

# Continuous: y = 4 + 3x + noise, for m random x values scaled by 2
np.random.seed(42)
m = 100
X = np.random.rand(m, 1) * 2
y = 3 * X + 4 + np.random.randn(m, 1)
X_b = add_dummy_feature(X)  # synthetic X passed through add_dummy_feature
X_new = np.array([[0], [2]])  # the two x values the regressors predict on

# Categorical: two petal measurements from the iris data set
iris = load_iris(as_frame=True)
X = iris.data[["petal length (cm)", "petal width (cm)"]].to_numpy()
y_bin = iris.target == 2  # boolean mask: True where the class index is 2
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
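# The usual scikit-learn pattern is:
# model = Model()                # construct the estimator
# model.fit(X_train, y_train)    # learn from the training split
# model.predict(X_train)         # predict (pass X_test instead to evaluate on held-out data)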

In [47]:
# Linear models on the synthetic 1-D regression problem.
#
# The data cell rebinds X and y to the iris features/targets, so fitting on
# them here would train on two features and then fail in predict(), because
# X_new has a single column. Rebuild the regression arrays under their own
# names instead (same seed and draw order as the data cell, so the values
# are identical to the ones generated there).
np.random.seed(42)
X_reg = 2 * np.random.rand(m, 1)
y_reg = 4 + 3 * X_reg + np.random.randn(m, 1)

# Ordinary least squares
lin_reg = LinearRegression()
lin_reg.fit(X_reg, y_reg)
pred_lin = lin_reg.predict(X_new)

# L2-penalised regression
ridge_reg = Ridge(alpha=100, solver="cholesky")
ridge_reg.fit(X_reg, y_reg)
pred_ridge = ridge_reg.predict(X_new)

# L1-penalised regression; alpha is scaled by the number of samples
lasso_reg = Lasso(alpha=100/(2*len(X_reg)))
lasso_reg.fit(X_reg, y_reg)
pred_lasso = lasso_reg.predict(X_new)

In [65]:
# Logistic regression on the three iris classes (multi-class targets are accepted).
# The model is fitted and scored on the same X, so the matrix below reflects
# training performance, not held-out performance.
log_reg = LogisticRegression(random_state=42).fit(X, y)
y_pred = log_reg.predict(X)
cm = confusion_matrix(y_true=y, y_pred=y_pred)
cm

array([[50,  0,  0],
       [ 0, 47,  3],
       [ 0,  2, 48]])

In [67]:
# Linear SVM behind a standardisation step (linear kernel; SVC offers an rbf kernel).
# model_loss = C*classification_loss + penalty, so a lower C means stronger regularization.
scaler = StandardScaler()
svm_clf1 = LinearSVC(
    C=1,
    max_iter=10_000,
    dual=True,
    random_state=42,
)
scaled_svm_clf1 = make_pipeline(scaler, svm_clf1)
scaled_svm_clf1.fit(X, y)  # multi-class targets are accepted
scaled_svm_clf1.predict(X)  # predictions for the same rows the pipeline was fitted on

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [57]:
# Depth-2 decision tree, fitted and then queried on the same X.
tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42).fit(X, y)
tree_clf.predict(X)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2,
       2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

# NN Skeleton

In [1]:
import numpy as np
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data

## Answer

In [140]:
class XORDataset(data.Dataset):
    """Noisy two-dimensional XOR data.

    Each sample starts as a pair of 0/1 values; its label is 1 when exactly
    one of the two values is 1. Gaussian noise scaled by ``std`` is then added
    to the inputs.

    Attributes:
        size: number of samples.
        std: scale of the noise added to the inputs.
        data: float32 tensor of shape (size, 2) holding the noisy inputs.
        label: long tensor of shape (size,) holding the 0/1 labels.
    """

    def __init__(self, size, std=0.1):
        super().__init__()
        self.size = size
        self.std = std
        self.generate_continuous_xor()

    def generate_continuous_xor(self):
        """Draw the binary inputs, derive the labels, then perturb the inputs."""
        corners = torch.randint(low=0, high=2, size=(self.size, 2), dtype=torch.float32)
        # Labels come from the clean 0/1 values, before any noise is added.
        exactly_one_set = corners.sum(dim=1) == 1
        noise = torch.randn(corners.shape)

        self.data = corners + self.std * noise
        self.label = exactly_one_set.to(torch.long)

    def __len__(self):
        return self.size

    def __getitem__(self, idx):
        """Return the ``(input, label)`` pair stored at ``idx``."""
        return self.data[idx], self.label[idx]

In [5]:
class SimpleClassifier(nn.Module):
    """Two-layer network: Linear -> activation -> Linear.

    The output of the second linear layer is returned as-is (no final
    activation is applied in ``forward``).

    Args:
        num_inputs: number of input features.
        num_hidden: width of the hidden layer.
        num_outputs: number of output units.
        act_fn: module applied between the two linear layers.
    """

    def __init__(self, num_inputs, num_hidden, num_outputs, act_fn = nn.Tanh()):
        super().__init__()
        self.linear1 = nn.Linear(in_features=num_inputs, out_features=num_hidden)
        self.act_fn = act_fn
        self.linear2 = nn.Linear(in_features=num_hidden, out_features=num_outputs)

    # For a GLM, keep a single linear layer and drop the activation.
    def forward(self, x):
        hidden = self.act_fn(self.linear1(x))
        return self.linear2(hidden)

In [7]:
def train_model(model, optimizer, data_loader, loss_module, num_epochs=100):
    """Train ``model`` for ``num_epochs`` passes over ``data_loader``.

    Predictions are flattened to one dimension and labels are cast to float
    before the loss is computed, so this loop matches a single-output model
    with a loss such as ``nn.BCEWithLogitsLoss``.

    Args:
        model: module to train; must have at least one parameter.
        optimizer: optimizer that updates ``model``'s parameters.
        data_loader: iterable yielding ``(inputs, labels)`` batches.
        loss_module: callable ``loss_module(preds, labels)`` returning the loss.
        num_epochs: number of passes over ``data_loader``.
    """
    # Follow the device the model already lives on rather than relying on a
    # notebook-level ``device`` variable that may be undefined or stale.
    model_device = next(model.parameters()).device

    model.train()
    for epoch in tqdm(range(num_epochs)):
        for data_inputs, data_labels in data_loader:
            data_inputs = data_inputs.to(model_device)
            data_labels = data_labels.to(model_device)
            preds = model(data_inputs)
            preds = preds.reshape(-1,)  # (batch, 1) -> (batch,) to line up with the labels
            loss = loss_module(preds, data_labels.float())
            # Clear old gradients before computing the new ones.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

In [9]:
def eval_model(model, data_loader):
    """Print the binary classification accuracy of ``model`` on ``data_loader``.

    Model outputs are treated as logits: a sample is predicted as class 1 when
    the sigmoid of its output exceeds 0.5. Nothing is returned; the accuracy
    is printed as a percentage.

    Args:
        model: module to evaluate; must have at least one parameter.
        data_loader: iterable yielding ``(inputs, labels)`` batches.

    Raises:
        ZeroDivisionError: if ``data_loader`` yields no batches.
    """
    model.eval()
    # Follow the device the model already lives on rather than relying on a
    # notebook-level ``device`` variable that may be undefined or stale.
    model_device = next(model.parameters()).device
    true_preds, num_preds = 0., 0.

    with torch.no_grad():  # no gradients are needed for evaluation
        for data_inputs, data_labels in data_loader:
            data_inputs, data_labels = data_inputs.to(model_device), data_labels.to(model_device)
            preds = model(data_inputs)
            pred_labels = (torch.sigmoid(preds) > 0.5).int().reshape(-1)
            true_preds += (pred_labels == data_labels).sum().float()
            num_preds += data_labels.shape[0]

    acc = true_preds / num_preds
    print(f"Accuracy of the model: {100.0*acc:4.2f}%")

In [11]:
# Prefer Apple's MPS backend when it is usable, otherwise fall back to the CPU.
# torch.backends.mps.is_available() is the documented availability check;
# NOTE(review): torch.mps.is_available() appears to exist only in recent
# PyTorch releases - confirm the minimum version this notebook targets.
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
print("Device", device)
def set_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.mps.manual_seed(seed)

Device mps


In [25]:
set_seed(1)

# Data: 2500 training samples (reshuffled every epoch) and 500 test samples.
train_dataset = XORDataset(size=2500)
train_data_loader = data.DataLoader(
    train_dataset,
    batch_size=128,
    shuffle=True,
)
test_dataset = XORDataset(size=500)
test_data_loader = data.DataLoader(
    test_dataset,
    batch_size=128,
    shuffle=False,
    drop_last=False,
)

# Model: a single output unit, used as the logit for the binary loss below.
model = SimpleClassifier(num_inputs=2, num_hidden=4, num_outputs=1)
model.to(device)

# Use MSE here for linear regression.
# Use CrossEntropyLoss for multi-category problems, and remember to set
# num_outputs to the number of classes in that case.
loss_module = nn.BCEWithLogitsLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
train_model(model, optimizer, train_data_loader, loss_module)

# Save model: the state dict holds every learned tensor.
state_dict = model.state_dict()
print(state_dict)
# To persist it and load it into a fresh model:
# torch.save(state_dict, "our_model.tar")
# state_dict = torch.load("our_model.tar")
# new_model = SimpleClassifier(num_inputs=2, num_hidden=4, num_outputs=1)
# new_model.load_state_dict(state_dict)

eval_model(model, test_data_loader)

  0%|          | 0/100 [00:00<?, ?it/s]

OrderedDict({'linear1.weight': tensor([[-0.6186, -1.2413],
        [ 2.4791,  2.3084],
        [-1.9578,  2.8000],
        [ 3.2432, -2.6608]], device='mps:0'), 'linear1.bias': tensor([ 1.3746, -0.7270,  0.7655,  1.2364], device='mps:0'), 'linear2.weight': tensor([[ 2.0459,  3.4998, -3.6495, -4.2671]], device='mps:0'), 'linear2.bias': tensor([0.8146], device='mps:0')})
Accuracy of the model: 100.00%


  nonzero_finite_vals = torch.masked_select(


In [None]:
# define dataset
# define dataloaders
# define model
# define device and set seed
# define optimizer
# define loss module
# model.to(device)
# model.train()
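# for each epoch:
#     for data_input, data_label in data_loader:
#         predict: pred = model(data_input)
#         loss: loss = loss_module(pred, data_label)
#         zero the gradients: optimizer.zero_grad()
#         backpropagate: loss.backward()
#         update the weights: optimizer.step()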

## Exercise

In [50]:
class XORDataset(data.Dataset):
    """Noisy XOR data: inputs are 0/1 pairs plus Gaussian noise scaled by ``std``.

    Attributes:
        size: number of samples.
        data: float32 tensor of shape (size, 2) holding the noisy inputs.
        targets: float32 tensor of shape (size,), 1.0 where exactly one of the
            two clean inputs is 1 and 0.0 otherwise.
    """

    def __init__(self, size, std = 0.1):
        super().__init__()
        self.size = size
        corners = torch.randint(2, size = (size, 2), dtype = torch.float32)
        # Targets are computed from the clean corners, before noise is added.
        self.targets = (corners.sum(axis = 1) == 1).to(torch.float32)
        self.data = corners + std * torch.randn(corners.shape)

    def __len__(self):
        return self.size

    def __getitem__(self, index):
        """Return the ``(input, target)`` pair stored at ``index``."""
        sample = self.data[index]
        target = self.targets[index]
        return sample, target

In [52]:
class SimpleClassifier(nn.Module):
    """Linear -> activation -> Linear network that returns the second layer's raw output.

    Args:
        num_inputs: number of input features.
        num_hidden: width of the hidden layer.
        num_classes: number of output units.
        act_fn: module applied between the two linear layers.
    """

    def __init__(self, num_inputs, num_hidden, num_classes, act_fn = nn.Tanh()):
        super().__init__()
        self.linear1 = nn.Linear(in_features=num_inputs, out_features=num_hidden)
        self.act_fn = act_fn
        self.linear2 = nn.Linear(in_features=num_hidden, out_features=num_classes)

    def forward(self, x):
        return self.linear2(self.act_fn(self.linear1(x)))

In [54]:
def set_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.mps.manual_seed(seed)

In [78]:
# Exercise setup: seed, data, model, device, loss and optimizer.
epochs = 100
set_seed(42)
train_dataset = XORDataset(2500)
train_data_loader = data.DataLoader(train_dataset, batch_size = 128, shuffle = True)
model = SimpleClassifier(2, 4, 1)
# torch.backends.mps.is_available() is the documented availability check;
# NOTE(review): torch.mps.is_available() appears to exist only in recent
# PyTorch releases - confirm the minimum version this notebook targets.
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
print(device)
loss_fn = nn.BCEWithLogitsLoss()
model.to(device)
optimizer = torch.optim.SGD(model.parameters(), lr = 0.1)
# Last expression of the cell, so the notebook also displays the module.
model.train()

mps


SimpleClassifier(
  (linear1): Linear(in_features=2, out_features=4, bias=True)
  (act_fn): Tanh()
  (linear2): Linear(in_features=4, out_features=1, bias=True)
)

In [204]:
# Exercise: full training run on the noisy XOR data, printing the training
# accuracy (in percent, rounded to two decimals) after every epoch.
epochs = 100
set_seed(42)
train_dataset = XORDataset(2500)
train_data_loader = data.DataLoader(train_dataset, batch_size = 128, shuffle = True)
model = SimpleClassifier(2, 4, 1)
# torch.backends.mps.is_available() is the documented availability check;
# NOTE(review): torch.mps.is_available() appears to exist only in recent
# PyTorch releases - confirm the minimum version this notebook targets.
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
print(device)
loss_fn = nn.BCEWithLogitsLoss()
model.to(device)
optimizer = torch.optim.SGD(model.parameters(), lr = 0.1)
model.train()
for epoch in range(epochs):
    # Running totals for this epoch's accuracy.
    sum_correct = 0
    sum_data_points = 0
    for data_input, data_label in train_data_loader:
        data_input = data_input.to(device)
        data_label = data_label.to(device)
        pred = model(data_input).flatten()  # (batch, 1) -> (batch,)
        loss = loss_fn(pred, data_label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        sum_data_points += len(data_input)
        # A logit above 0 is counted as a prediction of class 1. These are the
        # predictions made before this batch's optimizer step.
        sum_correct += ((pred > 0) == data_label).sum()
    print(np.round((100*sum_correct/sum_data_points).cpu().item(), 2))

mps
50.32
48.6
47.96
46.28
48.96
47.64
49.64
49.96
50.08
49.36
51.8
50.72
51.64
51.96
52.12
51.36
56.36
51.52
50.8
52.16
50.92
51.32
51.2
50.76
51.44
50.8
52.32
55.16
62.76
78.48
89.4
97.4
99.4
99.8
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0


# Activations

In [252]:
class ActivationFunction(nn.Module):
    """Base class for the hand-written activations; stores the subclass name in ``self.name``."""

    def __init__(self):
        super().__init__()
        self.name = self.__class__.__name__

class Identity(ActivationFunction):
    """f(x) = x."""

    def forward(self, x):
        return x

class ReLU(ActivationFunction):
    """f(x) = x where x > 0, else 0 (implemented as x times the mask x > 0)."""

    def forward(self, x):
        return x * (x > 0)

class Tanh(ActivationFunction):
    """f(x) = (e^x - e^-x) / (e^x + e^-x)."""

    def forward(self, x):
        # Evaluating the formula directly overflows to inf/inf = NaN for large
        # |x|. tanh already equals +/-1 to floating-point precision at |x| = 20,
        # so clamping first keeps every exponential finite without changing
        # the result.
        x = torch.clamp(x, min=-20.0, max=20.0)
        pos, neg = torch.exp(x), torch.exp(-x)
        return (pos - neg) / (pos + neg)

class Sigmoid(ActivationFunction):
    """f(x) = 1 / (1 + e^-x)."""

    def forward(self, x):
        return 1/(1 + torch.exp(-x))

class SoftPlus(ActivationFunction):
    """f(x) = log(1 + e^x)."""

    def forward(self, x):
        # Stay in torch throughout: calling np.exp on a tensor fails for
        # tensors that require grad or live on a non-CPU device.
        # Above the threshold log(1 + e^x) equals x to floating-point
        # precision, so return x there; the clamp keeps exp() finite in the
        # branch that torch.where discards, which keeps gradients NaN-free.
        threshold = 20.0
        bounded = torch.clamp(x, max=threshold)
        return torch.where(x > threshold, x, torch.log1p(torch.exp(bounded)))

class ELU(ActivationFunction):
    """f(x) = x for x >= 0, else alpha * (e^x - 1)."""

    def __init__(self, alpha = 1):
        super().__init__()
        self.alpha = alpha

    def forward(self, x):
        return torch.where(x >= 0, x, self.alpha*(torch.exp(x)-1))

class GeLU(ActivationFunction):
    """Tanh approximation of GELU: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))."""

    def forward(self, x):
        # The scale inside tanh is sqrt(2/pi), not 2/pi.
        scale = (2 / torch.pi) ** 0.5
        return 0.5*x*(1 + torch.tanh(scale*(x + 0.044715*x**3)))

class SiLU(ActivationFunction):
    """f(x) = x * sigmoid(x) = x / (1 + e^-x)."""

    def forward(self, x):
        return x/(1 + torch.exp(-x))

# Initializations

In [14]:
class BaseNetwork(nn.Module):
    """Fully connected network: one Linear + activation per hidden size, then a Linear output layer.

    Biases are zero-initialised and weight matrices are drawn from a normal
    distribution with std ``sqrt(2 / fan_in)`` (Kaiming-style), where fan_in
    is the second dimension of the weight.

    Args:
        act_fn: activation module placed after every hidden Linear layer. The
            same instance is reused at every position.
        input_size: number of input features.
        num_classes: number of output units (logits).
        hidden_sizes: sizes of the hidden layers; any sequence of ints.
    """

    def __init__(self, act_fn = nn.ReLU(), input_size=784, num_classes=10, hidden_sizes=(512, 256, 256, 128)):
        super().__init__()

        # list() accepts tuples as well as lists and avoids a mutable default.
        layer_sizes = [input_size] + list(hidden_sizes)
        layers = []
        for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
            layers += [nn.Linear(fan_in, fan_out), act_fn]
        layers += [nn.Linear(layer_sizes[-1], num_classes)]
        self.layers = nn.Sequential(*layers)
        self._init_weights()

    def _init_weights(self):
        """Zero every bias and re-draw every weight matrix; other parameters are left as-is."""
        for name, param in self.named_parameters():
            if name.endswith("bias"):
                nn.init.zeros_(param)
            elif param.dim() >= 2:
                # Only matrices have a fan-in; 1-D parameters (for example
                # those of a parametrised activation) keep their own default.
                # nn.init.normal_(param, std = np.sqrt(2/(param.shape[0] + param.shape[1]))) # xavier
                nn.init.normal_(param, std = np.sqrt(2/param.shape[1])) # kaiming

    def forward(self, x):
        logits = self.layers(x)
        return logits

# Optimizations

In [134]:
class OptimizerTemplate:
    """Minimal optimizer skeleton; subclasses supply the per-parameter update rule.

    Args:
        params: iterable of parameters to optimise (materialised into a list).
        lr: learning rate, stored as ``self.lr``.
    """

    def __init__(self, params, lr):
        self.params = list(params)
        self.lr = lr

    def zero_grad(self):
        """Detach and zero the gradient of every parameter that has one."""
        for param in self.params:
            grad = param.grad
            if grad is None:
                continue
            grad.detach_()
            grad.zero_()

    @torch.no_grad()
    def step(self):
        """Call ``update_param`` for every parameter that has a gradient."""
        for param in self.params:
            if param.grad is None:
                # Parameters without a gradient are skipped.
                continue
            self.update_param(param)

    def update_param(self, p):
        """Apply one update to ``p``; must be implemented by subclasses."""
        raise NotImplementedError

In [136]:
class SGD(OptimizerTemplate):
    """Plain gradient descent: p <- p - lr * grad."""

    def update_param(self, p):
        scaled_grad = self.lr * p.grad
        p.sub_(scaled_grad)

In [138]:
class SGDMomentum(OptimizerTemplate):
    """Gradient descent with momentum.

    Keeps a per-parameter buffer ``m <- grad + momentum * m`` and applies
    ``p <- p - lr * m``.

    Args:
        params: iterable of parameters to optimise.
        lr: learning rate.
        momentum: weight given to the previous buffer value.
    """

    def __init__(self, params, lr, momentum = 0.0):
        # The base class stores params and lr; calling it without arguments
        # raises TypeError.
        super().__init__(params, lr)
        self.beta1 = momentum
        self.param_momentum = {p : torch.zeros_like(p.data) for p in self.params}

    def update_param(self, p):
        self.param_momentum[p] = p.grad + self.beta1*self.param_momentum[p]
        p_update = -self.lr * self.param_momentum[p]
        p.add_(p_update)

In [140]:
class AdaGrad(OptimizerTemplate):
    """AdaGrad-style update.

    Accumulates the sum of squared gradients per parameter and divides each
    step by ``sqrt(sum + epsilon)``.

    Args:
        params: iterable of parameters to optimise.
        lr: learning rate.
        epsilon: constant added under the square root to avoid division by zero.
    """

    def __init__(self, params, lr, epsilon = 1e-8):
        # The base class stores params and lr; calling it without arguments
        # raises TypeError.
        super().__init__(params, lr)
        self.epsilon = epsilon
        self.param_sq_grad_sum = {p : torch.zeros_like(p.data) for p in self.params}

    def update_param(self, p):
        self.param_sq_grad_sum[p].add_(p.grad**2)
        p_update = -self.lr * p.grad / torch.sqrt(self.param_sq_grad_sum[p] + self.epsilon)
        p.add_(p_update)

In [142]:
class RMSProp(OptimizerTemplate):
    """RMSProp-style update.

    Keeps an exponential moving average of squared gradients per parameter
    (decay ``beta2``) and divides each step by ``sqrt(average + epsilon)``.

    Args:
        params: iterable of parameters to optimise.
        lr: learning rate.
        epsilon: constant added under the square root to avoid division by zero.
        beta2: decay rate of the squared-gradient average.
    """

    def __init__(self, params, lr, epsilon = 1e-8, beta2 = 0.999):
        # The base class stores params and lr; calling it without arguments
        # raises TypeError.
        super().__init__(params, lr)
        self.beta2 = beta2
        self.epsilon = epsilon
        self.param_sq_grad_sum = {p : torch.zeros_like(p.data) for p in self.params}

    def update_param(self, p):
        self.param_sq_grad_sum[p] = self.beta2*self.param_sq_grad_sum[p] + (1-self.beta2)*(p.grad**2)
        p_update = -self.lr * p.grad / torch.sqrt(self.param_sq_grad_sum[p] + self.epsilon)
        p.add_(p_update)

In [146]:
class AdaDelta(OptimizerTemplate):
    """AdaDelta-style update.

    Keeps two exponential moving averages per parameter (both with decay
    ``beta2``): one of squared gradients and one of squared updates. Each step
    is scaled by ``sqrt(update_avg + epsilon) / sqrt(grad_avg + epsilon)``.

    Args:
        params: iterable of parameters to optimise.
        lr: multiplier applied to the adaptive step.
        epsilon: constant added under both square roots.
        beta2: decay rate of both moving averages.
    """

    def __init__(self, params, lr = 1.0, epsilon = 1e-8, beta2 = 0.999):
        # The base class stores params and lr; calling it without arguments
        # raises TypeError.
        super().__init__(params, lr)
        self.beta2 = beta2
        self.epsilon = epsilon
        self.param_delta = {p : torch.zeros_like(p.data) for p in self.params}
        self.param_sq_grad_sum = {p : torch.zeros_like(p.data) for p in self.params}

    def update_param(self, p):
        self.param_sq_grad_sum[p] = self.beta2*self.param_sq_grad_sum[p] + (1-self.beta2)*(p.grad**2)
        ada_lr = torch.sqrt(self.param_delta[p] + self.epsilon)
        p_update = -self.lr * ada_lr * p.grad / torch.sqrt(self.param_sq_grad_sum[p] + self.epsilon)
        p.add_(p_update)
        # The update average tracks the step with the lr multiplier divided out.
        self.param_delta[p] = self.beta2*self.param_delta[p] + (1-self.beta2)*((p_update/self.lr)**2)

In [148]:
class Adam(OptimizerTemplate):
    """Adam-style update with bias-corrected first and second moment estimates.

    Args:
        params: iterable of parameters to optimise.
        lr: learning rate.
        epsilon: constant added to the denominator to avoid division by zero.
        beta1: decay rate of the gradient (first moment) average.
        beta2: decay rate of the squared-gradient (second moment) average.
    """

    # NOTE(review): beta1 defaults to 0.99 here; 0.9 is the more common
    # default - confirm this value is intentional.
    def __init__(self, params, lr, epsilon = 1e-8, beta1 = 0.99, beta2 = 0.999):
        # The base class stores params and lr; calling it without arguments
        # raises TypeError.
        super().__init__(params, lr)
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.param_step = {p : 0 for p in self.params}
        self.param_momentum = {p : torch.zeros_like(p.data) for p in self.params}
        self.param_sq_grad_sum = {p : torch.zeros_like(p.data) for p in self.params}

    def update_param(self, p):
        self.param_step[p] += 1
        self.param_momentum[p] = self.beta1*self.param_momentum[p] + (1-self.beta1)*p.grad
        self.param_sq_grad_sum[p] = self.beta2*self.param_sq_grad_sum[p] + (1-self.beta2)*(p.grad**2)
        # Bias correction: both averages start at zero, so divide by
        # 1 - beta**t, where t is this parameter's step count.
        beta1_norm = 1 - self.beta1**self.param_step[p]
        beta2_norm = 1 - self.beta2**self.param_step[p]
        p_update = -self.lr * (self.param_momentum[p]/beta1_norm) / (torch.sqrt(self.param_sq_grad_sum[p]/beta2_norm) + self.epsilon)
        p.add_(p_update)

# Self-Attention / Cross-Attention / Multi-Head Attention / Grouped Attention

# GPT-2