# Experiment - what features does a NN learn?

Hypotheses:
H1: Even on simple problems, we will see that certain features are learned far more quickly than others.
Things which may affect which features are learned:
- [DONE] H1A: Simplicity -> adding 2 nums rather than 1 slows down learning, but doesn't affect convergence. Adding an extra linear layer for the embedding doesn't hurt.
- H1B: Number of features
- H1C: Presence of worse but easier features
- [DONE] H1D: Size of features --> Conclusion: matters, but only at extremes (e.g. .01x + 10)
- H1E: Continuity of features

GOAL:
   - Understand what types of features get picked up on (write them out)
   - Have an idea for how to change the Teachable exps to get better grounding.
   - Have an idea for a full version of the Teachable exps.
   - Determine whether it's a real thing (here) that if a feature is harder to learn it won't get picked up on.
   - Determine whether less informative features get ignored or not.
   - Make better logs for this
   - Add collapsibility here

Pos is fairly large! (up to 13!!) Scale factor is 15. Consider encoding pos in the same way as Waypoint

SOMETHING FUNKY IS GOING ON WITH ANT WAYPOINTS!!! (all starts are 0, .067)
>> Consider trying the thing where we place the maze in a particular loc

# Setup

In [7]:
import os
import torch
from torch import nn
import torch.nn.functional as F
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader, random_split, Dataset
from torchvision import transforms
import pytorch_lightning as pl
import numpy as np
import torch.nn as nn

In [59]:
class NN(pl.LightningModule):
    """Small MLP regressor: Linear -> ReLU -> Linear with one scalar output.

    Args:
        input_dim: Number of input features per example.
        hidden_dim: Width of the single hidden layer.
    """

    def __init__(self, input_dim, hidden_dim=8):
        super().__init__()
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network's prediction for ``x``."""
        return self.model(x)

    def training_step(self, batch, batch_idx):
        """Compute, log (as ``train_loss``) and return the MSE loss of a batch."""
        inputs, targets = batch
        # Calls self.model directly instead of going through forward().
        loss = F.mse_loss(self.model(inputs), targets)
        self.log('train_loss', loss)
        return loss

    def configure_optimizers(self):
        """Optimise every parameter with Adam at a fixed learning rate of 1e-3."""
        return torch.optim.Adam(self.parameters(), lr=1e-3)
    
class SepHeadsNN(NN):
    """Variant of ``NN`` that embeds the first input column before the MLP.

    Column 0 of the input goes through ``Linear(1, 8)``; the resulting 8
    values are concatenated with the remaining column (8 + 1 = 9 features)
    and fed to a ``Linear(9, hidden_dim) -> ReLU -> Linear(hidden_dim, 1)``
    trunk.

    Args:
        input_dim: Must be 2 (one embedded column plus one raw column).
        hidden_dim: Width of the trunk's hidden layer.
    """

    def __init__(self, input_dim, hidden_dim=8):
        assert input_dim == 2
        # The parent initialiser must run first: nn.Module refuses submodule
        # assignment until Module.__init__ has been called. The trunk it
        # builds is replaced below by one sized for the embedded input.
        super().__init__(input_dim, hidden_dim)
        self.input_preprocess = nn.Linear(1, 8)
        self.model = nn.Sequential(
            nn.Linear(9, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, x):
        """Embed column 0, append the remaining column(s), and run the trunk."""
        input1 = self.input_preprocess(x[:, :1])
        full_input = torch.cat([input1, x[:, 1:]], dim=1)
        return self.model(full_input)

    def training_step(self, batch, batch_idx):
        """Compute, log (as ``train_loss``) and return the MSE loss of a batch."""
        x, y = batch
        # Reuse forward() so the embedding logic lives in one place.
        pred = self(x)
        loss = F.mse_loss(pred, y)
        self.log('train_loss', loss)
        return loss

In [124]:
from torch.utils.data import DataLoader, random_split, Dataset

class BaseDataset(Dataset):
    """Synthetic regression dataset: draw a random target, derive inputs from it.

    Subclasses implement ``get_x`` (how the inputs are built from the target)
    and ``get_x_dim`` (how many input features that produces).
    """

    def __len__(self):
        """Nominal dataset size; every item is freshly sampled on access."""
        return 1000

    def get_x(self, y):
        """Build the input feature vector for target ``y`` (shape ``(1,)``)."""
        raise NotImplementedError

    def get_x_dim(self):
        """Return the number of features produced by ``get_x``."""
        raise NotImplementedError

    def __getitem__(self, idx):
        """Return a freshly sampled ``(x, y)`` pair.

        ``idx`` is ignored: each access draws a new standard-normal target,
        so the same index yields different samples on different calls.
        """
        y = torch.randn(1)
        x = self.get_x(y)
        # Return CPU tensors and leave device placement to the training loop.
        # A hard-coded .cuda() here fails on CPU-only hosts and mismatches a
        # model that is being trained on the CPU.
        return x, y

class ConstantDataset(BaseDataset):
    """Identity feature: the single input is a copy of the target."""

    def get_x(self, y):
        """Return a copy of ``y`` so the input does not alias the target."""
        return torch.clone(y)

    def get_x_dim(self):
        """One input feature."""
        return 1
    
class ConstantScaleBigDataset(BaseDataset):
    """Single feature equal to the target multiplied by 10."""

    def get_x(self, y):
        """Return ``10 * y``."""
        return 10 * y

    def get_x_dim(self):
        """One input feature."""
        return 1
    
class ConstantScaleHighDataset(BaseDataset):
    """Single feature equal to the target shifted up by 10 (an offset, not a scale)."""

    def get_x(self, y):
        """Return ``y + 10``."""
        return 10 + y

    def get_x_dim(self):
        """One input feature."""
        return 1
    
class ConstantScaleSmallDataset(BaseDataset):
    """Single feature equal to the target divided by 10."""

    def get_x(self, y):
        """Return ``y / 10``."""
        return torch.div(y, 10)

    def get_x_dim(self):
        """One input feature."""
        return 1

class ScaleShiftDataset(BaseDataset):
    """Single feature ``(y + shift) * scale``.

    Args:
        scale: Multiplicative factor, applied after the shift.
        shift: Additive offset, applied to the target before scaling.
    """

    def __init__(self, scale, shift):
        self.scale = scale
        self.shift = shift

    def get_x(self, y):
        """Shift the target first, then scale the shifted value."""
        shifted = y + self.shift
        return shifted * self.scale

    def get_x_dim(self):
        """One input feature."""
        return 1
    
def make_scale_shift(scale, shift):
    """Return a zero-argument factory that builds ``ScaleShiftDataset(scale, shift)``.

    The factory gets a descriptive ``__name__`` so that code deriving an
    experiment name from it (instead of receiving one explicitly) does not
    log every configuration under the same generic ``make_dataset`` label.
    """
    def make_dataset():
        return ScaleShiftDataset(scale, shift)
    make_dataset.__name__ = f"ScaleShiftDataset_scale_{scale}_shift_{shift}"
    return make_dataset
    
class SumDataset(BaseDataset):
    """Two features whose sum is the target: ``[x1, y - x1]`` with ``x1 ~ N(0, 1)``."""

    def get_x(self, y):
        """Draw a random first addend and return it with its complement to ``y``."""
        addend = torch.randn(1)
        return torch.cat((addend, y - addend))

    def get_x_dim(self):
        """Two input features."""
        return 2
    
class SumDataset15(BaseDataset):
    """Like ``[x1, y - x1]`` summing to the target, but ``x1`` is scaled by 15."""

    def get_x(self, y):
        """Draw ``x1 = 15 * N(0, 1)`` and return it with its complement to ``y``."""
        addend = torch.randn(1) * 15
        return torch.cat((addend, y - addend))

    def get_x_dim(self):
        """Two input features."""
        return 2
    
class DistractorDataset(BaseDataset):
    """Target in column 0, standard-normal noise in every other column.

    Args:
        num_distractors: Total input width. Column 0 carries the target, so
            only ``num_distractors - 1`` columns are actually noise.
    """

    def __init__(self, num_distractors):
        self.num_distractors = num_distractors

    def get_x(self, y):
        """Sample a noise vector and overwrite its first entry with ``y``."""
        features = torch.randn(self.num_distractors)
        features[0] = y
        return features

    def get_x_dim(self):
        """Input width, equal to ``num_distractors``."""
        return self.num_distractors

def make_distractor_dataset(num_distractors):
    """Return a zero-argument factory that builds ``DistractorDataset(num_distractors)``.

    The factory gets a descriptive ``__name__`` so that code deriving an
    experiment name from it (instead of receiving one explicitly) does not
    log every configuration under the same generic ``make_dataset`` label.
    """
    def make_dataset():
        return DistractorDataset(num_distractors)
    make_dataset.__name__ = f"DistractorDataset_{num_distractors}"
    return make_dataset
    
class DoubleFeatureDataset(BaseDataset):
    """Two noise-free features that each determine the target: ``[y, -y - 1]``."""

    def get_x(self, y):
        """Return the target alongside its negation shifted down by one."""
        # torch.cat allocates a new tensor, so the result never aliases y.
        return torch.cat((y, -y - 1))

    def get_x_dim(self):
        """Two input features."""
        return 2
    
class DoubleFeatureMistakeDataset(BaseDataset):
    """Features ``[y, -y]`` where the second is replaced by noise 20% of the time."""

    def get_x(self, y):
        """Return ``[y, -y]``, or ``[y, N(0, 1)]`` when the corruption coin lands."""
        corrupted = np.random.uniform() < .2
        second = torch.randn(1) if corrupted else -y
        return torch.cat((y, second))

    def get_x_dim(self):
        """Two input features."""
        return 2
     
        
class DoubleFeatureNoiseDataset(BaseDataset):
    """Features ``[y + noise, y]``: a noisy copy of the target, then an exact one."""

    def get_x(self, y):
        """Return the target perturbed by ``N(0, 1) / 5`` followed by the target."""
        jitter = torch.randn(1) / 5
        return torch.cat((y + jitter, y))

    def get_x_dim(self):
        """Two input features."""
        return 2
    

class SumDistractorDataset(BaseDataset):
    """Features ``[y + noise, x2, y - x2]``.

    Column 0 is a noisy copy of the target. Columns 1 and 2 sum exactly to
    the target.
    """

    def get_x(self, y):
        """Build the noisy shortcut feature and the exact two-part sum."""
        # The jitter is drawn before the addend, matching the original draw order.
        noisy_copy = y + torch.randn(1) / 5
        addend = torch.randn(1)
        return torch.cat((noisy_copy, addend, y - addend))

    def get_x_dim(self):
        """Three input features."""
        return 3
    
class ContinuityDataset(BaseDataset):
    """Features ``[flag, x1, x2]`` where the flag says which slot holds the target.

    When ``y < 0`` the flag is 1 and ``x1`` carries the target. Otherwise the
    flag is 0 and ``x2`` carries it. The other slot is standard-normal noise.
    """

    def get_x(self, y):
        """Place ``y`` in the slot selected by its sign and fill the other with noise."""
        is_negative = y < 0
        noise = torch.randn(1)
        signal = y.clone()
        slots = (signal, noise) if is_negative else (noise, signal)
        return torch.cat([is_negative.int(), *slots])

    def get_x_dim(self):
        """Three input features."""
        return 3
    
def run_exp(class_name, exp_name=None, max_epochs=5):
    """Train a fresh ``NN`` on a dataset and return the trained model.

    Args:
        class_name: Zero-argument callable (a dataset class or a factory)
            returning a dataset that exposes ``get_x_dim()``.
        exp_name: Name used for the TensorBoard log directory
            (``logs/<exp_name>``). Defaults to ``class_name.__name__``.
        max_epochs: Number of training epochs.

    Returns:
        The trained ``NN`` instance.
    """
    if exp_name is None:
        exp_name = class_name.__name__
    print("running experiment", exp_name)
    dataset = class_name()
    # 80/20 train/val split derived from the dataset length, so the split
    # stays valid if the length changes (800/200 for 1000 items).
    num_val = len(dataset) // 5
    train, val = random_split(dataset, [len(dataset) - num_val, num_val])

    model = NN(dataset.get_x_dim())
    logger = pl.loggers.TensorBoardLogger(f'logs/{exp_name}')
    trainer = pl.Trainer(max_epochs=max_epochs, logger=logger)
    # DataLoader defaults are kept as in the original experiments.
    trainer.fit(model, DataLoader(train), DataLoader(val))
    return model

# RUN ALL

In [38]:
# run_exp(ConstantDataset) # Very fast, perfect convergence
# run_exp(ConstantScaleBigDataset) # big spike at the beginning, perfect convergence
# run_exp(ConstantScaleHighDataset) # doesn't converge
# run_exp(ConstantScaleSmallDataset) # doesn't converge
# run_exp(SumDataset) # Learns more slowly than constant, perfect convergence
# run_exp(DistractorDataset) # Learns more slowly than constant, perfect convergence
# run_exp(DoubleFeatureDataset) # Learns a bit more slowly than constant, perfect convergence
# run_exp(DoubleFeatureNoiseDataset) # learns much more slowly, perfect convergence
# run_exp(SumDistractorDataset) # learns much more slowly, perfect convergence
# run_exp(ContinuityDataset) # Doesn't converge (although it might converge later)


GPU available: True, used: False
TPU available: False, using: 0 TPU cores

  | Name  | Type       | Params
-------------------------------------
0 | model | Sequential | 41    
-------------------------------------
41        Trainable params
0         Non-trainable params
41        Total params
0.000     Total estimated model params size (MB)


running experiment ContinuityDataset


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…




# H1D - Size of Features


- Results for loop over scale (tl;dr): small magnitude (.01) prevents convergence, huge magnitude (10) leads to early spike, other than that OK. .5-5 is best!
- Results for loop over shift: big shift (5,10) prevents convergence, others are OK.
- Check the input scales we see with Waypoint vs OffsetWaypoint. --> all pretty reasonable (max is .27 to .86, depending on level, for Waypoint. Offset is a bit bigger.)
- Try a Teachable exp with Waypoint/OffsetWaypoint but different scales; see if this affects grounding.
- Consider positional encoding.

In [49]:
# # Loop over scales
# for scale in [.01, .05, .1, .5, 1, 2, 5, 10]:
#     class_name = make_scale_shift(scale, 0)
#     run_exp(class_name, exp_name=f"scale_{scale}_shift_0", max_epochs=10)
    
# # Loop over shifts
# for shift in [0, 1, 2, 5, 10]:
#     class_name = make_scale_shift(1, shift)
#     run_exp(class_name, exp_name=f"scale_1_shift_{shift}", max_epochs=10)


# H1A - Simplicity of Features

* Results: when features are more complex, they take longer to learn
* Results: eventually converge to the same place.

In [48]:
# run_exp(SumDataset15, max_epochs=10) # Learns more slowly than constant, perfect convergence
# run_exp(SumDataset) # Learns more slowly than constant, perfect convergence

GPU available: True, used: False
TPU available: False, using: 0 TPU cores

  | Name  | Type       | Params
-------------------------------------
0 | model | Sequential | 33    
-------------------------------------
33        Trainable params
0         Non-trainable params
33        Total params
0.000     Total estimated model params size (MB)


running experiment SumDataset15


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…




In [60]:
# Train SepHeadsNN on the scaled two-addend sum task, logging to logs/sep_heads_exp.
dataset = SumDataset15()
# Fixed 800/200 train/val split of the 1000-item dataset.
train, val = random_split(dataset, [800, 200])
model = SepHeadsNN(dataset.get_x_dim())
exp_name = 'sep_heads_exp'
logger = pl.loggers.TensorBoardLogger(f'logs/{exp_name}')
trainer = pl.Trainer(max_epochs=10, logger=logger)
# The return value of fit() is kept in `temp` but not used in this cell.
temp = trainer.fit(model, DataLoader(train), DataLoader(val))

GPU available: True, used: False
TPU available: False, using: 0 TPU cores

  | Name             | Type       | Params
------------------------------------------------
0 | model            | Sequential | 89    
1 | input_preprocess | Linear     | 16    
------------------------------------------------
105       Trainable params
0         Non-trainable params
105       Total params
0.000     Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…





# H1B - Number of Features

In [68]:
# Loop over num distractors
# Each value is the total input width passed to DistractorDataset; one run
# of 20 epochs per width, logged under logs/num_distractors_<n>.
for num_distractors in [1, 10, 20, 50, 100, 1000]:
    class_name = make_distractor_dataset(num_distractors)
    run_exp(class_name, exp_name=f"num_distractors_{num_distractors}", max_epochs=20)
    

GPU available: True, used: False
TPU available: False, using: 0 TPU cores

  | Name  | Type       | Params
-------------------------------------
0 | model | Sequential | 25    
-------------------------------------
25        Trainable params
0         Non-trainable params
25        Total params
0.000     Total estimated model params size (MB)


running experiment num_distractors_1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…




GPU available: True, used: False
TPU available: False, using: 0 TPU cores

  | Name  | Type       | Params
-------------------------------------
0 | model | Sequential | 97    
-------------------------------------
97        Trainable params
0         Non-trainable params
97        Total params
0.000     Total estimated model params size (MB)



running experiment num_distractors_10


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

GPU available: True, used: False
TPU available: False, using: 0 TPU cores

  | Name  | Type       | Params
-------------------------------------
0 | model | Sequential | 177   
-------------------------------------
177       Trainable params
0         Non-trainable params
177       Total params
0.001     Total estimated model params size (MB)



running experiment num_distractors_20


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

GPU available: True, used: False
TPU available: False, using: 0 TPU cores

  | Name  | Type       | Params
-------------------------------------
0 | model | Sequential | 417   
-------------------------------------
417       Trainable params
0         Non-trainable params
417       Total params
0.002     Total estimated model params size (MB)



running experiment num_distractors_50


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

GPU available: True, used: False
TPU available: False, using: 0 TPU cores

  | Name  | Type       | Params
-------------------------------------
0 | model | Sequential | 817   
-------------------------------------
817       Trainable params
0         Non-trainable params
817       Total params
0.003     Total estimated model params size (MB)



running experiment num_distractors_100


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

GPU available: True, used: False
TPU available: False, using: 0 TPU cores

  | Name  | Type       | Params
-------------------------------------
0 | model | Sequential | 8.0 K 
-------------------------------------
8.0 K     Trainable params
0         Non-trainable params
8.0 K     Total params
0.032     Total estimated model params size (MB)



running experiment num_distractors_1000


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…




# H1C - Presence of easier but worse features

- Determine how to save model
- Determine how to print weights
- Determine how to run model on a new dataset
- Figure out which feature(s) are used in DoubleFeature, DoubleFeatureNoise, and SumNoise

In [103]:
# Train one model per H1C condition (20 epochs each) and keep them for inspection.
# Two exact features of the target.
two_good_model = run_exp(DoubleFeatureDataset, max_epochs=20)
# Width-2 distractor input: column 0 is the target, column 1 is pure noise.
class_name = make_distractor_dataset(2)
one_good_one_bad_model = run_exp(class_name, max_epochs=20)
# One noisy copy of the target plus one exact copy.
one_good_one_noisy_model = run_exp(DoubleFeatureNoiseDataset, max_epochs=20)
# One noisy copy of the target plus two columns that sum exactly to it.
two_good_hard_one_noisy_model = run_exp(SumDistractorDataset, max_epochs=20)

GPU available: True, used: False
TPU available: False, using: 0 TPU cores

  | Name  | Type       | Params
-------------------------------------
0 | model | Sequential | 33    
-------------------------------------
33        Trainable params
0         Non-trainable params
33        Total params
0.000     Total estimated model params size (MB)


running experiment DoubleFeatureDataset


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

GPU available: True, used: False
TPU available: False, using: 0 TPU cores

  | Name  | Type       | Params
-------------------------------------
0 | model | Sequential | 33    
-------------------------------------
33        Trainable params
0         Non-trainable params
33        Total params
0.000     Total estimated model params size (MB)



running experiment make_dataset


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

GPU available: True, used: False
TPU available: False, using: 0 TPU cores

  | Name  | Type       | Params
-------------------------------------
0 | model | Sequential | 33    
-------------------------------------
33        Trainable params
0         Non-trainable params
33        Total params
0.000     Total estimated model params size (MB)



running experiment DoubleFeatureNoiseDataset


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

GPU available: True, used: False
TPU available: False, using: 0 TPU cores

  | Name  | Type       | Params
-------------------------------------
0 | model | Sequential | 41    
-------------------------------------
41        Trainable params
0         Non-trainable params
41        Total params
0.000     Total estimated model params size (MB)



running experiment SumDistractorDataset


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…




In [125]:
# Re-train two of the H1C conditions (20 epochs each).
# Note: the second assignment overwrites the earlier two_good_hard_one_noisy_model.
two_good_model2 = run_exp(DoubleFeatureDataset, max_epochs=20) # Now same scale
two_good_hard_one_noisy_model = run_exp(SumDistractorDataset, max_epochs=20) # Now same scale

GPU available: True, used: False
TPU available: False, using: 0 TPU cores

  | Name  | Type       | Params
-------------------------------------
0 | model | Sequential | 33    
-------------------------------------
33        Trainable params
0         Non-trainable params
33        Total params
0.000     Total estimated model params size (MB)


running experiment DoubleFeatureDataset


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

GPU available: True, used: False
TPU available: False, using: 0 TPU cores

  | Name  | Type       | Params
-------------------------------------
0 | model | Sequential | 41    
-------------------------------------
41        Trainable params
0         Non-trainable params
41        Total params
0.000     Total estimated model params size (MB)



running experiment SumDistractorDataset


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…




In [104]:
# Dump every parameter (name, shape, values rounded to 2 decimals) of each
# trained H1C model, one labelled section per experiment.
exp_names = ['2 Good', # both features used; seems piecewise linear. All pieces get used.
             '1 Good 1 Bad', # bad feature barely used
             '1 Good 1 Noisy', 
             '2 Good+Hard 1 Noisy']
# Models are listed in the same order as the labels in exp_names.
exp_models = [two_good_model, one_good_one_bad_model, one_good_one_noisy_model, two_good_hard_one_noisy_model]
for exp_name, model in zip(exp_names, exp_models):
    print("EXPERIMENT:", exp_name)
    for name, param in model.named_parameters():
        print(name, param.shape)
        # detach() + cpu() so the values can be converted to a NumPy array.
        print(np.round(param.detach().cpu().numpy(), 2))
    print("=" * 60)

EXPERIMENT: 2 Good
model.0.weight torch.Size([8, 2])
[[ 0.77 -0.45]
 [ 0.39 -0.26]
 [-0.93 -0.22]
 [ 0.91 -0.48]
 [ 0.61  0.47]
 [ 0.69  0.64]
 [-0.9   0.79]
 [ 0.41  0.04]]
model.0.bias torch.Size([8])
[ 0.05  0.37  0.6   0.07 -0.46 -0.62  0.04 -0.55]
model.2.weight torch.Size([1, 8])
[[ 0.4   0.02 -0.52  0.57 -0.31  0.14 -0.7  -0.05]]
model.2.bias torch.Size([1])
[-0.1]
EXPERIMENT: 1 Good 1 Bad
model.0.weight torch.Size([8, 2])
[[ 0.01  0.19]
 [-1.13  0.07]
 [ 0.59 -0.03]
 [ 0.69  0.12]
 [-0.64 -0.12]
 [ 0.62 -0.08]
 [ 0.23 -0.07]
 [ 0.4  -0.03]]
model.0.bias torch.Size([8])
[-0.65  1.13 -0.6   0.93 -0.86 -0.61 -0.23  1.37]
model.2.weight torch.Size([1, 8])
[[-0.13 -0.5   0.7   0.37 -0.4   0.28 -0.09  0.43]]
model.2.bias torch.Size([1])
[-0.37]
EXPERIMENT: 1 Good 1 Noisy
model.0.weight torch.Size([8, 2])
[[-0.41  0.44]
 [-1.17 -0.84]
 [ 0.19 -0.12]
 [-0.27 -0.67]
 [ 0.54 -0.61]
 [-0.27  0.08]
 [ 0.73  0.78]
 [-0.23  0.46]]
model.0.bias torch.Size([8])
[-0.36  0.11 -0.39  0.08  0.4   

In [132]:
# Probe which input features each trained model relies on by printing the
# gradient of the summed prediction error with respect to the inputs.
exp_names = ['2 Good', # both features used; seems piecewise linear. All pieces get used.
             # Weird thing is the gradients don't sum correctly...look into this more!
             '1 Good 1 Bad', # bad feature barely used
             '1 Good 1 Noisy', ] # piecewise linear; on a few pieces both get used, 
#              '2 Good+Hard 1 Noisy'] # both used
exp_models = [two_good_model2, one_good_one_bad_model, one_good_one_noisy_model,]
# two_good_hard_one_noisy_model]
for exp_name, model in zip(exp_names, exp_models):
    print("EXPERIMENT:", exp_name)
    y = torch.randn((20, 1))
    # Probe inputs are i.i.d. standard normal, i.e. NOT drawn from the
    # dataset the model was trained on.
    x = torch.randn((20, 2))
    x.requires_grad = True
    pred = model(x)
    # y does not depend on x, so the gradient below is d(pred)/d(x) only.
    err = pred - y
    err.sum().backward()
    # One row per probe point, one column per input feature.
    print(np.round(x.grad.cpu().numpy(), 3))
    
    
    

    print("=" * 60)

EXPERIMENT: 2 Good
[[ 0.434 -0.473]
 [ 0.36  -0.547]
 [ 0.678 -0.868]
 [ 0.624 -0.377]
 [ 0.618 -0.382]
 [ 0.624 -0.377]
 [ 0.36  -0.547]
 [ 0.618 -0.382]
 [ 0.942 -0.699]
 [ 0.624 -0.377]
 [ 0.624 -0.377]
 [ 0.544 -0.456]
 [ 0.434 -0.473]
 [ 0.544 -0.456]
 [ 0.695 -0.307]
 [ 0.395 -0.414]
 [ 0.618 -0.382]
 [ 0.624 -0.377]
 [ 0.624 -0.377]
 [ 0.395 -0.414]]
EXPERIMENT: 1 Good 1 Bad
[[ 1.    -0.002]
 [ 0.999 -0.003]
 [ 1.    -0.002]
 [ 1.    -0.002]
 [ 1.    -0.002]
 [ 1.    -0.   ]
 [ 1.    -0.002]
 [ 1.    -0.002]
 [ 1.    -0.002]
 [ 1.    -0.   ]
 [ 1.    -0.002]
 [ 1.    -0.002]
 [ 1.    -0.002]
 [ 1.    -0.002]
 [ 1.    -0.002]
 [ 0.999 -0.003]
 [ 1.    -0.002]
 [ 1.    -0.002]
 [ 1.    -0.002]
 [ 1.    -0.002]]
EXPERIMENT: 1 Good 1 Noisy
[[ 0.06   0.877]
 [ 0.006  0.995]
 [ 0.049  1.13 ]
 [ 0.006  0.995]
 [ 0.465  0.483]
 [ 0.006  0.995]
 [ 0.416  0.536]
 [ 0.773  0.704]
 [ 0.006  0.995]
 [-0.31   0.782]
 [-0.002  1.003]
 [ 0.006  0.995]
 [-0.002  1.003]
 [ 0.465  0.483]
 [ 0.006 

In [135]:
# Input-gradient probe for the 3-feature model, using inputs constructed the
# same way SumDistractorDataset builds them (batched to 20 rows).
exp_names = ['2 Good+Hard 1 Noisy'] # Hard-to-learn but better features are learned!
exp_models = [two_good_hard_one_noisy_model] # [noisy, random, y - random]
for exp_name, model in zip(exp_names, exp_models):
    print("EXPERIMENT:", exp_name)
    y = torch.randn((20, 1))
#     x = torch.randn((20, 3))
    # Column 0: noisy copy of y. Columns 1 and 2: sum exactly to y.
    x1 = y + torch.randn(20, 1) / 5
    x2 = torch.randn(20, 1)
    x3 = y - x2
    x = torch.cat([x1, x2, x3], dim=1)
    print("x shape", x.shape)
    x.requires_grad = True
    pred = model(x)
    err = pred - y
    # x is a leaf tensor here, so x.grad holds d(sum of err)/d(x) per row.
    err.sum().backward()
    print(np.round(x.grad.cpu().numpy(), 2))

EXPERIMENT: 2 Good+Hard 1 Noisy
x shape torch.Size([20, 3])
[[-0.03  1.01  1.02]
 [ 0.    1.    1.  ]
 [ 0.    1.    1.  ]
 [ 0.    1.    1.  ]
 [-0.03  1.01  1.01]
 [ 0.    1.    1.  ]
 [-0.03  1.01  1.01]
 [ 0.    1.    1.  ]
 [ 0.    1.    1.  ]
 [-0.03  1.01  1.01]
 [ 0.    1.    1.  ]
 [ 0.    1.    1.  ]
 [-0.03  1.01  1.01]
 [-0.06  1.06  1.06]
 [ 0.    1.    1.  ]
 [-0.03  1.01  1.01]
 [-0.03  1.01  1.01]
 [ 0.04  0.94  0.95]
 [ 0.    1.    1.  ]
 [-0.03  1.01  1.01]]
