In [2]:
import pandas as pd
import numpy as np
import torch.nn as nn
from torch.utils.data import Dataset
import torch.nn.functional as F
import ast
from joblib import dump, load
import torch
from torch.utils.data import DataLoader


import torch.nn.functional as F
from torch.nn import (
    Sequential as Seq,
    Linear as Lin,
    ReLU,
    BatchNorm1d,
    AvgPool1d,
    Sigmoid,
    Conv1d,
)

from deepsetmodel import *

if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

testset = pd.read_csv('../dataset/paired_testset.csv', sep=',',index_col=0)


In [3]:
class Data(Dataset):
    def __init__(self,df,feature = 'tfdif', support_pipe = '../pipes/support-tfidf.joblib', opposition_pipe = '../pipes/oppose-tfidf.joblib', both =False, both_pipe = '../pipes/both-tfidf.joblib'):
        self.df = df    
        supports = self.df['support'].values
        oppositions = self.df['opposition'].values
        self.y = self.df['outcome'].values 
        # convert list of stings to list of lists of stings
        supports = list(map(lambda x: ast.literal_eval(x), supports))
        oppositions = list(map(lambda x: ast.literal_eval(x), oppositions))

        self.max_len_brief = max(self.findMaxLen(supports),self.findMaxLen(oppositions))

        if feature == 'tfdif':
            if both == False:
                support_pipe = load(support_pipe)
                opposition_pipe = load(opposition_pipe)
                getSupport = lambda x: self.stringsToTfidfs(x,support_pipe)
                getOpposition = lambda x: self.stringsToTfidfs(x,opposition_pipe)
            else:
                both_pipe = load(both_pipe)
                getSupport = lambda x: self.stringsToTfidfs(x,both_pipe)
                getOpposition = lambda x: self.stringsToTfidfs(x,both_pipe)

            self.supports = list(map( getSupport, supports))
            self.oppositions = list(map( getOpposition, oppositions))

        elif feature == 'embedding':
            self.supports = []
            self.oppositions = []
    
    def __len__(self):
        return len(self.supports)
    
    def __getitem__(self, idx):
        y = 1.0 if self.y[idx] == 'grant' else 0.0

        return self.supports[idx] , self.oppositions[idx] , y
    
    def findMaxLen(self,x):
        max_len = 0
        for i in range(len(x)):
            row = x[i]
            if len(row) > max_len:
                max_len = len(row)
        return max_len

    def stringsToTfidfs(self,briefs,pipe):
        tfidfs = torch.tensor(pipe.transform(briefs).toarray(),dtype=torch.float32)

        num_padding = self.max_len_brief - tfidfs.shape[0]

        padding = nn.ConstantPad2d((0, 0, 0, num_padding), 0)

        tfidfs = padding(tfidfs)

        tfidfs = tfidfs.T
        return tfidfs
    
    def stringsToEmbeddings(self): 
        pass

train_data = Data(testset[testset['data_type'] == 'train'], both = True)
test_data = Data(testset[testset['data_type'] == 'test'], both = True) 

batch_size = 8

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

The input size for TFIDF is quite big, so instead of doubling the feauture size of the hidden layers.
I simplely added it by 100 and 200 units respectively.

How do i construct the input?
- Should i have two different sets?
- Or chug them all in the same set?


I can not load multple models, because the TFIDF vectors creates large weights which in turn makes the model large

There might be a way to mitigate this problem with:

https://pytorch.org/docs/stable/generated/torch.sparse_coo_tensor.html

In [4]:


# TFIDF is quiet big so i may have to reduce the hiden layers width
input_size = train_data.supports[0].shape[0]
max_len_brief = train_data.max_len_brief

# the latent space has the be atleast the size of the input

models = {}
optimizers = {}


# hidden1 = int(input_size /5)
# hidden2 = int(hidden1 / 4)
# hidden3 = int(hidden2 / 3)
# classify1 = int(hidden3 /2)

#models["support"] = DeepSets(input_size, max_len_brief , hidden1, hidden2, hidden3, classify1).to(device)

#models["opposition"] = DeepSets(input_size,max_len_brief,  hidden1, hidden2, hidden3, classify1).to(device)


latent_size = int(input_size / 10)
hidden_size = latent_size
output_size =  1


models["both"] = MultiSetTransformer(input_size, latent_size, hidden_size, output_size ).to(device)

## what does Bachnorm and conv1d work?

#optimizers["suppport"] = torch.optim.Adam(models["support"].parameters(), lr=1e-4)
#optimizers["opposition"] = torch.optim.Adam(models["opposition"].parameters(), lr=1e-2)
optimizers["both"] = torch.optim.Adam(models["both"].parameters(), lr=1e-2)

In [5]:
loader =iter(train_loader)
supports, oppositions, y = next(loader)
supports = supports.to(device)
oppositions = oppositions.to(device)
y = y.float()
y = y.reshape(-1,1)
y = y.to(device)

outputs= models["both"](supports, oppositions)
loss_fn= nn.BCELoss()

RuntimeError: stack expects each tensor to be equal size, but got [8, 8, 1865] at entry 0 and [8, 8, 2] at entry 4

In [5]:
proj = nn.Linear(input_size, latent_size).to(device)


In [6]:
X ,Y = supports, oppositions
X = X.reshape(X.shape[0], X.shape[2], X.shape[1])
Y = Y.reshape(Y.shape[0], Y.shape[2], Y.shape[1])
ZX , ZY = models["both"].proj(X) , models["both"].proj(Y)

models["both"].enc((ZX, ZY))

RuntimeError: stack expects each tensor to be equal size, but got [8, 8, 1865] at entry 0 and [8, 8, 2] at entry 4

In [21]:
models["both"].forward(X,Y).shape

RuntimeError: stack expects each tensor to be equal size, but got [8, 8, 1865] at entry 0 and [8, 8, 2] at entry 4

### Define training, validation, testing data generators

In [4]:
from tqdm.notebook import tqdm

@torch.no_grad()
def test(model, loader, total, batch_size, leave=False , datatype='support', loss_fn= nn.BCELoss()):
    
    model.eval()

    sum_loss = 0.0
    sum_acc = 0.0

    t = tqdm(enumerate(loader), total=total /batch_size, leave=leave)

    for i, data in t:

        supports, oppositions, y = data
        supports = supports.to(device)
        oppositions = oppositions.to(device)

        y = y.float()
        y = y.reshape(-1,1)
        y = y.to(device)

        if datatype == 'support':
            outputs= model(supports)
        elif datatype == 'opposition':
            outputs= model(oppositions)
        elif datatype == 'both':
            outputs= model(supports, oppositions)

        loss = loss_fn(outputs, y)

        acc = ((outputs > 0.5) == y).sum().item()
        sum_acc += acc
        avg_acc =  acc /batch_size
        
        sum_loss += loss.item()

        t.set_description(f"batch_loss_{datatype}: {loss.item():.4f} \t| sum_loss_{datatype}: {sum_loss:.4f}\n batch_accuracy_{datatype}: {avg_acc:.4f}")
        
        t.refresh()
    # what is the (i+1) for?
        
    return sum_loss  / len(loader.dataset) , sum_acc / len(loader.dataset)


def train(model, optimizer, loader, total, batch_size, leave=False, datatype='support', loss_fn= nn.BCELoss()):
    model.train()

    sum_loss = 0.0
    t = tqdm(enumerate(loader), total=total /batch_size, leave=leave)
    for i, data in t:


        supports, oppositions, y = data
        supports = supports.to(device)
        oppositions = oppositions.to(device)
        y = y.float()
        y = y.reshape(-1,1)
        y = y.to(device)

        optimizer.zero_grad()

        if datatype == 'support':
            outputs= model(supports)
        elif datatype == 'opposition':
            outputs= model(oppositions)
        elif datatype == 'both':
            outputs= model(supports, oppositions)

        loss = loss_fn(outputs, y)
        sum_loss += loss.item()

        loss.backward()

        optimizer.step()

        t.set_description(f"batch_loss_{datatype}: {loss.item():.4f} \t| sum_loss_{datatype}: {sum_loss:.4f}")
        t.refresh()

    return sum_loss / len(loader.dataset)

### Train

In [6]:
import os.path as osp

n_epochs = 30
stale_epochs = 0
best_valid_loss = 99999
patience = 15
t = tqdm(range(0, n_epochs))

for epoch in t:
    avg_loss = train(
        model=models["support"], 
        optimizer=optimizers["suppport"], 
        loader=train_loader, 
        total=len(train_data), 
        batch_size=batch_size, 
        leave=bool(epoch == n_epochs - 1) 
    )
    
    
    valid_loss, valid_acc = test(
        model=models["support"],
        loader=test_loader, 
        total=len(test_data), 
        batch_size=batch_size, 
        leave=bool(epoch == n_epochs - 1),
        datatype='support'
    )

    print("Epoch: {:02d}, Training Loss:   {:.4f}".format(epoch, avg_loss))
    print("           Validation Loss: {:.4f}".format(valid_loss))
    print("           Validation Accuracy: {:.4f}".format(valid_acc))

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        # modpath = osp.join("deepsets_best.pth")
        # print("New best model saved to:", modpath)
        # torch.save(model.state_dict(), modpath)
        stale_epochs = 0
    else:
        print("Stale epoch")
        stale_epochs += 1
    if stale_epochs >= patience:
        print("Early stopping after %i stale epochs" % patience)
        break



  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/31.25 [00:00<?, ?it/s]

  0%|          | 0/31.25 [00:00<?, ?it/s]

Epoch: 00, Training Loss:   0.0091
           Validation Loss: 0.1247
           Validation Accuracy: 0.6080


  0%|          | 0/31.25 [00:00<?, ?it/s]

  0%|          | 0/31.25 [00:00<?, ?it/s]

Epoch: 01, Training Loss:   0.0134
           Validation Loss: 0.1210
           Validation Accuracy: 0.5320


  0%|          | 0/31.25 [00:00<?, ?it/s]

  0%|          | 0/31.25 [00:00<?, ?it/s]

Epoch: 02, Training Loss:   0.0174
           Validation Loss: 0.1081
           Validation Accuracy: 0.6240


  0%|          | 0/31.25 [00:00<?, ?it/s]

  0%|          | 0/31.25 [00:00<?, ?it/s]

Epoch: 03, Training Loss:   0.0132
           Validation Loss: 0.1328
           Validation Accuracy: 0.5960
Stale epoch


  0%|          | 0/31.25 [00:00<?, ?it/s]

  0%|          | 0/31.25 [00:00<?, ?it/s]

Epoch: 04, Training Loss:   0.0102
           Validation Loss: 0.1658
           Validation Accuracy: 0.6120
Stale epoch


  0%|          | 0/31.25 [00:00<?, ?it/s]

  0%|          | 0/31.25 [00:00<?, ?it/s]

Epoch: 05, Training Loss:   0.0063
           Validation Loss: 0.1641
           Validation Accuracy: 0.6240
Stale epoch


  0%|          | 0/31.25 [00:00<?, ?it/s]

  0%|          | 0/31.25 [00:00<?, ?it/s]

Epoch: 06, Training Loss:   0.0163
           Validation Loss: 0.1390
           Validation Accuracy: 0.5920
Stale epoch


  0%|          | 0/31.25 [00:00<?, ?it/s]

  0%|          | 0/31.25 [00:00<?, ?it/s]

Epoch: 07, Training Loss:   0.0078
           Validation Loss: 0.1190
           Validation Accuracy: 0.6320
Stale epoch


  0%|          | 0/31.25 [00:00<?, ?it/s]

  0%|          | 0/31.25 [00:00<?, ?it/s]

Epoch: 08, Training Loss:   0.0069
           Validation Loss: 0.1270
           Validation Accuracy: 0.6120
Stale epoch


  0%|          | 0/31.25 [00:00<?, ?it/s]

  0%|          | 0/31.25 [00:00<?, ?it/s]

Epoch: 09, Training Loss:   0.0173
           Validation Loss: 0.1939
           Validation Accuracy: 0.4760
Stale epoch


  0%|          | 0/31.25 [00:00<?, ?it/s]

  0%|          | 0/31.25 [00:00<?, ?it/s]

Epoch: 10, Training Loss:   0.0146
           Validation Loss: 0.1042
           Validation Accuracy: 0.6200


  0%|          | 0/31.25 [00:00<?, ?it/s]

  0%|          | 0/31.25 [00:00<?, ?it/s]

Epoch: 11, Training Loss:   0.0131
           Validation Loss: 0.1798
           Validation Accuracy: 0.6040
Stale epoch


  0%|          | 0/31.25 [00:00<?, ?it/s]

  0%|          | 0/31.25 [00:00<?, ?it/s]

Epoch: 12, Training Loss:   0.0107
           Validation Loss: 0.1326
           Validation Accuracy: 0.5720
Stale epoch


  0%|          | 0/31.25 [00:00<?, ?it/s]

  0%|          | 0/31.25 [00:00<?, ?it/s]

Epoch: 13, Training Loss:   0.0185
           Validation Loss: 0.1174
           Validation Accuracy: 0.5840
Stale epoch


  0%|          | 0/31.25 [00:00<?, ?it/s]

  0%|          | 0/31.25 [00:00<?, ?it/s]

Epoch: 14, Training Loss:   0.0156
           Validation Loss: 0.1169
           Validation Accuracy: 0.6000
Stale epoch


  0%|          | 0/31.25 [00:00<?, ?it/s]

  0%|          | 0/31.25 [00:00<?, ?it/s]

Epoch: 15, Training Loss:   0.0183
           Validation Loss: 0.1385
           Validation Accuracy: 0.5600
Stale epoch


  0%|          | 0/31.25 [00:00<?, ?it/s]

  0%|          | 0/31.25 [00:00<?, ?it/s]

Epoch: 16, Training Loss:   0.0073
           Validation Loss: 0.2113
           Validation Accuracy: 0.4960
Stale epoch


  0%|          | 0/31.25 [00:00<?, ?it/s]

  0%|          | 0/31.25 [00:00<?, ?it/s]

Epoch: 17, Training Loss:   0.0050
           Validation Loss: 0.1222
           Validation Accuracy: 0.6080
Stale epoch


  0%|          | 0/31.25 [00:00<?, ?it/s]

  0%|          | 0/31.25 [00:00<?, ?it/s]

Epoch: 18, Training Loss:   0.0082
           Validation Loss: 0.1319
           Validation Accuracy: 0.5880
Stale epoch


  0%|          | 0/31.25 [00:00<?, ?it/s]

  0%|          | 0/31.25 [00:00<?, ?it/s]

Epoch: 19, Training Loss:   0.0091
           Validation Loss: 0.1255
           Validation Accuracy: 0.5920
Stale epoch


  0%|          | 0/31.25 [00:00<?, ?it/s]

  0%|          | 0/31.25 [00:00<?, ?it/s]

Epoch: 20, Training Loss:   0.0059
           Validation Loss: 0.1398
           Validation Accuracy: 0.5720
Stale epoch


  0%|          | 0/31.25 [00:00<?, ?it/s]

  0%|          | 0/31.25 [00:00<?, ?it/s]

Epoch: 21, Training Loss:   0.0153
           Validation Loss: 0.1400
           Validation Accuracy: 0.5680
Stale epoch


  0%|          | 0/31.25 [00:00<?, ?it/s]

  0%|          | 0/31.25 [00:00<?, ?it/s]

Epoch: 22, Training Loss:   0.0133
           Validation Loss: 0.1372
           Validation Accuracy: 0.5880
Stale epoch


  0%|          | 0/31.25 [00:00<?, ?it/s]

  0%|          | 0/31.25 [00:00<?, ?it/s]

Epoch: 23, Training Loss:   0.0050
           Validation Loss: 0.1323
           Validation Accuracy: 0.5880
Stale epoch


  0%|          | 0/31.25 [00:00<?, ?it/s]

  0%|          | 0/31.25 [00:00<?, ?it/s]

Epoch: 24, Training Loss:   0.0037
           Validation Loss: 0.1489
           Validation Accuracy: 0.5640
Stale epoch


  0%|          | 0/31.25 [00:00<?, ?it/s]

  0%|          | 0/31.25 [00:00<?, ?it/s]

Epoch: 25, Training Loss:   0.0075
           Validation Loss: 0.1567
           Validation Accuracy: 0.5640
Stale epoch
Early stopping after 15 stale epochs


### practice

In [6]:
!pip3 install ipywidgets


Collecting ipywidgets
  Downloading ipywidgets-8.1.1-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.4/139.4 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting widgetsnbextension~=4.0.9 (from ipywidgets)
  Downloading widgetsnbextension-4.0.9-py3-none-any.whl (2.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting jupyterlab-widgets~=3.0.9 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.9-py3-none-any.whl (214 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m214.9/214.9 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: widgetsnbextension, jupyterlab-widgets, ipywidgets
Successfully installed ipywidgets-8.1.1 jupyterlab-widgets-3.0.9 widgetsnbextension-4.0.9

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [

In [18]:
len(train_loader.dataset)

250

In [13]:
outputs.squeeze().shape, y.reshape(1,-1).shape

(torch.Size([32]), torch.Size([1, 32]))

In [11]:
y.squeeze()

tensor([1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1,
        1, 1, 0, 1, 1, 0, 1, 1], device='mps:0')

Training and Testing

In [12]:
# Example of target with class indices
# loss = nn.CrossEntropyLoss()
# input = torch.randn(3, 5, requires_grad=True)
# target = torch.empty(3, dtype=torch.long).random_(5)
# output = loss(input, target)
# output.backward()
# # Example of target with class probabilities
input = torch.randn(3, 5, requires_grad=True)
target = torch.randn(3, 5).softmax(dim=1)
output = loss(input, target)
output.backward()

In [13]:
input

tensor([[ 2.0984, -0.8004, -1.2692,  0.0133,  0.8583],
        [-2.5194, -1.6218,  2.3816, -0.0952, -0.3689],
        [ 0.6391, -1.4880,  0.7888, -0.8413,  1.8981]], requires_grad=True)

In [14]:
target

tensor([[0.6053, 0.0873, 0.1340, 0.0427, 0.1307],
        [0.1373, 0.1314, 0.5137, 0.0558, 0.1618],
        [0.3586, 0.4966, 0.0283, 0.0868, 0.0297]])

In [15]:
m = nn.Sigmoid()
loss = nn.BCELoss()
input = torch.randn(3, 2, requires_grad=True)
target = torch.rand(3, 2, requires_grad=False)
output = loss(m(input), target)
output.backward()


In [4]:
!pip install torch-scatter


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [15]:
from torch_scatter import scatter_mean
src = torch.Tensor([[2, 0, 4, 4, 3], [0, 2, 1, 3, 4]])
index = torch.tensor([[4, 4, 4, 2, 3], [1,1,1,1, 1]])
out = src.new_zeros((2, 5))

scatter_mean(src, index, out=out)

print(out)

tensor([[0., 0., 4., 3., 2.],
        [0., 2., 0., 0., 0.]])


In [37]:
src = torch.Tensor([[10,10, 10, 5, 10], [4, 4, 4, 2, 3] , [0, 2, 1, 3, 4] , [2, 0, 4, 4, 3] ])

index = torch.tensor([0, 0, 0, 0, 0])

out = scatter_mean(src, index)
print(out)

tensor([[9.0000],
        [3.4000],
        [2.0000],
        [2.6000]])


In [39]:
torch.mean(src, dim=-1, dtype=torch.float32)

tensor([9.0000, 3.4000, 2.0000, 2.6000])

In [23]:
(2+4+4+3)/5

2.6

In [36]:
src = torch.tensor([[2, 0, 4, 4, 3], [0, 2, 1, 3, 4]])
index = torch.tensor([0, 0, 0, 0, 0])

out = scatter_mean(src, index)

print(out)

tensor([[2],
        [2]])
