In [1]:
import pandas as pd
import numpy as np
import torch.nn as nn
from torch.utils.data import Dataset
import torch.nn.functional as F
import ast
from joblib import dump, load
import torch
from torch.utils.data import DataLoader


import torch.nn.functional as F
from torch.nn import (
    Sequential as Seq,
    Linear as Lin,
    ReLU,
    BatchNorm1d,
    AvgPool1d,
    Sigmoid,
    Conv1d,
)

from deepsetmodel import *

if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

testset = pd.read_csv('dataset/paired_testset.csv', sep=',',index_col=0)


In [2]:
class Data(Dataset):
    def __init__(self,df,feature = 'tfdif', support_pipe = 'pipes/support-tfidf.joblib', opposition_pipe = 'pipes/oppose-tfidf.joblib'):
        self.df = df    
        supports = self.df['support'].values
        oppositions = self.df['opposition'].values
        self.y = self.df['outcome'].values 
        # convert list of stings to list of lists of stings
        supports = list(map(lambda x: ast.literal_eval(x), supports))
        oppositions = list(map(lambda x: ast.literal_eval(x), oppositions))

        self.max_len_brief = max(self.findMaxLen(supports),self.findMaxLen(oppositions))

        if feature == 'tfdif':
            support_pipe = load(support_pipe)
            opposition_pipe = load(opposition_pipe)
            getSupport = lambda x: self.stringsToTfidfs(x,support_pipe)
            getOpposition = lambda x: self.stringsToTfidfs(x,opposition_pipe)

            self.supports = list(map( getSupport, supports))
            self.oppositions = list(map( getOpposition, oppositions))

        elif feature == 'embedding':
            self.supports = []
            self.oppositions = []
    
    def __len__(self):
        return len(self.supports)
    
    def __getitem__(self, idx):
        y = 1 if self.y[idx] == 'grant' else 0

        return self.supports[idx] , self.oppositions[idx] , y
    
    def findMaxLen(self,x):
        max_len = 0
        for i in range(len(x)):
            row = x[i]
            if len(row) > max_len:
                max_len = len(row)
        return max_len

    def stringsToTfidfs(self,briefs,pipe):
        tfidfs = torch.tensor(pipe.transform(briefs).toarray(),dtype=torch.float32)

        num_padding = self.max_len_brief - tfidfs.shape[0]

        padding = nn.ConstantPad2d((0, 0, 0, num_padding), 0)

        tfidfs = padding(tfidfs)

        tfidfs = tfidfs.T
        return tfidfs

    
    def stringsToEmbeddings(self): 
        pass

train_data = Data(testset[testset['data_type'] == 'train'])
test_data = Data(testset[testset['data_type'] == 'test']) 

batch_size = 32

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

The input size for TFIDF is quite big, so instead of doubling the feauture size of the hidden layers.
I simplely added it by 100 and 200 units respectively.

How do i construct the input?
- Should i have two different sets?
- Or chug them all in the same set?


I can not load multple models, because the TFIDF vectors creates large weights which in turn makes the model large

There might be a way to mitigate this problem with:

https://pytorch.org/docs/stable/generated/torch.sparse_coo_tensor.html

In [3]:


# TFIDF is quiet big so i may have to reduce the hiden layers width
input_size = train_data.supports[0].shape[0]
max_len_brief = train_data.max_len_brief

# the latent space has the be atleast the size of the input
hidden1 = int(input_size /5)
hidden2 = int(hidden1 / 4)
hidden3 = int(hidden2 / 3)
classify1 = int(hidden3 /2)

models = {}
optimizers = {}

models["support"] = DeepSets(input_size, max_len_brief , hidden1, hidden2, hidden3, classify1).to(device)

#models["opposition"] = DeepSets(input_size,max_len_brief,  hidden1, hidden2, hidden3, classify1).to(device)

#models["both"] =

## what does Bachnorm and conv1d work?




optimizers["suppport"] = torch.optim.Adam(models["support"].parameters(), lr=1e-2)
#optimizers["opposition"] = torch.optim.Adam(models["opposition"].parameters(), lr=1e-2)
#optimizers["both"] = torch.optim.Adam(models["both"].parameters(), lr=1e-2)

Define training, validation, testing data generators

In [4]:
from tqdm.notebook import tqdm

@torch.no_grad()
def test(models, loader, total, batch_size, leave=False):
    for model in models.values():
        model.eval()
    
    xentropy = nn.CrossEntropyLoss(reduction='mean')

    sum_loss = 0.0
    t = tqdm(enumerate(loader), total=total /batch_size, leave=leave)

    for i, data in t:

        supports, oppositions, y = data
        supports = supports.to(device)
        oppositions = oppositions.to(device)
        y = y.to(device)

        outputs_support = models["support"](supports)
        outputs_opposition = models["oppostion"](oppositions)
        outputs_both = models["both"](outputs_support, outputs_opposition)

        loss_support = xentropy(outputs_both, y)
        sum_loss_support += loss_support.item()

        loss_opposition = xentropy(outputs_both, y)
        sum_loss_oppostion += loss_opposition.item()

        loss_both = xentropy(outputs_both, y)
        sum_loss_both += loss_both.item()

        t.set_description(f"loss_support: {sum_loss_support:.4f} \nloss_opposition: {sum_loss_oppostion:.4f} \nloss_both: {sum_loss_both:.4f}")
        
        t.refresh()
    # what is the (i+1) for?
        
    return sum_loss_support  / (i + 1) , sum_loss_oppostion  / (i + 1), sum_loss_both  / (i + 1)


def train(models, optimizers, loader, total, batch_size, leave=False):
    for model in models.values():
        model.train()

    xentropy = nn.CrossEntropyLoss(reduction='mean')

    sum_loss = 0.0
    t = tqdm(enumerate(loader), total=total /batch_size, leave=leave)
    for i, data in t:
        supports, oppositions, y = data
        supports = supports.to(device)
        oppositions = oppositions.to(device)
        y = y.to(device)

        optimizers['support'].zero_grad()
        optimizers['opposition'].zero_grad()
        optimizers['both'].zero_grad()

        outputs_support = models["support"](supports)
        outputs_opposition = models["opposition"](oppositions)
        outputs_both = models["both"](outputs_support, outputs_opposition)

        loss_support = xentropy(outputs_both, y)
        sum_loss_support += loss_support.item()

        loss_opposition = xentropy(outputs_both, y)
        sum_loss_oppostion += loss_opposition.item()

        loss_both = xentropy(outputs_both, y)
        sum_loss_both += loss_both.item()


        loss_support.backward()
        loss_opposition.backward()
        loss_both.backward()

        optimizers['support'].step()
        optimizers['opposition'].step()
        optimizers['both'].step()
        
        t.set_description(f"loss_support: {sum_loss_support:.4f} \nloss_opposition: {sum_loss_oppostion:.4f} \nloss_both: {sum_loss_both:.4f}")
        t.refresh()


In [5]:
loader =iter(train_loader)
supports, oppositions, y = next(loader)
supports = supports.to(device)
oppositions = oppositions.to(device)
y = y.to(device)


In [6]:
supports.shape, models["support"](supports).shape

  size[dim] = int(index.max()) + 1


: 

In [8]:
input_size

8

In [20]:
# this is to understand what i does
t = tqdm(enumerate(loader), total=total /batch_size, leave=leave)

for i, data in t:
    print(i)


1

Training and Testing

In [12]:
# Example of target with class indices
# loss = nn.CrossEntropyLoss()
# input = torch.randn(3, 5, requires_grad=True)
# target = torch.empty(3, dtype=torch.long).random_(5)
# output = loss(input, target)
# output.backward()
# # Example of target with class probabilities
input = torch.randn(3, 5, requires_grad=True)
target = torch.randn(3, 5).softmax(dim=1)
output = loss(input, target)
output.backward()

In [13]:
input

tensor([[ 2.0984, -0.8004, -1.2692,  0.0133,  0.8583],
        [-2.5194, -1.6218,  2.3816, -0.0952, -0.3689],
        [ 0.6391, -1.4880,  0.7888, -0.8413,  1.8981]], requires_grad=True)

In [14]:
target

tensor([[0.6053, 0.0873, 0.1340, 0.0427, 0.1307],
        [0.1373, 0.1314, 0.5137, 0.0558, 0.1618],
        [0.3586, 0.4966, 0.0283, 0.0868, 0.0297]])

In [15]:
m = nn.Sigmoid()
loss = nn.BCELoss()
input = torch.randn(3, 2, requires_grad=True)
target = torch.rand(3, 2, requires_grad=False)
output = loss(m(input), target)
output.backward()


In [4]:
!pip install torch-scatter


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [15]:
from torch_scatter import scatter_mean
src = torch.Tensor([[2, 0, 4, 4, 3], [0, 2, 1, 3, 4]])
index = torch.tensor([[4, 4, 4, 2, 3], [1,1,1,1, 1]])
out = src.new_zeros((2, 5))

scatter_mean(src, index, out=out)

print(out)

tensor([[0., 0., 4., 3., 2.],
        [0., 2., 0., 0., 0.]])


In [9]:
src = torch.tensor([[10,10, 10, 5, 10], [4, 4, 4, 2, 3] , [0, 2, 1, 3, 4] , [2, 0, 4, 4, 3] ]).to("mps")
ntracks = 5
index = torch.tensor(np.zeros(ntracks), dtype=torch.float32).to("mps")

out = scatter_mean(src, index, dim=-1)
print(out)
print(torch.squeeze(out))

RuntimeError: scatter(): Expected dtype int64 for index

In [67]:
x = torch.zeros(2, 2, 4, 4, 2)
torch.squeeze(x).shape, x.shape

(torch.Size([2, 2, 4, 4, 2]), torch.Size([2, 2, 4, 4, 2]))