In [1]:
#!/usr/bin/env python
# coding: utf-8

# # Embedding Network

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

import pandas as pd

In [316]:
class EmbeddingModel(nn.Module):
    def __init__(self, datamap, y_col = "y"):
        super(EmbeddingModel, self).__init__()
        
        self.datamap = datamap
        self.y_col = y_col

        # 임베딩 레이어 초기화
        for k, v in datamap.items() :
            if v == "linear" :
                setattr(self, k, nn.Linear(num_linear_features, embedding_dim))
            elif v == "onehot" :
                setattr(self, k, nn.Embedding(num_sparse_features, embedding_dim))
            elif v == "multihot" :
                setattr(self, k, nn.Embedding(num_sparse_features, embedding_dim))
        fc1_embedding_dim = len(datamap) - 1
        # 다층 퍼셉트론(MLP) 레이어 초기화 
        self.fc1 = nn.Linear(embedding_dim * fc1_embedding_dim, hidden_dim)  # 임베딩된 특성이 3개이므로 *3
        self.fc2 = nn.Linear(hidden_dim, output_dim)
    
    def _get_multihot_embedding(self, embedding, method_multihot) :

        if method_multihot == "mean" :
            # 멀티-핫 특성의 임베딩 평균
            multihot_embedding = lambda x : torch.mean(embedding(x), dim = 1)
        elif method_multihot == "sum" :
            # 각 인덱스에 대한 임베딩 벡터를 더하여 합산
            multihot_embedding = lambda x : torch.sum(embedding(x), dim = 1)
            # multi_hot_embedded = torch.sum(embedding(x[k]), dim=1)
        elif method_multihot == "weighted_mean" :
            # 각 인덱스에 대한 임베딩 벡터를 가져오고 가중 평균 계산
            weights = torch.ones_like(emb_vectors)  # 간단히 모든 값에 대해 동일한 가중치 사용
            multihot_embedding = lambda x : torch.mean(embedding(x) * weights, dim=1)

        return multihot_embedding
        
    
    def forward(self, x, method_multihot = "sum"):
        # sparse feature 임베딩
        embedded = {}
        for k, v in self.datamap.items() :
            if k == self.y_col :
                continue

            embedding = getattr(self, k)

            if v == "multihot" :
                embedding = self._get_multihot_embedding(embedding, method_multihot)

            embedding_value = embedding(x[k])
            embedded[k] = embedding_value

        # 모든 임베딩된 특성을 결합
        combined_features = torch.cat([v for k, v in embedded.items()], dim=1)  # dim=1은 각 임베딩을 행 방향으로 결합
        
        # MLP 레이어 적용
        x = F.relu(self.fc1(combined_features))
        x = self.fc2(x)
        
        return x

In [371]:
class CustomDataset(Dataset): 
    def __init__(self, df, datamap, y_col):
        
        self.x_data = {}
        for col, v in datamap.items() :
            if v == "linear" :
                self.x_data[col] = torch.FloatTensor(df[col].values).unsqueeze(1)
            if v == "multihot" :
                df = df.assign(**{col : lambda x : self._pad_sequence(x[col])})
                self.x_data[col] = torch.LongTensor(df[col].to_list())
            if v == "onehot" :
                self.x_data[col] = torch.LongTensor(df[col].values)
        
        self.y_data = torch.LongTensor(df[y_col].values)

  # 총 데이터의 개수를 리턴
    def __len__(self):
        return len(self.y_data)

  # 인덱스를 입력받아 그에 맵핑되는 입출력 데이터를 파이토치의 Tensor 형태로 리턴
    def __getitem__(self, idx):
        x = {x : y[idx] for x, y in self.x_data.items()}
        y = self.y_data[idx]
        return x, y
    
    def _get_max_multihot_size(self, series):
        return series.apply(len).max()

    def _get_max_multihot_value(self, series):
        return series.apply(max).max()
        
    def _pad_infinite(self, iterable, padding=None):
        from itertools import chain, repeat, islice
        return chain(iterable, repeat(padding))
    
    def _pad(self, iterable, size, padding=None):
        return list(islice(pad_infinite(iterable, padding), size))
        
    def _pad_sequence(self, series) :
        l = self._get_max_multihot_size(series)
        m = self._get_max_multihot_value(series)
        return series.apply(lambda x : self._pad(x, l, m + 1))

In [366]:
datamap = {"a" : "linear", "b" : "multihot", "c" : "onehot", "d" : "multihot", "y" : "onehot"}

df = pd.DataFrame(
    {
        "a" : [1.0, 2.0, -1.2], 
        "b" : [[1, 2, 3], [4, 5, 2], [2, 0, 3]], 
        "c" : [1, 2, 3], 
        "d" : [[1, 2, 3], [4, 5, 2], [2, 0, 3]], 
        "y" : [1, 2, 3]}
)

In [367]:
dataset = CustomDataset(df, datamap, "y")
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

In [368]:
# 모델 인스턴스 생성
num_sparse_features = 6  # Embedding Idx의 최대값 + 1
num_linear_features = 1  # linear feature의 개수 - 개별로 넣으면 1, 묶음이면 len(feature)
embedding_dim = 10
hidden_dim = 20
output_dim = 1  # 예측할 출력의 차원 (예: 회귀의 경우 1, 이진 분류의 경우 1)
print_ok = True

In [369]:
model = EmbeddingModel(datamap)
model

EmbeddingModel(
  (a): Linear(in_features=1, out_features=10, bias=True)
  (b): Embedding(6, 10)
  (c): Embedding(6, 10)
  (d): Embedding(6, 10)
  (y): Embedding(6, 10)
  (fc1): Linear(in_features=40, out_features=20, bias=True)
  (fc2): Linear(in_features=20, out_features=1, bias=True)
)

In [370]:
for batch_idx, samples in enumerate(dataloader):
    # print(batch_idx)
    x, y = samples 
    # H(x) 계산
    output = model(x)
    print(output)

tensor([[ 0.0637],
        [-0.1611]], grad_fn=<AddmmBackward0>)
tensor([[-0.1214]], grad_fn=<AddmmBackward0>)


In [9]:
# # Siames Network
class SiameseNetwork(nn.Module):
    def __init__(self, model1, model2):
        super(SiameseNetwork, self).__init__()
        self.model1 = model1
        self.model2 = model2

    def forward(self, input1, input2):
        output1 = self.model1(input1)
        output2 = self.model2(input2)
        return output1, output2

In [300]:
datamap1 = {"a" : "linear", "b" : "multihot", "c" : "onehot", "d" : "multihot", "y" : "onehot"}

df1 = pd.DataFrame(
    {
        "a" : [1.0, 2.0, -1.2], 
        "b" : [[1, 2, 3], [4, 5, 2], [2, 0, 3]], 
        "c" : [1, 2, 3], 
        "d" : [[1, 2, 3], [4, 5, 2], [2, 0, 3]], 
        "y" : [1, 2, 3]}
)


datamap2 = {"a" : "linear", "b" : "multihot", "c" : "onehot", "y" : "onehot"}

df2 = pd.DataFrame(
    {
        "a" : [1.0, 2.0, -1.2], 
        "b" : [[1, 2, 3], [4, 5, 2], [2, 0, 3]], 
        "c" : [1, 2, 3], 
        "y" : [1, 2, 3]}
)

In [301]:
dataset1 = CustomDataset(df1, datamap1, "y")
dataloader1 = DataLoader(dataset1, batch_size=2, shuffle=True)

dataset2 = CustomDataset(df2, datamap2, "y")
dataloader2 = DataLoader(dataset2, batch_size=2, shuffle=True)

In [302]:
# 모델 인스턴스 생성
num_sparse_features = 6  # Embedding Idx의 최대값 + 1
num_linear_features = 1  # linear feature의 개수 - 개별로 넣으면 1, 묶음이면 len(feature)
embedding_dim = 10
hidden_dim = 20
output_dim = 1  # 예측할 출력의 차원 (예: 회귀의 경우 1, 이진 분류의 경우 1)
print_ok = True

In [303]:
model1 = EmbeddingModel(datamap1)
model2 = EmbeddingModel(datamap2)

In [304]:
siam_model = SiameseNetwork(model1, model2)
siam_model

SiameseNetwork(
  (model1): EmbeddingModel(
    (a): Linear(in_features=1, out_features=10, bias=True)
    (b): Embedding(6, 10)
    (c): Embedding(6, 10)
    (d): Embedding(6, 10)
    (y): Embedding(6, 10)
    (fc1): Linear(in_features=40, out_features=20, bias=True)
    (fc2): Linear(in_features=20, out_features=1, bias=True)
  )
  (model2): EmbeddingModel(
    (a): Linear(in_features=1, out_features=10, bias=True)
    (b): Embedding(6, 10)
    (c): Embedding(6, 10)
    (y): Embedding(6, 10)
    (fc1): Linear(in_features=30, out_features=20, bias=True)
    (fc2): Linear(in_features=20, out_features=1, bias=True)
  )
)

In [312]:
for batch_idx, data in enumerate(zip(dataloader1, dataloader2)):
    _data1 = data[0]
    _data2 = data[1]
    x1, y1 = _data1
    x2, y2 = _data2
    
    output1, output2 = siam_model(x1, x2)
    print(output1, output2)

Embedding(6, 10)
Embedding(6, 10)
Embedding(6, 10)
tensor([[ 0.4837],
        [-0.1243]], grad_fn=<AddmmBackward0>) tensor([[-0.0366],
        [-0.1064]], grad_fn=<AddmmBackward0>)
Embedding(6, 10)
Embedding(6, 10)
Embedding(6, 10)
tensor([[-0.0092]], grad_fn=<AddmmBackward0>) tensor([[-0.2142]], grad_fn=<AddmmBackward0>)


In [138]:
import random

def make_onehot_data(length, *args) :
    return [random.randint(*args[0]) for _ in range(length)]

def make_linear_data(length, *args) :
    return [random.random() * args[0] for _ in range(length)]

def make_multihot(length, *args) :
    args = args[0]
    multihot_length = args[0]
    min_range = args[1]
    max_range = args[2]
    
    return [[random.randint(min_range, max_range) for _ in range(multihot_length)] for _ in range(length)]

def random_config(feature_type) :

    if feature_type == "onehot" :
        val1 = random.randint(1, 30)
        val2 = random.randint(1, 30)
        return min(val1, val2), max(val1, val2)
        
    if feature_type == "linear" :
        return random.randint(1, 10)
    
    if feature_type == "multihot" :
        val1 = random.randint(1, 30)
        val2 = random.randint(1, 30)
        return random.randint(1, 10), min(val1, val2), max(val1, val2)
    
    return

def make_independent_data(datamap, length) :

    data = {}
    
    for k, v in datamap.items() :
        if v == "onehot" :
            data[k] = make_onehot_data(length, random_config(v))
        if v == "linear" :
            
            data[k] = make_linear_data(length, random_config(v))
        if v == "multihot" :
            data[k] = make_multihot(length, random_config(v))
    
    return pd.DataFrame(data)


def make_dataset(target_variable, datamap, length) :
    
    return pd.concat([make_independent_data(datamap, length).assign(y  = i) for i in range(target_variable)])

In [438]:
datamap1 = {"a" : "linear", "b" : "multihot", "c" : "onehot", "d" : "multihot", "y" : "onehot"}
datamap2 = {"a" : "linear", "b" : "multihot", "c" : "onehot", "y" : "onehot"}

In [439]:
data1 = make_dataset(3, datamap1, 1000)
data2 = make_dataset(3, datamap2, 1000)

In [465]:
class SiamesDataset(Dataset): 
    def __init__(self, df1, datamap1, df2, datamap2, df_y, y_col = "y"):
        
        self.x_data1 = self._get_x_data(df1, datamap1)
        self.x_data2 = self._get_x_data(df2, datamap2)
        self.y_data = torch.LongTensor(df_y[y_col].values)

  # 총 데이터의 개수를 리턴
    def __len__(self):
        return len(self.y_data)

  # 인덱스를 입력받아 그에 맵핑되는 입출력 데이터를 파이토치의 Tensor 형태로 리턴
    def __getitem__(self, idx):
        x1 = {x : y[idx] for x, y in self.x_data1.items()}
        x2 = {x : y[idx] for x, y in self.x_data2.items()}
        y = self.y_data[idx]
        return x1, x2, y

    def _get_x_data(self, df, datamap) :
        
        x_data = {}
        for col, v in datamap.items() :
            if v == "linear" :
                x_data[col] = torch.FloatTensor(df[col].values).unsqueeze(1)
            if v == "multihot" :
                df = df.assign(**{col : lambda x : self._pad_sequence(x[col])})
                x_data[col] = torch.LongTensor(df[col].to_list())
            if v == "onehot" :
                x_data[col] = torch.LongTensor(df[col].values)

        return x_data
    
    def _get_max_multihot_size(self, series):
        return series.apply(len).max()

    def _get_max_multihot_value(self, series):
        return series.apply(max).max()
        
    def _pad_infinite(self, iterable, padding=None):
        from itertools import chain, repeat, islice
        return chain(iterable, repeat(padding))
    
    def _pad(self, iterable, size, padding=None):
        return list(islice(pad_infinite(iterable, padding), size))
        
    def _pad_sequence(self, series) :
        l = self._get_max_multihot_size(series)
        m = self._get_max_multihot_value(series)
        return series.apply(lambda x : self._pad(x, l, m + 1))

In [467]:
dataset = SiamesDataset(data1, datamap1, data2, datamap2, data1, "y")
dataloader = DataLoader(dataset, batch_size=10, shuffle=True)

In [451]:
def search_num_sparse_features(df, datamap, y_col = "y") :
    datamap = datamap.copy()
    datamap.pop("y")
    return max(*[df[k].apply(max).max() if v == "multihot" else df[k].max() for k, v in datamap.items() if v in ["multihot", "onehot"]])

In [468]:
# 모델 인스턴스 생성
num_sparse_features = max(search_num_sparse_features(data1, datamap1), search_num_sparse_features(data2, datamap2)) + 1  # Embedding Idx의 최대값 + 1
num_linear_features = 1  # linear feature의 개수 - 개별로 넣으면 1, 묶음이면 len(feature)
embedding_dim = 100
hidden_dim = 20
output_dim = 1  # 예측할 출력의 차원 (예: 회귀의 경우 1, 이진 분류의 경우 1)
print_ok = True

In [469]:
model1 = EmbeddingModel(datamap1)
model2 = EmbeddingModel(datamap2)

In [470]:
siam_model = SiameseNetwork(model1, model2)
siam_model

SiameseNetwork(
  (model1): EmbeddingModel(
    (a): Linear(in_features=1, out_features=100, bias=True)
    (b): Embedding(30, 100)
    (c): Embedding(30, 100)
    (d): Embedding(30, 100)
    (y): Embedding(30, 100)
    (fc1): Linear(in_features=400, out_features=20, bias=True)
    (fc2): Linear(in_features=20, out_features=1, bias=True)
  )
  (model2): EmbeddingModel(
    (a): Linear(in_features=1, out_features=100, bias=True)
    (b): Embedding(30, 100)
    (c): Embedding(30, 100)
    (y): Embedding(30, 100)
    (fc1): Linear(in_features=300, out_features=20, bias=True)
    (fc2): Linear(in_features=20, out_features=1, bias=True)
  )
)

In [472]:
class ContrastiveLoss(torch.nn.Module):
    """
    Contrastive loss function.
    Based on: http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
    """

    def __init__(self, margin=2.0):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, output1, output2, label):
        euclidean_distance = F.pairwise_distance(output1, output2, keepdim = True)
        loss_contrastive = torch.mean((1-label) * torch.pow(euclidean_distance, 2) +
                                      (label) * torch.pow(torch.clamp(self.margin - euclidean_distance, min=0.0), 2))


        return loss_contrastive

In [479]:
net = SiameseNetwork(model1, model2)
criterion = ContrastiveLoss()
optimizer = torch.optim.Adam(net.parameters(),lr = 0.0005 )

counter = []
loss_history = [] 
iteration_number= 0

for epoch in range(0, 100):
    for i, data in enumerate(dataloader):
        x1, x2, y = data
        
        optimizer.zero_grad()
        output1, output2 = net(x1, x2)
        
        loss_contrastive = criterion(output1,output2,y)
        loss_contrastive.backward()
        optimizer.step()
    if epoch %10 == 0 :
        print("Epoch number {}\n Current loss {}\n".format(epoch,loss_contrastive.item()))
        iteration_number +=10
        counter.append(iteration_number)
        loss_history.append(loss_contrastive.item())

Epoch number 0
 Current loss -2311185152.0

Epoch number 10
 Current loss 5321580032.0

Epoch number 20
 Current loss -10670310400.0

Epoch number 30
 Current loss -62945775616.0

Epoch number 40
 Current loss 1310.719970703125

Epoch number 50
 Current loss -496305274880.0

Epoch number 60
 Current loss 326192201728.0

Epoch number 70
 Current loss -1345123516416.0

Epoch number 80
 Current loss -5114293125120.0

Epoch number 90
 Current loss 823267688448.0



In [492]:
torch.sum(siam_model.model1.b(dataset.__getitem__(0)[0]["b"]), dim = 1)

tensor([ 15.6772,   4.7668,   8.2523,   9.4473,  15.6492, -48.7647, -48.7647,
        -48.7647], grad_fn=<SumBackward1>)