In [None]:
import nltk, hashlib, torch
from nltk import ngrams, word_tokenize

#nltk.download("punkt")
# Sizes of the three hashed id ranges. They are laid out back to back in one
# shared id space (unigrams, then bigrams, then trigrams); id 0 is never
# produced by hashing and is used as the padding value.
unigramCount = 200000
bigramCount = 1000000
trigramCount = 64000


def _hash_to_id(feature, bucket_count, offset):
    """Map a feature string to a stable id in [offset + 1, offset + bucket_count - 1].

    SHA-1 is used (rather than the built-in ``hash``) so that the id of a
    feature is identical across interpreter runs.
    """
    digest = hashlib.sha1(feature.encode('utf-8')).hexdigest()
    return int(digest, 16) % (bucket_count - 1) + offset + 1


def create_string_embeddings(text, max_len=100, tokenizer=None):
    """Turn each string into a fixed-length row of hashed n-gram ids.

    Despite the name, the result holds integer feature ids, not vectors.
    For every string the ids are produced in this order:

    1. word unigrams            -> ids in [1, unigramCount - 1]
    2. word bigrams ("a b")     -> ids in [unigramCount + 1,
                                           unigramCount + bigramCount - 1]
    3. character trigrams       -> ids in [unigramCount + bigramCount + 1,
                                           unigramCount + bigramCount + trigramCount - 1]

    The concatenated ids are truncated to ``max_len`` and right-padded with 0.

    Args:
        text: iterable of strings, one row is produced per string.
        max_len: length of every returned row (default 100).
        tokenizer: callable mapping a string to a sequence of word tokens.
            Defaults to ``nltk.word_tokenize`` (which needs the "punkt" data).

    Returns:
        A list with one list of ``max_len`` ints per input string.
    """
    if tokenizer is None:
        tokenizer = word_tokenize

    bigram_offset = unigramCount
    trigram_offset = unigramCount + bigramCount

    bow = []
    for s in text:
        # Tokenize once; both the unigram and the bigram features reuse it.
        tokens = list(tokenizer(s))

        unigram_ids = [_hash_to_id(tok, unigramCount, 0) for tok in tokens]
        bigram_ids = [
            _hash_to_id(left + " " + right, bigramCount, bigram_offset)
            for left, right in zip(tokens, tokens[1:])
        ]
        # Character trigrams are taken over the raw string, spaces included.
        trigram_ids = [
            _hash_to_id(s[i:i + 3], trigramCount, trigram_offset)
            for i in range(len(s) - 2)
        ]

        # Keep up to max_len ids. (The previous code sliced to 99 and then
        # padded to 100, so the last slot could never hold a feature.)
        row = (unigram_ids + bigram_ids + trigram_ids)[:max_len]
        row.extend([0] * (max_len - len(row)))
        bow.append(row)

    return bow

In [None]:
import math, torch, torch.nn as nn
import torch.nn.functional as F
import mmh3
from torch.nn import TransformerEncoder, TransformerEncoderLayer

class TransformerModel(nn.Module):
    """Fuse hashed text features and image features with a Transformer encoder.

    Each input string is converted to a row of hashed n-gram ids by the
    module-level ``create_string_embeddings``. Every id is embedded as the
    weighted average of two rows of the shared table ``E`` (chosen by two
    MurmurHash seeds), using two per-id importance weights from ``W``. The
    per-string embeddings, the image features and a learned CLS vector are
    projected to ``emsize``, stacked into one sequence and encoded; the
    encoder output at the CLS position is passed through ``seq``.

    The class relies on the module-level names ``create_string_embeddings``,
    ``unigramCount``, ``bigramCount`` and ``trigramCount``.
    """

    # Number of ids per string; must equal the row length returned by
    # create_string_embeddings.
    TEXT_LEN = 100
    # Width of one row of the hashed embedding table E.
    TOKEN_DIM = 256
    # Width of each image feature row accepted by imageLinear.
    IMAGE_DIM = 2048

    def __init__(self, ntoken, emsize, nhead, nhid, nlayers, dropout=0.5):
        """Build the model.

        Args:
            ntoken: number of rows in the hashed embedding table ``E``.
            emsize: model width used by the projections and the encoder.
            nhead: number of attention heads per encoder layer.
            nhid: feed-forward width inside each encoder layer.
            nlayers: number of encoder layers.
            dropout: dropout probability of the encoder layers.
        """
        super(TransformerModel, self).__init__()
        self.model_type = 'Transformer'
        self.ntokens = ntoken

        # Row 0 of E is the padding row (kept at zero by padding_idx).
        self.E = nn.Embedding(ntoken, self.TOKEN_DIM, padding_idx=0)
        # Two importance weights per n-gram id.
        self.W = nn.Embedding(unigramCount + bigramCount + trigramCount, 2)
        # Registered as a Parameter so that it is trained, follows .to(device)
        # and is stored in state_dict (a plain tensor attribute does none of these).
        self.clsToken = nn.Parameter(torch.randn((1, emsize)))

        self.textLinear = nn.Linear(self.TEXT_LEN * self.TOKEN_DIM, emsize)
        self.imageLinear = nn.Linear(self.IMAGE_DIM, emsize)
        self.clsLinear = nn.Linear(emsize, emsize)

        encoder_layers = TransformerEncoderLayer(emsize, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)

        self.seq = nn.Sequential(
            nn.LayerNorm(emsize),
            nn.Linear(emsize, 1024),
            nn.GELU(),
            nn.Linear(1024, 256))

        # init_weights() is intentionally not called here: the PyTorch default
        # initialisation is used unless the caller opts in.

    def generate_square_subsequent_mask(self, sz):
        """Return a (sz, sz) float mask: 0.0 on/below the diagonal, -inf above it."""
        return torch.triu(torch.full((sz, sz), float('-inf')), diagonal=1)

    def init_weights(self):
        """Optionally re-initialise the embedding table and the output layer.

        ``E`` and the last linear layer of ``seq`` get weights drawn uniformly
        from [-0.1, 0.1]; the output bias is zeroed and the padding row of
        ``E`` is reset to zero afterwards.
        """
        initrange = 0.1
        output_layer = self.seq[-1]
        with torch.no_grad():
            self.E.weight.uniform_(-initrange, initrange)
            if self.E.padding_idx is not None:
                # uniform_ overwrote the padding row; restore it.
                self.E.weight[self.E.padding_idx].zero_()
            output_layer.bias.zero_()
            output_layer.weight.uniform_(-initrange, initrange)

    def _buckets(self, token_id):
        """Return the two rows of E used for one n-gram id (padding -> row 0 twice)."""
        if token_id == 0:
            return (0, 0)
        # Hash the decimal form of the id. Python's % keeps the result
        # non-negative even when the hash value is negative.
        key = str(token_id)
        return (mmh3.hash(key, seed=1) % self.ntokens,
                mmh3.hash(key, seed=2) % self.ntokens)

    def _embed_text(self, token_rows):
        """Embed rows of n-gram ids as a (len(token_rows), TEXT_LEN * TOKEN_DIM) tensor."""
        device = self.E.weight.device
        batch = len(token_rows)

        ids = torch.tensor(token_rows, dtype=torch.long, device=device)
        ids = ids.reshape(batch, self.TEXT_LEN)
        buckets = torch.tensor(
            [[self._buckets(token_id) for token_id in row] for row in token_rows],
            dtype=torch.long, device=device,
        ).reshape(batch, self.TEXT_LEN, 2)

        vectors = self.E(buckets)                    # (batch, TEXT_LEN, 2, TOKEN_DIM)
        weights = self.W(ids).unsqueeze(-1)          # (batch, TEXT_LEN, 2, 1)
        mixed = (vectors * weights).sum(dim=2) / 2   # (batch, TEXT_LEN, TOKEN_DIM)
        # Padding positions contribute an all-zero embedding.
        mixed = mixed.masked_fill((ids == 0).unsqueeze(-1), 0.0)
        return mixed.flatten(start_dim=1)

    def forward(self, txtsrc, imgsrc):
        """Encode a set of strings and image features into one vector.

        Args:
            txtsrc: iterable of strings (passed to ``create_string_embeddings``).
            imgsrc: tensor of image features with ``IMAGE_DIM`` columns, one
                row per image, on the same device as the model.

        Returns:
            ``seq`` applied to the encoder output at position 0 (the CLS
            slot). The stacked input is 2-D, which the encoder is expected to
            treat as a single unbatched sequence, giving a 1-D result of
            length 256.
        """
        token_rows = create_string_embeddings(txtsrc)

        # One emsize-wide row per string, per image, and for the CLS token.
        txtEmbeddings = self.textLinear(self._embed_text(token_rows))
        imgEmbeddings = self.imageLinear(imgsrc)
        clsToken = self.clsLinear(self.clsToken)

        # Sequence layout: [CLS, text rows..., image rows...].
        allEmbeddings = torch.cat([clsToken, txtEmbeddings, imgEmbeddings])

        output = self.transformer_encoder(allEmbeddings)
        return self.seq(output[0])

In [None]:
from transformers import ResNetModel, AutoProcessor
import matplotlib.pyplot as plt

# Model hyper-parameters
ntokens = 100000  # number of rows in the model's hashed embedding table
emsize = 512  # embedding dimension
nhid = 512  # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 1  # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 8  # the number of heads in the multiheadattention models
dropout = 0.2  # the dropout value

model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout)

# Text inputs: the strings passed to the model
txtsrc = [
    "NPET K10 Wired Gaming Keyboard, LED Backlit, Spill-Resistant Design, Multimedia Keys",
    "Gaming, Membrane, Multimedia, Mechanical",
    "10-Zone RGB Lighting: With 16.8 million colors and a suite",
]

# Image inputs: pooled ResNet-50 features, one row per image file
model_name = "microsoft/resnet-50"
processor = AutoProcessor.from_pretrained(model_name)
resnetModel = ResNetModel.from_pretrained(model_name)
images = ["samples/key1.jpg", "samples/key2.jpg", "samples/key3.jpg", "samples/key4.jpg", "samples/key5.jpg"]

# ResNet is used only as a feature extractor here, so no autograd graph is
# recorded; features are collected in a list and concatenated once.
imageFeatures = []
with torch.no_grad():
    for img in images:
        processedImage = processor(plt.imread(img), return_tensors="pt")["pixel_values"]
        resnetOutput = resnetModel(processedImage)
        # Drop the two trailing singleton dimensions of the pooled output.
        imageFeatures.append(resnetOutput.pooler_output.squeeze(-1).squeeze(-1))
imgEmbeddings = torch.cat(imageFeatures)

# Show the shape rather than dumping the full feature tensor.
print("Image embedding shape:", imgEmbeddings.shape)

model(txtsrc, imgEmbeddings).shape