In [1]:
import os
import nltk
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import torch.nn.functional as F
import torch.nn as nn
import torchvision.models

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from PIL import Image
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import word_tokenize

from base import BaseModel
from loader import ImagesDataset

In [2]:
data = pd.read_csv("Flickr8k_text/Flickr8k.token.txt", sep="\t")

In [3]:
data = data.drop_duplicates()

In [4]:
data.columns = ["id", "desc"]

In [5]:
data.id = data.id.map(lambda x: x[:len(x) - 2])

In [6]:
lemmatizer = WordNetLemmatizer()
english_stop_words = stopwords.words("english")

In [7]:
data["target"] = np.random.randint(0, 2, data.shape[0])

In [8]:
data.head()

Unnamed: 0,id,desc,target
0,1000268201_693b08cb0e.jpg,A girl going into a wooden building .,0
1,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .,0
2,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...,0
3,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...,0
4,1001773457_577c3a7d70.jpg,A black dog and a spotted dog are fighting,1


In [9]:
data.target.unique()

array([0, 1])

In [10]:
data.shape

(40459, 3)

In [11]:
def english_tokinizer(sentence: str):
    sentence = sentence.strip().lower()
    tokens = word_tokenize(sentence)
    tokens = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in english_stop_words and token.isalnum()
    ]
    return " ".join(tokens)

In [12]:
data = data[:1000]

In [13]:
data.desc = data.desc.map(english_tokinizer)

In [14]:
X_train, X_valid, y_train, y_valid = train_test_split(
    data.drop("target", axis=1), data.target, test_size=0.33, random_state=42
)

In [15]:
batch_size = 15

train_dataset = ImagesDataset(
    images=X_train.id.to_numpy(),
    descriptions=X_train.desc.to_numpy(),
    targets=y_train.to_numpy()
)

val_dataset = ImagesDataset(
    images=X_valid.id.to_numpy(),
    descriptions=X_valid.desc.to_numpy(),
    targets=y_valid.to_numpy()
)


In [16]:
class CTRModel(BaseModel):
    def __init__(self, n_classes):
        super(CTRModel, self).__init__()
        self.n_classes = n_classes
        self.swin_t = torchvision.models.swin_s(pretrained=False)

        for param in self.swin_t.parameters():
            param.requires_grad_(False)

        modules = list(self.swin_t.children())[:-2]
        self.swin_t = nn.Sequential(*modules)

        self.text_linear1 = nn.Linear(860, 300)
        self.text_linear2 = nn.Linear(300, 150)
        self.text_linear3 = nn.Linear(150, 1)


        self.linear1 = nn.Linear(769, 300)
        self.linear2 = nn.Linear(300, 150)
        self.linear3 = nn.Linear(150, n_classes)
        # self.sigm = nn.Sigmoid()

    def forward(self, image):
        features = self.swin_t(image[0])
        features = features.view(features.size(0), -1)

        text = F.relu(self.text_linear1(image[1]))
        text = F.relu(self.text_linear2(text))
        text = F.relu(self.text_linear3(text))

        data1 = torch.concat([features, text], dim=-1)

        result = F.relu(self.linear1(data1))
        result = F.relu(self.linear2(result))
        return self.linear3(result)


In [17]:
model = CTRModel(2)
# model.cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max', patience=5)
# grad_scaler = torch.cuda.amp.GradScaler(enabled=False)
# criterion = nn.SLL()



In [18]:
history = model.fit(train_dataset, val_dataset, 500, batch_size=40, optimizer=optimizer)

epoch:   0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

Validation round:   0%|          | 0/17 [00:00<?, ?batch/s]

[tensor([0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1,
        1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1])] [torch.return_types.max(
values=tensor([1.5625, 1.6328, 1.6172, 1.6172, 1.6016, 1.6016, 1.6094, 1.5938, 1.6406,
        1.6016, 1.6016, 1.6094, 1.5469, 1.6016, 1.6406, 1.5703, 1.6016, 1.5859,
        1.6094, 1.5938, 1.5703, 1.5547, 1.5938, 1.6016, 1.6250, 1.6094, 1.6406,
        1.5859, 1.6406, 1.6094, 1.4141, 1.5781, 1.6250, 1.6328, 1.5938, 1.6250,
        1.6484, 1.6250, 1.5859, 1.5781], dtype=torch.bfloat16),
indices=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))]
[tensor([0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1,
        1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1]), tensor([0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1,
        1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1])] [torch.return_types.m