In [181]:
import math
import os
import random

import cv2
import numpy as np
import pandas as pd
import spacy
from PIL import Image
from tqdm import tqdm

In [213]:
import torch
import torch.nn as nn
import torchvision
from torch.utils.data import Dataset, DataLoader
#from torch.utils.tensorboard import SummaryWriter
from torchtext.data import Field, BucketIterator
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
import torchvision.transforms as T

In [183]:
from sklearn.model_selection import train_test_split

In [184]:
torch.__version__

'1.11.0'

In [185]:
# Load the caption table; later cells read its "image" and "caption" columns.
df = pd.read_csv("captions.txt")

In [186]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Absolute Windows path of the image folder.
# NOTE(review): no cell visible here reads `img_dic` (the dataset cells use
# `image_dict`) — confirm it is unused before removing it.
img_dic = "D:\\code\\deep learning\\photo discription\\Images\\"


In [187]:
# Hold out 10% of the caption rows for validation; the fixed random_state makes
# the shuffled split repeatable.
# NOTE(review): the split is per row, so if one image has several caption rows
# they can end up on both sides of the split — confirm that is acceptable.
train_df, valid_df = train_test_split(df, test_size=0.1, shuffle=True, random_state=28)

In [188]:
train_df

Unnamed: 0,image,caption
18855,3034585889_388d6ffcc0.jpg,A looker on catches a motocross bike on its si...
16755,2910758605_73a3f5a5c2.jpg,A couple are walking along a street while the ...
39285,708860480_1a956ae0f7.jpg,A boy in a swimming pool .
32399,3603116579_4a28a932e2.jpg,The baseball player is running after the ball .
12267,2622517932_57c52c376f.jpg,A young boy looks through the glass at an aqua...
...,...,...
16471,2893515010_4a3d9dcc67.jpg,A football player catches the ball as another ...
40195,950273886_88c324e663.jpg,A man grasps onto the rock face .
7200,2286823363_7d554ea740.jpg,A little boy jumping from one chair to another .
26117,3352697012_751b079bbb.jpg,"Two female wrestlers with long , blonde hair ,..."


In [189]:
class Vocabulary:
    """Word-level vocabulary mapping lower-cased spaCy tokens to integer ids.

    Ids 0-3 are reserved for the special tokens "<pad>", "<sos>", "<eos>" and
    "<unk>". Every other word receives the next free id at the moment its
    running count reaches ``freq_threshold`` during :meth:`build_vocabulary`.

    Args:
        freq_threshold: Occurrences a word needs before it is added. Values
            below 1 are treated as 1 (every word that is seen is kept).
        lang: Name of the spaCy pipeline handed to ``spacy.load``.
        preprocessor: Optional callable applied to each sentence before it is
            tokenized, both when building the vocabulary and when
            numericalizing text.
        reverse: When true, :meth:`tokenize` returns the tokens in reverse
            order.
    """

    def __init__(self, freq_threshold= 2, lang = 'en_core_web_sm',preprocessor=None,
                reverse = False):
        self.itos = {0: "<pad>", 1: "<sos>", 2: "<eos>", 3: "<unk>"}
        self.stoi = {"<pad>": 0, "<sos>": 1, "<eos>": 2, "<unk>": 3}
        self.tokenizer = spacy.load(lang)
        self.reverse = reverse
        self.preprocessor = preprocessor
        self.freq_threshold = freq_threshold

    def __len__(self):
        """Return the number of ids in the vocabulary, special tokens included."""
        return len(self.itos)

    def tokenize(self, text):
        """Split ``text`` into lower-cased tokens (reversed if ``self.reverse``)."""
        tokens = [token.text.lower() for token in self.tokenizer.tokenizer(text)]
        return tokens[::-1] if self.reverse else tokens

    def build_vocabulary(self, sentence_list):
        """Register every word that occurs at least ``freq_threshold`` times.

        Words already present in the vocabulary (including the special tokens)
        keep their id, so calling this again on the same corpus — e.g. by
        re-running a notebook cell — does not create duplicate entries.

        Args:
            sentence_list: Iterable of raw sentences.
        """
        frequencies = {}
        next_idx = len(self.itos)
        # A threshold below 1 could never be hit by a count that starts at 1,
        # which would silently leave the vocabulary empty.
        threshold = max(1, self.freq_threshold)

        for sentence in sentence_list:
            if self.preprocessor:
                sentence = self.preprocessor(sentence)

            for word in self.tokenize(sentence):
                count = frequencies.get(word, 0) + 1
                frequencies[word] = count

                # Assign the id exactly once: when the count first reaches the
                # threshold and the word has not been registered before.
                if count == threshold and word not in self.stoi:
                    self.stoi[word] = next_idx
                    self.itos[next_idx] = word
                    next_idx += 1

    def numericalize(self, text):
        """Convert ``text`` into a list of token ids.

        The same ``preprocessor`` used by :meth:`build_vocabulary` is applied
        first so that lookups see text in the form the vocabulary was built
        from. Tokens that are not in the vocabulary map to the "<unk>" id.
        """
        if self.preprocessor:
            text = self.preprocessor(text)

        unk_idx = self.stoi["<unk>"]
        return [self.stoi.get(token, unk_idx) for token in self.tokenize(text)]

    def back_text(self, token_list):
        """Convert a sequence of token ids back into a space-joined string.

        Ids are rendered in the order given; special tokens are kept as their
        literal text and ids missing from the vocabulary become "<unk>".

        Args:
            token_list: Iterable of ids; each element only needs to be
                convertible with ``int()``.

        Returns:
            The decoded tokens joined by single spaces ("" for empty input).
        """
        unk_token = self.itos[self.stoi["<unk>"]]
        return " ".join(self.itos.get(int(idx), unk_token) for idx in token_list)
        

In [190]:
# Minimum number of occurrences a word needs to receive its own id.
freq_threshold = 2
# Caption vocabulary; reverse=False keeps tokens in their original order.
en_vocab = Vocabulary(freq_threshold=freq_threshold, reverse=False)

In [191]:
# Build the vocabulary from the training captions only, so words seen solely
# in valid_df are not registered.
en_vocab.build_vocabulary(train_df['caption'])

In [192]:
en_vocab.tokenize("hello everyone")

['hello', 'everyone']

In [193]:
# Folder holding the image files; the dataset prepends this string to each
# file name from the "image" column, so it must end with a path separator.
image_dict = 'D://code//deep learning//photo discription//Images//'

In [218]:
class CustomTranslationDataset(Dataset):
    """Map-style dataset yielding ``(image, caption_ids)`` pairs.

    Args:
        df: Table with an "image" column (image file name) and a "caption"
            column (caption text); rows are addressed positionally.
        en_vocab: Vocabulary object exposing ``stoi`` and ``numericalize``.
        img_dict: Directory that contains the image files. A trailing path
            separator is optional.
        transform: Optional callable applied to the loaded RGB PIL image.
    """

    def __init__(self, df, en_vocab, img_dict, transform=None):
        super().__init__()
        self.df = df
        self.en_vocab = en_vocab
        self.img_dict = img_dict
        self.transform = transform

    def __len__(self):
        """Return the number of (image, caption) rows."""
        return len(self.df)

    def get_numerical(self, sentence, vocab):
        """Numericalize ``sentence`` with ``vocab`` and wrap it in <sos>/<eos> ids."""
        return [vocab.stoi["<sos>"], *vocab.numericalize(sentence), vocab.stoi["<eos>"]]

    def _image_path(self, image_name):
        """Return the path of ``image_name`` inside the image directory."""
        # os.path.join only inserts a separator when the directory lacks one,
        # so directories given with a trailing slash resolve as before.
        return os.path.join(self.img_dict, image_name)

    def __getitem__(self, index):
        """Load the sample at position ``index``.

        Returns:
            ``(img, caption)`` where ``img`` is the RGB image (after
            ``transform`` if one was given) and ``caption`` is a 1-D tensor of
            token ids starting with <sos> and ending with <eos>.
        """
        row = self.df.iloc[index]  # single positional lookup for both columns
        caption_ids = self.get_numerical(row["caption"], self.en_vocab)

        # convert() returns a fully loaded copy, so the file handle can be
        # released as soon as the with-block ends.
        with Image.open(self._image_path(row["image"])) as raw:
            img = raw.convert("RGB")

        if self.transform is not None:
            img = self.transform(img)
        return img, torch.tensor(caption_ids)

In [219]:
class CustomCollate:
    """Collate ``(image, caption_ids)`` samples into one batch.

    Images are stacked along a new leading batch axis; captions are padded
    with ``pad_idx`` to the longest caption in the batch.

    Args:
        pad_idx: Id used to pad captions shorter than the longest one.
    """

    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    def __call__(self, batch):
        """Build the batch tensors.

        Args:
            batch: List of ``(image_tensor, caption_tensor)`` pairs. All image
                tensors must share one shape.

        Returns:
            ``(src, target)`` where ``src`` has shape ``(batch, *image_shape)``
            and ``target`` has shape ``(max_caption_len, batch)``.
        """
        images = [item[0] for item in batch]
        captions = [item[1] for item in batch]

        # Images are not sequences: pad_sequence would treat their first axis
        # (channels) as the sequence length and return (C, B, H, W). Stacking
        # yields the (B, C, H, W) batch layout instead.
        src = torch.stack(images, dim=0)

        # Captions do vary in length; keep the time-major (T, B) layout.
        target = pad_sequence(captions, batch_first=False, padding_value=self.pad_idx)
        return src, target

In [220]:
# Per-channel mean / std passed to T.Normalize (the values torchvision
# documents for its ImageNet-pretrained models).
NORM_MEAN = (0.485, 0.456, 0.406)
NORM_STD = (0.229, 0.224, 0.225)

# Training pipeline: resize to 226, then take a random 224x224 crop as light
# augmentation.
transforms = T.Compose([
    T.Resize(226),
    T.RandomCrop(224),
    T.ToTensor(),
    T.Normalize(NORM_MEAN, NORM_STD)
])

# Validation pipeline: same resize and normalization, but a deterministic
# center crop so repeated evaluations see identical inputs.
valid_transforms = T.Compose([
    T.Resize(226),
    T.CenterCrop(224),
    T.ToTensor(),
    T.Normalize(NORM_MEAN, NORM_STD)
])

train_dataset = CustomTranslationDataset(train_df, en_vocab, image_dict, transforms)
valid_dataset = CustomTranslationDataset(valid_df, en_vocab, image_dict, valid_transforms)

In [221]:
len(train_dataset)

36409

In [223]:
train_dataset[18][0]

tensor([[[ 0.1426, -0.0116,  0.2796,  ...,  0.1768,  0.0398, -0.2513],
         [-0.0287, -0.4739, -0.0116,  ...,  0.1426, -0.4568,  0.0227],
         [-0.1657, -0.0287, -0.1486,  ..., -0.9705, -1.2788, -0.4054],
         ...,
         [-0.9192, -0.9192, -0.4054,  ..., -0.1657, -1.0390, -0.8849],
         [-0.6965, -0.3712, -0.3541,  ...,  0.1939, -0.0801, -0.0458],
         [-0.4911, -0.6109,  0.0741,  ..., -0.3027,  0.0912, -0.1999]],

        [[ 0.1702,  0.3452,  0.6779,  ...,  0.2577,  0.2227, -0.1800],
         [-0.1800, -0.2675,  0.5553,  ...,  0.2752, -0.2500,  0.1877],
         [-0.3025,  0.0301,  0.3452,  ..., -0.7927, -1.1779, -0.4601],
         ...,
         [-0.8452, -1.0203, -0.3901,  ...,  0.1702, -0.9328, -0.9328],
         [-0.9153, -0.3200, -0.2850,  ...,  0.3627, -0.4426, -0.3375],
         [-0.4951, -0.4776,  0.1702,  ..., -0.2675, -0.1800, -0.4776]],

        [[ 0.0431, -1.0550, -0.9330,  ..., -0.5147, -0.7064, -0.8110],
         [-0.8110, -1.2990, -1.1770,  ...,  0

In [226]:
BATCH_SIZE = 4

# Training batches: reshuffled every epoch.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=CustomCollate(pad_idx=en_vocab.stoi["<pad>"]),
    num_workers=0)

# Evaluation batches: read the held-out split (previously this loader iterated
# train_dataset again, leaving valid_dataset unused) in a fixed order so
# results are comparable between runs. The variable name is kept as-is for any
# cell that already refers to it.
test__loader = DataLoader(
    dataset=valid_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=CustomCollate(pad_idx=en_vocab.stoi["<pad>"]),
    num_workers=0)


In [227]:
# Element-wise id -> token lookup through en_vocab.itos, applicable to arrays.
# Raises KeyError for an id that is not in the vocabulary.
fun_en = np.vectorize(lambda x: en_vocab.itos[x])

In [228]:
# Number of batches train_loader yields per pass over the training set.
total_steps = len(train_loader)
total_steps

9103

In [None]:
class Encode(nn.Module):
    """Encoder module skeleton that currently only stores its argument.

    Args:
        a: Value kept on the instance as ``self.a``.
    """

    def __init__(self, a):
        # nn.Module must be initialised before attributes are assigned so that
        # parameters and sub-modules added later are registered correctly.
        super().__init__()
        self.a = a