In [138]:
from transformers import BertTokenizer,BertModel,pipeline
import torch
import torch.nn as nn
from torchvision.models import vit_b_16,ViT_B_16_Weights
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import GloVe,build_vocab_from_iterator
import math
from PIL import Image
import torchvision
from google import genai

In [54]:
device = torch.device('cpu')

In [2]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [3]:
model = BertModel.from_pretrained("bert-base-uncased")

In [23]:
gen = pipeline(model='openai-community/gpt2')

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Device set to use cpu


In [95]:
gen("The image consists of a white object which may be ",do_sample=False)[0]['generated_text'].split(' ')[:20]

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['The',
 'image',
 'consists',
 'of',
 'a',
 'white',
 'object',
 'which',
 'may',
 'be',
 '\xa0a',
 'small,',
 'white',
 'object,',
 'or',
 'a',
 'large,',
 'white',
 'object.',
 'The']

In [31]:
weights = ViT_B_16_Weights.DEFAULT
preprocess = weights.transforms()

In [32]:
tokenizer =  get_tokenizer('basic_english')

In [33]:
glove_embedding = GloVe(name='6B',dim=50)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉| 399999/400000 [00:27<00:00, 14779.33it/s]


In [43]:
caption = []
with open('data/captions.txt','r') as f:
    for s in f.read().split('\n'):
        w = ''

        for c in s.split()[1:]:
            w  += c + " "

        caption.append(w)

In [44]:
caption[0]

"A man in street racer armor be examine the tire of another racer 's motorbike . "

In [45]:
def yield_tokens(cap):
    for c in cap:
        yield tokenizer(c)

In [47]:
vocab = build_vocab_from_iterator(yield_tokens(caption),specials=['<unk>'])
vocab.set_default_index(vocab['<unk>'])

In [63]:
class PositionalEncoding(nn.Module):
    def __init__(self,d_model,seq_len):
        super().__init__()
        self.dropout = nn.Dropout(0.1)
        pe = torch.zeros(seq_len,d_model)

        positions = torch.arange(0,seq_len,dtype=torch.float).unsqueeze(1)

        div_term = torch.exp(
            torch.arange(0,d_model,2).float() * (-math.log(10000)/d_model)
        )
        pe[:,0::2] = torch.sin(positions * div_term)
        pe[:,1::2] = torch.cos(positions * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer("pe",pe)

    def forward(self,x):
        x = x + self.pe
        x = self.dropout(x)
        return x

In [58]:
class LoRA(nn.Module):
    def __init__(self,rank,in_features,out_features,alpha):
        super().__init__()
        standard_deviation = 1/torch.sqrt(torch.tensor(rank).float())
        self.A = nn.Parameter(torch.randn(in_features,rank) * standard_deviation)
        self.B = nn.Parameter(torch.zeros(rank,out_features))
        self.alpha = alpha
        self.rank = rank

    def forward(self,x):
        return (self.alpha/self.rank) * (x@self.A@self.B)

In [72]:
class LoRADenseLayer(nn.Module):
    def __init__(self,linear,rank,alpha):
        super().__init__()
        self.linear = linear.to(device)
        in_features = self.linear.in_features
        out_features = self.linear.out_features
        self.lora = LoRA(rank,in_features,out_features,alpha)

    def forward(self,x):
        return self.linear(x) + self.lora(x)

In [73]:
vit_encoder = vit_b_16(weights=ViT_B_16_Weights.IMAGENET1K_V1)

In [74]:
for n in range(12):
    vit_encoder.encoder.layers[n].mlp[0] = LoRA_Dense(vit_encoder.encoder.layers[n].mlp[0],2,0.1)

In [83]:
class ImageCaptioningModel(nn.Module):
    def __init__(self,n_head,num_encoder_layers,num_decoder_layers,max_seq_len):
        super().__init__()
        self.vision_encoder = vit_encoder
        self.vision_encoder.head = nn.Identity() # Does'nt modify anything

        self.enc_projection = nn.Linear(1000,50)
        self.token_embedding  = nn.Embedding.from_pretrained(glove_embedding.vectors,freeze=True)
        self.d_model = 50
        self.positional_encoding = PositionalEncoding(self.d_model,max_seq_len)
        decoder_layer = nn.TransformerDecoderLayer(d_model=self.d_model,nhead=n_head,batch_first=True)
        self.decoder_transformer = nn.TransformerDecoder(decoder_layer,num_layers=num_decoder_layers)
        self.output_linear = nn.Linear(self.d_model,self.d_model)

    def forward(self,img,cap):
        enc_out = self.vision_encoder(img)
        caption_embeddings = self.token_embedding(cap) * math.sqrt(self.d_model) + self.positional_encoding(cap)
        mask = nn.Transformer.generate_square_subsequent_mask(50,device=device)

        decoder_output = self.decoder_transformer(
            tgt=caption_embeddings,
            memory=self.enc_projection(enc_out).unsqueeze(0),
            tgt_mask=mask.unsqueeze(0).expand(50,-1,-1)
        )
        output = self.output_linear(decoder_output)
        return output

In [84]:
model = ImageCaptioningModel(50,6,6,50).to(device)

In [85]:
model.state_dict = torch.load('model/model-adam.pt',map_location=device)

In [92]:
resize_obj = torchvision.transforms.Resize((216,216))

In [100]:
def predict(img_path):
    img = Image.open(f"{img_path}").convert('RGB')
    img = resize_obj(img)
    inp =  preprocess(img).unsqueeze(0).to(device)
    cap = torch.tensor(vocab(tokenizer("<start>")),dtype=torch.int64).unsqueeze(0).to(device)
    print(inp.shape)
    print(cap.shape)
    out = model(inp,cap)
    predicted = torch.argmax(out,dim=-1)
    #predicted = torch.max(out.data,1)[1]
    text = [vocab.lookup_token(idx) for idx in predicted[0]]
    text = set(text)
    return text
    #return cap

In [122]:
text = list(predict('image.jpg'))

torch.Size([1, 3, 224, 224])
torch.Size([1, 1])


In [124]:
text

['blue', 'over']

In [133]:
w = ''

In [134]:
for i in range(len(text)):
    if i == len(text) - 1:
        
        w += text[i]
    else:
        w += text[i] + ' and '

In [135]:
w

'blue and over'

In [140]:
out = gen("This image has characteristics of " + w +"could you tell something about it",do_sample=False)[0]['generated_text'].split(' ')[:20]

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [142]:
pred = ''
for t in out:
    pred += ' ' + t

In [143]:
pred

' This image has characteristics of blue and overcould you tell something about it?\n\nThe image is a composite of two images.'

In [144]:
user_ques = input('Your question')

Your question do you think it is a scenery


In [139]:
client = genai.Client(api_key='AIzaSyCFJ3RwiHvLTy9QYMhraasRH1D3h7zZ2G0')

In [145]:
response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=pred + user_ques,
)

In [146]:
response.text

'Okay, based on your description and the assumption that we\'re dealing with a *composite* image with blue and overcast characteristics, and that you are asking about the possibility of it being a scenery image, here\'s a breakdown of what we can infer:\n\n**What the Characteristics Suggest:**\n\n*   **Blue:** The presence of blue suggests several possibilities. It could be the sky, water (like a lake, sea, or river), or even blue-toned objects in the scene. If it\'s prominent, it might indicate a clear day (although "overcast" contradicts this somewhat) or a specific mood created by the blue.\n\n*   **Overcast:** "Overcast" directly implies a cloudy sky. This means the lighting would likely be soft, diffused, and lack strong shadows. Colors would be muted compared to a sunny day.\n\n**Composite Nature and Scenery Possibility:**\n\n*   **Composite Image:** The fact that it\'s a composite is crucial. It means elements from different images have been combined. This opens a wide range of 