In [1]:
from ultralytics import YOLO
import os
import cv2 
import pandas 
import torch
import matplotlib.pyplot as plt
#import matplotlib.image as mpimg
import networkx as nx
from tensorflow.keras.applications.vgg19 import VGG19, preprocess_input
from tensorflow.keras.preprocessing.image import load_img,img_to_array
from tensorflow.keras.models import Model

In [2]:
imagePath=r"D:\\Paper\\Mimic Human Level Intelligence in Image Descriptioning\\Flicker8k_Dataset\\109202801_c6381eef15.jpg"

In [3]:
#Object Detection using YOLOv8
def detect_objects(imagePath):
    image=cv2.imread(imagePath)
    image=cv2.resize(image,(256,256))
    model = torch.hub.load('ultralytics/yolov5', 'yolov5s')
    result=model(image)
    output=result.pandas().xyxy[0]
    return output #returns the Dataframe

In [4]:
output_df=detect_objects(imagePath)

Using cache found in C:\Users\Riddhick/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2024-2-22 Python-3.9.5 torch-2.2.0+cpu CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 


In [5]:
print(output_df)

         xmin       ymin        xmax        ymax  confidence  class    name
0  116.701477  45.346382  151.785828   73.473587    0.762813      0  person
1  159.475128  23.756115  253.407791  246.825394    0.594860     17   horse
2   24.415497  25.157923  128.289352  239.877441    0.394639     17   horse


In [6]:
print(output_df.shape[0])

4


In [6]:
def extract_nodes(image):
    output_df=detect_objects(image)
    image_nodes=[]
    for i in range(output_df.shape[0]):
        #print(output_df.iloc[i]['name'])
        data={"object_id":i,
        "start_point":(round(output_df.iloc[i]['xmin']),round(output_df.iloc[i]['ymin'])),
        "ending_point":(round(output_df.iloc[i]['xmax']),round(output_df.iloc[i]['ymax'])),
        "label":output_df.iloc[i]['name']
        }
        image_nodes.append(data)
    return image_nodes    

In [11]:
def generate_nodeImages(imagePath,nodes):
    image=cv2.imread(imagePath)
    image=cv2.resize(image,(256,256))
    #image = cv2.rectangle(image, (106,19), (188,133), (255, 0, 0) , 2) 
    #crop=image[19:133,106:188]
    #cv2.imshow('image',crop)
    #cv2.waitKey()
    segment_array=[]
    for i in range(len(nodes)):
        start=nodes[i].get('start_point')
        end=nodes[i].get('ending_point')
        #print(end[1])
        a,b,c,d=start[1],end[1],start[0],end[0]
        crop=image[a:b,c:d]
        segment_array.append(crop)
    segment_array.append(image)    
    return segment_array    

In [12]:
nodes=extract_nodes(imagePath)
print(nodes)
#segment_array=generate_nodeImages(imagePath,nodes)
#visualize_graph(segment_array)

Using cache found in C:\Users\Riddhick/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2024-2-22 Python-3.9.5 torch-2.2.0+cpu CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 


[{'object_id': 0, 'start_point': (117, 45), 'ending_point': (152, 73), 'label': 'person'}, {'object_id': 1, 'start_point': (159, 24), 'ending_point': (253, 247), 'label': 'horse'}, {'object_id': 2, 'start_point': (24, 25), 'ending_point': (128, 240), 'label': 'horse'}]


In [13]:
segment_array=generate_nodeImages(imagePath,nodes)

In [14]:
len(segment_array)

4

In [15]:
#load the VGG19 model:
model=VGG19()
model=Model(inputs=model.inputs,outputs=model.layers[-2].output)
print(model.summary())

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0     

In [16]:
def featuresVGG(segment_array):
    VGG_features=[]
    for image in segment_array:
        image=cv2.resize(image,(224,224))
        image=img_to_array(image)
        image=image.reshape((1,image.shape[0],image.shape[1],image.shape[2]))
        image=preprocess_input(image)
        feature=model.predict(image,verbose=0)
        VGG_features.append(feature)
    return VGG_features    

In [17]:
VGG=featuresVGG(segment_array)

In [18]:
VGG[0].shape

(1, 4096)

In [19]:
print(VGG[0][0])

[     1.3155           0           0 ...      1.0103      1.7103           0]


In [20]:
print(len(VGG))

4


In [21]:
#GCN using Relational Graph Convolution

import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn.pytorch.conv import RelGraphConv
import networkx as nx
import matplotlib.pyplot as plt


g = dgl.DGLGraph()

# Add nodes to the graph
num_nodes = len(VGG)
g.add_nodes(num_nodes)

# Create tensor for node features
features = torch.tensor(VGG, dtype=torch.float32)

# Add features to the graph
g.ndata['features'] = features

# Add edges to the graph with a complete graph
num_edges = num_nodes * (num_nodes - 1)
g.add_edges(torch.randint(num_nodes, (num_edges,)), torch.randint(num_nodes, (num_edges,)))

# Initialize learnable edge weights
g.edata['edge_weights'] = nn.Parameter(torch.rand(num_edges, requires_grad=True))

# GCN model with RelGraphConv layer
class GCN(nn.Module):
    def __init__(self, in_feats, hidden_feats, out_feats, num_rels):
        super(GCN, self).__init__()
        self.layer = RelGraphConv(in_feats, hidden_feats, num_rels, activation=F.relu)
        #self.out_layer = nn.Linear(hidden_feats, out_feats)

    def forward(self, g, features):
        g = dgl.remove_self_loop(g)
        # Perform relational graph convolution
        h = self.layer(g, features, g.edata['edge_weights'])
        
        # fully connected layer
        #output = self.out_layer(h)
        return h

# Reshape the features tensor
features = features.squeeze(1)

# Instantiate the GCN model with the number of relations and classes
in_feats = features.shape[1]  # Number of input features
hidden_feats = 256  # Number of hidden units
num_classes = 5  # Number of classes for multiclass classification
num_rels = 10  # Number of relations 

model = GCN(in_feats, hidden_feats, num_classes, num_rels) 

# Forward pass
output = model(g, features)
print(output)
output.shape
# Apply softmax to obtain class probabilities
#probs = F.softmax(output, dim=1)

#print(probs)


tensor([[1.53010, 2.92149, 1.50596,  ..., 1.09558, 0.00000, 3.36610],
        [0.00000, 0.00000, 0.00000,  ..., 0.00000, 1.14462, 2.79518],
        [1.84861, 2.14890, 0.00000,  ..., 0.00000, 0.00000, 2.29957],
        [7.25546, 0.00000, 2.02446,  ..., 0.00000, 0.00000, 0.00000]], grad_fn=<ReluBackward0>)


torch.Size([4, 256])

In [22]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [122]:
# GCN model using convetional Convulution Graph Network
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn import GraphConv

g = dgl.DGLGraph()

# Add nodes to the graph
num_nodes = len(VGG)
g.add_nodes(num_nodes)

# Create tensor for node features
features = torch.tensor(VGG, dtype=torch.float32)

# Add features to the graph
g.ndata['features'] = features

# Add edges to the graph with learnable weights initialized randomly
num_edges = num_nodes*num_nodes-1 
g.add_edges(torch.randint(num_nodes, (num_edges,)), torch.randint(num_nodes, (num_edges,)))

# Initialize learnable edge weights
g.edata['edge_weights'] = nn.Parameter(torch.rand(num_edges, requires_grad=True))

# Define a Graph Convolutional Network (GCN) model
class GCN(nn.Module):
    def __init__(self, in_feats, hidden_feats):
        super(GCN, self).__init__()
        self.conv = GraphConv(in_feats, hidden_feats)

    def forward(self, g, features):
        # Perform graph convolution with learnable edge weights
        h = self.conv(g, features)
        h = F.relu(h)
        return h

# Reshape the features tensor
features = features.squeeze(1)

# Instantiate the GCN model
in_feats = features.shape[1] # Number of input features
hidden_feats = 256 # Number of hidden units
model = GCN(in_feats, hidden_feats)

# Forward pass
output_gcn = model(g, features)


print(output_gcn)
output_gcn.shape

tensor([[0.00000, 1.38239, 0.00000,  ..., 0.76408, 0.00000, 0.18507],
        [0.00000, 4.20069, 0.96631,  ..., 1.06541, 0.00000, 0.63017],
        [0.00000, 4.15923, 1.60207,  ..., 0.37969, 0.00000, 0.66038],
        [0.00000, 2.12240, 0.62677,  ..., 1.03850, 0.00000, 0.56967]], grad_fn=<ReluBackward0>)


torch.Size([4, 256])

In [123]:
import numpy as np
from torch import Tensor
output=output_gcn.detach().numpy()
#output=np.reshape(output_gcn,output_gcn.shape[1])

In [124]:
output.shape


(4, 256)

In [48]:
#b = output.flatten()
#b=np.reshape(b,b.shape[0])

In [49]:
print(b)

[          0     0.30215      0.6291 ...           0           0     0.72753]


In [76]:
len(b)

1

In [125]:
output=output.transpose()

In [98]:
b=output.reshape(1,-1)

In [99]:
b.shape

(1, 1024)

In [100]:
print(b)

[[    0.50786           0     0.28418 ...      1.3954      5.5616           0]]


In [33]:
#b=np.reshape(b,b.shape[1])

In [34]:
#b.shape

(1024,)

In [126]:
 from sklearn.decomposition import PCA

In [128]:
pca = PCA(n_components=1)
pca.fit(output)

PCA(n_components=1)

In [118]:
pca_b

PCA(n_components=1)

In [129]:
from sklearn.preprocessing import StandardScaler
s=StandardScaler()
x=pca.transform(output)

In [130]:
print(x.shape)

(256, 1)


In [134]:
print(x)

[[    -1.2616]
 [     5.1594]
 [    0.57578]
 [    -1.2616]
 [    0.33297]
 [     0.2496]
 [    -1.0775]
 [    -1.2616]
 [     1.6534]
 [     0.2691]
 [    -0.7258]
 [    -1.1917]
 [   -0.95797]
 [    0.51899]
 [    0.18361]
 [   -0.26904]
 [    -1.2616]
 [    -1.0818]
 [     1.6243]
 [      2.012]
 [    -1.2616]
 [     1.1576]
 [     5.0555]
 [    -1.2616]
 [    -1.0644]
 [     1.3706]
 [    -1.2616]
 [    -1.0471]
 [   -0.66128]
 [     0.3533]
 [    -1.2616]
 [    -1.0162]
 [    -0.5418]
 [      4.545]
 [    -1.2616]
 [    -1.2616]
 [     4.3435]
 [    -1.2173]
 [     4.0678]
 [    -1.2616]
 [    -1.1928]
 [     1.2638]
 [     2.0019]
 [    -1.2616]
 [    -1.2616]
 [   -0.32235]
 [    -1.2616]
 [    -1.2616]
 [     3.1066]
 [    -1.2616]
 [     1.6483]
 [    -1.2616]
 [     1.6423]
 [   -0.89371]
 [   -0.66018]
 [    -1.2616]
 [   -0.72212]
 [    -1.2616]
 [    -1.1859]
 [    -1.2153]
 [  -0.020354]
 [    -1.2616]
 [    -1.2616]
 [   -0.75988]
 [    -1.2616]
 [    0.98595]
 [    -1.2

In [55]:
#output=np.reshape(output,output.shape[1])

In [45]:
#Reshape the output of GCN for LSTM
batch_size = 1  #processing one graph at a time
num_nodes = g.number_of_nodes()
output_gcn = output_gcn.view(batch_size, num_nodes, hidden_feats)

# LSTM model
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(LSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)

    def forward(self, x):
        output, _ = self.lstm(x)
        return output


lstm_model = LSTM(hidden_feats, 128)  # 128 hidden units 

# Forward pass 
output_lstm = lstm_model(output_gcn)


print(output_lstm)
output_lstm.shape

tensor([[[ 1.31777e-01, -2.08447e-02, -1.52904e-01, -1.32896e-01,  1.32941e-01,  1.34144e-01,  4.48199e-02,  1.76426e-01,  2.47702e-02, -1.40492e-01,  3.97684e-02, -1.07978e-01,  1.80426e-01, -1.14290e-02,  1.53168e-01,  7.91234e-02,  1.41230e-01, -2.26516e-01,  1.58732e-01, -1.00209e-01, -2.11133e-01, -1.34866e-01,
          -1.56171e-01,  1.26189e-01,  2.00407e-01, -1.26752e-01, -1.42292e-02,  1.49252e-01, -3.14804e-02, -1.07960e-01,  1.55362e-01,  1.04021e-01,  2.98855e-02,  2.33520e-02,  1.03485e-01,  1.86764e-01,  7.54420e-02,  7.36205e-02,  9.82403e-02, -7.73489e-02,  1.24779e-02, -2.84458e-02,  2.23195e-03,  7.30309e-02,
          -6.64143e-02, -4.31541e-02,  1.67844e-01,  3.06843e-02,  3.03529e-02,  9.53403e-02,  9.44292e-02,  5.36023e-02, -5.60734e-02,  1.01189e-01, -4.24863e-03,  1.78458e-02, -2.46704e-01,  2.94786e-02, -2.09040e-01,  1.81564e-01,  2.08055e-01,  2.09826e-02, -8.68400e-02,  1.16171e-01, -1.47190e-02, -9.32746e-02,
           2.64793e-01, -6.98123e-02, -1.87539

torch.Size([1, 3, 128])