In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive/ so the CSV
# read below is reachable as a normal file path.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
import pandas as pd

In [None]:
# Read the CSV from the mounted Drive. Later cells use its 'url' and 'text' columns.
df=pd.read_csv("/content/drive/MyDrive/laion_final.csv")

In [None]:
# Keep only the first 100,000 rows.
# Note: this rebinds `df`, so the full frame is gone until the CSV is read again.
df=df.head(100000)

In [None]:
# Two parallel lists: the image URL of every row and that row's DataFrame index label.
# The index label ends up in the saved file name (image_<index>.jpg).
url_list=df['url'].tolist()
index_list=df.index.tolist()

In [None]:
import time
import requests
from multiprocessing import cpu_count
from multiprocessing.pool import ThreadPool

In [None]:
import os
import requests
from io import BytesIO
from PIL import Image

# Folder that receives the downloaded images. It has to be the same folder that
# the file-count, listing and embedding cells further down read from
# ("/content/kaggle/working/downloaded_images"); previously the downloads were
# written to a different directory, so those cells never saw them.
save_directory = '/content/kaggle/working/downloaded_images/'
os.makedirs(save_directory, exist_ok=True)


def download_image(args):
    """Download one image and store it as ``image_<index>.jpg`` in ``save_directory``.

    Args:
        args: ``(url, index)`` pair, passed as a single argument so the
            function can be mapped directly over ``zip(urls, indices)``.

    Returns:
        None. Any failure (network error, HTTP error status, undecodable
        payload, write error) is reported with ``print`` and otherwise
        ignored, so one bad URL does not stop a batch download.
    """
    url, index = args[0], args[1]
    save_path = os.path.join(save_directory, f"image_{index}.jpg")
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        image = Image.open(BytesIO(response.content))
        # The target file is always a JPEG, which cannot store alpha or palette
        # modes (RGBA, LA, P, ...). Without this conversion PNG/GIF/WebP sources
        # fail inside save() and the image is lost.
        if image.mode != 'RGB':
            image = image.convert('RGB')
        image.save(save_path)
    except Exception as e:
        print(f"Error downloading image from {url}: {e}")



In [None]:
#defining function to download images in parallel
def download_parallel(args):
    """Run ``download_image`` over ``args`` on a thread pool and wait for completion.

    Args:
        args: iterable of ``(url, index)`` pairs, one per image.

    Returns:
        None. The function only returns after every task has finished, so
        timing the call measures the real download time.
    """
    cpus = cpu_count()
    # imap_unordered() only hands back a lazy iterator. It has to be drained,
    # otherwise the call returns immediately while downloads are still running
    # and the pool is never shut down.
    with ThreadPool(cpus) as pool:
        for _ in pool.imap_unordered(download_image, args):
            pass

In [None]:
# Measure the wall-clock time spent inside download_parallel().
t0=time.time()
# zip() pairs every URL with its row index; it is a one-shot iterator, so this
# cell must be re-run from the top to download again.
inputs=zip(url_list,index_list)
download_parallel(inputs)
print("Total download time:",time.time()-t0)

In [None]:
import os

def count_files_in_folder(folder_path):
    """Print how many regular files sit directly inside ``folder_path``.

    Sub-directories are neither counted nor descended into. Any exception is
    caught and reported with ``print``; nothing is returned.
    """
    try:
        file_count = 0
        for entry in os.listdir(folder_path):
            if os.path.isfile(os.path.join(folder_path, entry)):
                file_count += 1
        print(f"Number of files in '{folder_path}': {file_count}")
    except Exception as e:
        print(f"Error counting files in '{folder_path}': {e}")

count_files_in_folder("/content/kaggle/working/downloaded_images")

In [None]:
import torch
from PIL import Image
import numpy as np

# Load the TorchScript model from disk and move it onto the GPU in one step.
model = torch.jit.load("/content/sscd_disc_mixup.torchscript.pt").to('cuda')


In [None]:
from torchvision import transforms

# Per-channel mean/std normalisation shared by both preprocessing pipelines.
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

# Pipeline used for the embeddings below: Resize(288) -> tensor -> normalise.
small_288 = transforms.Compose(
    [transforms.Resize(288), transforms.ToTensor(), normalize]
)

# Alternative pipeline: resize to a fixed 320 x 320 -> tensor -> normalise.
skew_320 = transforms.Compose(
    [transforms.Resize([320, 320]), transforms.ToTensor(), normalize]
)

In [None]:
import os
def get_sorted_files_by_index(folder_path):
    """List ``image_<index>.<ext>`` files in ``folder_path`` ordered by ``<index>``.

    Args:
        folder_path: directory to scan (not recursive).

    Returns:
        ``(sorted_files, sorted_index)``: the file names and their integer
        indices, both in ascending index order. Files whose names do not
        follow the pattern are skipped. If the folder cannot be read, the
        error is printed and two empty lists are returned, so callers that
        unpack the result keep working.
    """
    try:
        indexed = []
        for name in os.listdir(folder_path):
            if not os.path.isfile(os.path.join(folder_path, name)):
                continue
            try:
                # "image_123.jpg" -> 123, parsed once per file.
                index = int(name.split('_')[1].split('.')[0])
            except (IndexError, ValueError):
                # A stray file (e.g. ".DS_Store", "notes.txt") must not make
                # the whole listing fail.
                continue
            indexed.append((index, name))

        # Sort on the index only; the sort is stable, so ties keep listing order.
        indexed.sort(key=lambda pair: pair[0])
        sorted_files = [name for _, name in indexed]
        sorted_index = [index for index, _ in indexed]
        return sorted_files, sorted_index
    except Exception as e:
        print(f"Error getting sorted files in '{folder_path}': {e}")
        return [], []

In [None]:
# List the image files in this folder together with the integer index parsed
# from each file name, both ordered by that index.
sorted_files,sorted_index = get_sorted_files_by_index("/content/kaggle/working/downloaded_images")

In [None]:
#Finding the embedding
# Embeddings are appended in the order of `sorted_index`, so position i of
# `embeddings` belongs to dataset index sorted_index[i].
embeddings=[]
# Inference only: without no_grad() every forward pass records autograd state
# that is never used, which costs GPU memory and time.
with torch.no_grad():
    for index in sorted_index:
        # The context manager closes the file handle as soon as the pixels
        # have been converted.
        with Image.open(f"/content/kaggle/working/downloaded_images/image_{index}.jpg") as raw_image:
            img = raw_image.convert('RGB')
        # unsqueeze(0) adds the batch dimension; [0, :] takes the single result row back out.
        batch = small_288(img).unsqueeze(0).to('cuda')
        embedding = model(batch)[0, :]
        embeddings.append(embedding.detach().cpu().numpy())
        # Coarse progress output: printed only when the dataset index is a multiple of 1000.
        if index % 1000 == 0:
            print(index)

In [None]:
#saving the embeddings
# The embeddings are passed positionally, so the archive holds one array per
# image under NumPy's automatic keys (arr_0, arr_1, ...), in list order.
np.savez('embeddings.npz', *embeddings)
#saving the index
# Stored under the key 'data'; entry i is the dataset index of embedding i.
index_arr=np.array(sorted_index)
np.savez('index_arr.npz', data=index_arr)

In [None]:
import numpy as np

# Open the two .npz archives holding the embeddings and their dataset indices.
# NOTE(review): the save cell writes to relative paths; this assumes the
# notebook's working directory is /content — confirm.
embeddings_loaded = np.load('/content/embeddings.npz')
index_loaded = np.load('/content/index_arr.npz')

In [None]:
# Read every array out of the embeddings archive into a plain list, in the
# archive's key order, and turn the stored index array into a Python list.
# NOTE: both names are rebound from archive objects to lists, so this cell
# cannot be re-run without first re-running the np.load cell.
embeddings_loaded = [embeddings_loaded[key] for key in embeddings_loaded]
index_loaded=index_loaded['data']
index_loaded=index_loaded.tolist()

In [None]:
# Work on the first 10,000 entries only: the pairwise matrix computed below
# is n x n, so its size grows quadratically with the number of embeddings.
embeddings_loaded_small=embeddings_loaded[0:10000]
index_loaded_small=index_loaded[0:10000]

In [None]:
# Compute the pairwise cosine similarity between image embeddings; a single matrix product is used to make the computation fast.
import time
from numpy import dot
from numpy.linalg import norm
import numpy as np

def compute_cosine_sim(array_list):
    """Return the pairwise cosine-similarity matrix of a list of vectors.

    Args:
        array_list: non-empty sequence of equal-length 1-D arrays, one
            embedding each.

    Returns:
        ``(n, n)`` array whose entry ``[i, j]`` is the cosine of the angle
        between vector ``i`` and vector ``j``. An all-zero vector gets
        similarity 0 with every vector (itself included) instead of NaN.

    Raises:
        ValueError: if ``array_list`` is empty (raised by ``np.vstack``).
    """
    matrix = np.vstack(array_list)
    # One vectorised call instead of a Python-level norm() per row.
    row_norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    # Avoid 0/0: a zero row simply stays all-zero after the division below.
    row_norms[row_norms == 0] = 1
    unit_rows = matrix / row_norms
    # Normalising the rows first means only one n x n array is allocated,
    # rather than a dot-product matrix plus a matrix of norm products.
    dist = np.dot(unit_rows, unit_rows.T)

    return dist

In [None]:
# Pairwise matrix for the 10,000-embedding subset. Despite the name, the values
# are cosine similarities as returned by compute_cosine_sim().
cosine_distances=compute_cosine_sim(embeddings_loaded_small)

In [None]:
# Persist the pairwise matrix under the key 'data' so it can be reloaded
# without recomputing it.
np.savez('cosine_distances.npz', data=cosine_distances)

In [None]:
import numpy as np
# Reload the saved matrix from disk and rebind `cosine_distances` to the loaded copy.
cosine_distance_loaded = np.load('/content/cosine_distances.npz')
cosine_distance_loaded=cosine_distance_loaded['data']
cosine_distances=cosine_distance_loaded

In [None]:
# Maps a row/column position in the pairwise matrix to the dataset index of that image.
index_dict = dict(enumerate(index_loaded_small))

In [None]:
#Plotting the graph
import networkx as nx
import matplotlib.pyplot as plt


# NOTE(review): `adjacency_matrix` is not assigned in any cell shown above this
# one. It must already exist in the kernel (presumably a thresholded version of
# `cosine_distances` — confirm), otherwise this cell fails on Restart & Run All.
# Find the non-zero indices in the adjacency matrix
edge_rows, edge_cols = np.nonzero(adjacency_matrix)

#getting the cosine distance from the indices
# Plain (rows, cols) indexing always yields a 1-D array. The previous
# [[rows], [cols]] + squeeze() form collapsed to a 0-d array when there was
# exactly one edge, which made the zip() below raise.
edge_lengths = cosine_distances[edge_rows, edge_cols]

#changing the index to old indices
# A comprehension also works for zero edges, where np.vectorize raises.
non_zero_indices_0 = np.array([index_dict[int(row)] for row in edge_rows])
non_zero_indices_1 = np.array([index_dict[int(col)] for col in edge_cols])
non_zero_indices = (non_zero_indices_0, non_zero_indices_1)

# One (node_a, node_b, weight) triple per edge; nodes are dataset indices.
mapped_edges = list(zip(non_zero_indices[0], non_zero_indices[1], edge_lengths))
G = nx.Graph()
G.add_weighted_edges_from(mapped_edges)

# A fixed seed makes the layout, and therefore the saved figure, reproducible.
pos = nx.spring_layout(G, k=1, seed=42)

plt.figure(figsize=(20, 12))

nx.draw(
    G, pos,
    with_labels=True, font_weight='bold', node_size=300, node_color='blue',
    font_color='black', font_size=5, edge_color='gray', linewidths=0.5, alpha=0.7,
)

# Label every edge with its weight, rounded to two decimals.
edge_labels = {(mapped_edge[0], mapped_edge[1]): f"{mapped_edge[2]:.2f}" for mapped_edge in mapped_edges}
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_color='red',font_size=6)

plt.savefig('graph.png')
plt.show()

In [None]:
# Every connected component of the graph is treated as one cluster.
clusters = list(nx.connected_components(G))
# Colour the nodes of the existing graph by the cluster they belong to.
node_to_cluster = {node: i for i, cluster in enumerate(clusters) for node in cluster}
node_colors = [node_to_cluster[node] for node in G.nodes]

# A fixed seed makes the layout, and therefore the saved figure, reproducible.
pos = nx.spring_layout(G, k=1, seed=42)
plt.figure(figsize=(20, 12))

# plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# pyplot.get_cmap takes the same (name, lut) arguments on old and new versions.
# max(..., 1) keeps the colormap valid when the graph has no nodes.
cluster_cmap = plt.get_cmap('viridis', max(len(clusters), 1))

nx.draw(
    G, pos,
    node_color=node_colors, cmap=cluster_cmap,
    with_labels=True, font_weight='bold', node_size=300,
    font_color='black', font_size=8, edge_color='gray', alpha=0.7,
)
plt.title("Graph with Clusters Colored Differently")
plt.savefig('graph_with_cluster.png')
plt.show()

In [None]:
#getting all cluster prompts
# The file is opened once in write mode, so re-running the cell rewrites
# lists.txt instead of appending a second copy of every cluster.
with open("lists.txt", "w", encoding="utf-8") as file:
    for i, cluster in enumerate(clusters):
        # Each cluster is a set of dataset indices. .loc needs a list-like
        # (set indexers raise TypeError on pandas >= 2.0); sorting also makes
        # the output order stable.
        list_of_indices = sorted(cluster)
        list_cluster = df.loc[list_of_indices, 'text'].tolist()

        for item in list_cluster:
            # Formatting instead of `item + "\n"`, so a missing caption (NaN)
            # cannot raise.
            file.write(f"{item}\n")
        # The separator is written after the prompts of cluster i.
        file.write(f"\n\n##########################################Cluster{i}################################################\n\n")