In [1]:
import pandas as pd
import re

In [2]:
# File generated with image_to_csv.py
df_img=pd.read_csv(r"C:/Users/gar43/OneDrive/Documents/DataChallenge/dataset_types.csv")

In [3]:
# IDs are the last numbers in image_name
def find_id(x):
    pattern = r"_([\d]+)_[a-zA-Z]\.jpg"
    return re.search(pattern, x).group(1)

In [4]:
df_img['ID'] = df_img['image_name'].apply(lambda x: find_id(x))

In [5]:
# Unfortunately IDs are not unique, 
# but we need them to match with coin discriptions.
df_img = df_img.drop_duplicates(subset=['ID'])

In [6]:
# Coin descriptions.
df_descrip=pd.read_csv(r"C:/Users/gar43/OneDrive/Documents/DataChallenge/CN_coin_descriptions.csv", header=None)

In [None]:
df_descrip

In [8]:
# Split thw whole string in dataframe and find the length of the resulted list
a =df_descrip[0].apply(lambda x:x.split(',"'))
m=a.apply(lambda x: len(x))

In [9]:
# Find incorrect and incomplete data records
m[m<3].index

Int64Index([ 3850,  7495,  8396,  8398,  8400, 12908, 17846, 17847, 19138,
            23813, 23814, 23988, 31815, 31816, 33763, 33764, 33767, 33768,
            33769, 33770, 33774, 33775, 33777, 33778, 35085, 35087, 35089,
            35091, 35093, 35095, 40142],
           dtype='int64')

In [10]:
# Examples of incorrect strings
print(a[35095])
print(a[33775])

['\u2002"']
['\u2002"', 'Forepart of winged horse, right."']


In [11]:
# Drop incorrect data recrords 
a = a.drop(m[m<3].index, axis=0)
df_descrip = df_descrip.drop(m[m<3].index, axis=0)

In [12]:
df_descrip['ID'] = [x[0] for x in a]
df_descrip['obverse '] = [x[1] for x in a]
df_descrip['reverse '] = [x[2] for x in a]
df_descrip=df_descrip.drop(columns=0)

In [13]:
# Check for duplicates
duplicates = df_descrip[df_descrip['ID'].duplicated()]['ID'].tolist()
duplicates

[]

In [None]:
df_merged = df_descrip.merge(df_img, on='ID', how='left')
df_merged

In [None]:
# Not all classes/types have descriprion
df_merged = df_merged.dropna()
df_merged['class'] = df_merged['class'].astype('int')
# Manually correct datapoint
df_merged['class'][0] = 3987

In [16]:
img_csv = df_merged[['filename', 'image_name', 'class']].reset_index(drop=True)

In [17]:
# Exract unique descriptions for promts
df_for_promts = df_merged.drop_duplicates(subset=['class'])

In [None]:
begin = 'Coin Obverse shows '
end = 'Reverse shows '
df_for_promts['promt'] = begin \
                        + df_for_promts['obverse '] \
                        + end \
                        + df_for_promts['reverse ']

In [None]:
# Drop all descriptions that have more/equal than 77 tokens, because 77 tokens are max for clip
import clip_clip
df_for_promts['len'] = df_for_promts['promt'].apply(lambda x: clip_clip.tokenize_len(x))
df_for_promts_77 = df_for_promts.drop(df_for_promts[df_for_promts['len']>76].index)

# Drop also from img_csv
more_than_76_promts = df_for_promts[df_for_promts['len']>76]
img_csv = img_csv[~img_csv['class'].isin(more_than_76_promts['class'])]

In [20]:
# Convert IDs to 0 to N for one hot encoding
df_for_promts_77['class_id'] = df_for_promts_77['class'].rank(method='dense').astype(int) - 1
img_csv =img_csv.merge(df_for_promts_77[['class', 'class_id']], on='class', how='left')

In [21]:
# Write an input file
img_csv[['filename', 'image_name', 'class_id']].to_csv('clip_img_77_id.csv', index=False)

In [22]:
import clip
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from tqdm import tqdm
# The CPU should be sufficient for this task
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [33]:
len(img_csv['class_id'].unique())

9178

In [23]:
model, preprocess = clip.load("ViT-L/14@336px", device=device, download_root=None)
model.eval()

# This also provides a useful preprocessing pipeline for the images
preprocess

Compose(
    Resize(size=336, interpolation=bicubic, max_size=None, antialias=warn)
    CenterCrop(size=(336, 336))
    <function _convert_image_to_rgb at 0x000002178AB4B250>
    ToTensor()
    Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
)

In [27]:
from CoinDataset_clip import CoinDataset
images = CoinDataset('clip_img_77_id.csv', preprocess, 9178 )
img_dataloader = DataLoader(images, batch_size=8, shuffle=False, num_workers=6)
for i, batch in enumerate(img_dataloader):
    x, y = batch["image"], batch["label"]
    print(x.shape, y.shape)
    break
print("data loaded")

torch.Size([8, 3, 336, 336]) torch.Size([8, 9178])
data loaded


In [28]:
# List with promts
promts = df_for_promts_77['promt'].to_list()

In [30]:
# Tokenize the prompts with `clip.tokenize`
tokenized = torch.cat([clip.tokenize(c) for c in promts]).to(device)

#encoder is on gpu so we have to put inputs to gpu
tokenized = tokenized.to(device)
# The result is a tensor of shape (1024, 10),
# since we have 10 classes and the feature dimension of the text encoder is 1024
# we don't want to calculate gradient during evaluation

text_embedding = torch.zeros((768,9178)).to(device)
with torch.no_grad():
    for i in range(90):
        text_embedding[:,i*100:i*100+100] = model.encode_text(tokenized[i*100:i*100+100,:]).permute(1,0)
with torch.no_grad():
    text_embedding[:,9000:9179] = model.encode_text(tokenized[9000:9179,:]).permute(1,0)   

In [31]:
text_embedding.shape

torch.Size([768, 9178])

In [34]:
############################################################
##                   START OF YOUR CODE                   ##
############################################################
correct = 0
total = 25613

with torch.no_grad():
    #1. Loop over the dataset and put stuff to gpu cause our model is on gpu
    for i, batch in enumerate(img_dataloader):
        
        if i%500 ==0:
            print(i)
        inputs = batch['image'].to(device)
        labels = batch['label'].to(device)
        
        
        # 2. Create visual embeddings with the image encoder
        visual_embedding = model.encode_image(inputs)
        
        # 3. Calculate the cosine similarity between the image and text embeddings (Note Images are batched from Dl)
        cosine_similarity = visual_embedding @ text_embedding.half()
        
        # image norms along dim 1 because visual embedding has shape [batch, embbeding_size]
        image_norms = torch.norm(visual_embedding, p=2, dim=1).unsqueeze(1) 
        # text norms along dim 0 because it has shape [embedding size, num_classes]
        text_norms = torch.norm(text_embedding, p=2, dim=0).unsqueeze(0)
        
        #because of unsquezzing before we have: image_norms[64,1] and text_norms[1,10] and with the help of broadcasting we get 
        #all relevant norm products
        cosine_similarity = cosine_similarity / (image_norms * text_norms)

        #prediction is equal to the position of highest cosine similarity
        preds = torch.argmax(cosine_similarity, dim=1)
        
        #_, top5_indices = torch.topk(cosine_similarity, 5, dim=1)
        labels = torch.argmax(labels, dim = 1) 

        # Increment the number of correct predictions based on the comparison between the predicted and actual labels
        correct += sum(preds == labels)
        #correct += sum(torch.sum(top5_indices == labels.unsqueeze(1), dim=1))

        
        
print(f"Accuracy: {correct / total: .3f}")
############################################################
##                    END OF YOUR CODE                    ##
############################################################

0
500
1000
1500
2000
2500
3000
Accuracy:  0.000
