###### -----------------START--------------------------------------------

In [1]:
import json

In [2]:
import os

In [3]:
from tqdm import tqdm

In [4]:
# Root directory of the CRIC question files. Set the CRIC_ROOT environment
# variable to run on another machine; the default is the original location.
cric_root = os.environ.get('CRIC_ROOT', '/home/aritra/cric')

train_file_path = os.path.join(cric_root, 'train_questions.json')
val_file_path = os.path.join(cric_root, 'val_questions.json')
test_file_path = os.path.join(cric_root, 'test_v1_questions.json')

In [5]:
# Training Set
# Parse the whole question file into memory. The encoding is given explicitly
# so decoding does not depend on the machine's locale.

with open(train_file_path, "r", encoding="utf-8") as train_fh:
    train_json = json.load(train_fh)

In [6]:
# Validation Set
# Parse the whole question file into memory. The encoding is given explicitly
# so decoding does not depend on the machine's locale.

with open(val_file_path, "r", encoding="utf-8") as val_fh:
    val_json = json.load(val_fh)

In [7]:
# Test Set
# Parse the whole question file into memory. The encoding is given explicitly
# so decoding does not depend on the machine's locale.

with open(test_file_path, "r", encoding="utf-8") as test_fh:
    test_json = json.load(test_fh)

In [8]:
len(train_json)

365235

In [9]:
len(val_json)

43112

In [10]:
len(test_json)

86003

In [11]:
train_json[1099]['question']

'which brown animal walking in the field could be used for transporting people'

In [12]:
val_json[1099]['question']

'is there an object that is a type of public transports'

In [13]:
test_json[1099]['question']

'can the ceramic bird spread wings'

### ------------------------------Extracting Data of Training Set-------------------------------------------------------------------------------



In [14]:
# Parallel accumulators for the training rows (question text, answer, image id).
questionList, answerList, imgList = [], [], []

In [15]:
train_json[2]['image_id']

1005

#### iter 1: indices 0 to 149000 -> error1.txt (159 indices to exclude)
#### iter 2: indices 150000 to 240000 -> error2.txt (34 indices to exclude)
#### iter 3: indices 240000 to 365235 -> error3.txt (121 indices to exclude)

In [16]:
# Collect the training-row indices to skip. Each error file lists one
# integer index per line.
indexToExclude = []

for errorFile in ('error1.txt', 'error2.txt', 'error3.txt'):
    with open(errorFile, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                # tolerate blank lines instead of crashing in int('')
                continue
            indexToExclude.append(int(line))

In [17]:
len(indexToExclude)

314

In [18]:
# A set gives O(1) membership tests; the original scanned the whole exclusion
# list once for every training row.
excludedTrainIdx = set(indexToExclude)

# Rebuild the lists from scratch so that re-running this cell does not append
# every row a second time.
questionList, answerList, imgList = [], [], []

for i, pointer in enumerate(tqdm(train_json)):

    if i in excludedTrainIdx:
        continue

    questionList.append(pointer['question'])
    answerList.append(pointer['answer'])
    imgList.append(pointer['image_id'])

100%|███████████████████████████████████| 365235/365235 [00:01<00:00, 313778.01it/s]


In [19]:
len(questionList), len(answerList), len(imgList)

(364921, 364921, 364921)

In [20]:
len(questionList), len(answerList), len(imgList)

(364921, 364921, 364921)

In [21]:
len(list(set(answerList)))

1442

### ---------------------------------------Map Creation--------------------------------------------------------

In [22]:
def findUnique(targetList):
    """Return the distinct items of ``targetList`` in first-occurrence order.

    Args:
        targetList: iterable of items to de-duplicate.

    Returns:
        A new list holding each distinct item once, ordered by where it
        first appears in ``targetList``.
    """
    # Materialise one-shot iterables so the fallback below can re-iterate.
    items = targetList if isinstance(targetList, (list, tuple)) else list(targetList)

    try:
        # dict keys keep insertion order and detect duplicates in O(1),
        # replacing the per-item scan of the result list.
        return list(dict.fromkeys(items))
    except TypeError:
        # Unhashable items (e.g. lists) cannot be dict keys: fall back to the
        # equality-based linear scan.
        uniqueList = []
        for word in items:
            if word not in uniqueList:
                uniqueList.append(word)
        return uniqueList

In [23]:
len(findUnique(answerList))

1442

In [24]:
# creating word to number mapping

uniqueAnsList = findUnique(answerList)

# Ids follow first-occurrence order: the first distinct answer gets 0,
# the next one 1, and so on.
mapping = {word: idx for idx, word in enumerate(uniqueAnsList)}
counter = len(mapping)

In [25]:
uniqueAnsList[0:5]

['no', 'small', 'picture', 'table', 'bookshelf']

In [26]:
# NOTE: despite its name this holds the largest label id in `mapping`, not the
# class count. The value shown here (1441) is one less than len(mapping)
# (1442), which is why the one-hot vectors elsewhere in this notebook are
# built with length numOfClasses+1.
numOfClasses = max(mapping.values())
numOfClasses

1441

In [27]:
len(mapping)

1442

In [28]:
# creating number to word mapping

reverse_mapping = {label: word for word, label in mapping.items()}

### --------------------------------------Processing of Training Set--------------------------------------------------------------------

In [29]:
# Integer class id of every training answer, in row order.
labels = [mapping[answer] for answer in answerList]

In [30]:
len(labels)

364921

In [31]:
scores = []

for answer in tqdm(answerList):
    # One-hot row: 1 at the answer's label id, 0 everywhere else.
    oneHot = [0] * (numOfClasses+1)
    oneHot[mapping[answer]] = 1
    scores.append(oneHot)

100%|████████████████████████████████████| 364921/364921 [00:04<00:00, 90998.65it/s]


In [32]:
len(scores)

364921

In [33]:
filepath = '/home/aritra/cric/images/img/'

# "<image_id>.jpg" inside the image directory, one path per training row.
imgPathList = [
    os.path.join(filepath, str(imageId) + '.jpg')
    for imageId in tqdm(imgList)
]

100%|███████████████████████████████████| 364921/364921 [00:00<00:00, 776738.89it/s]


In [34]:
from datasets import load_dataset
from datasets import Dataset
import datasets
from PIL import Image
import torch

In [35]:
imgPathList[0:5]

['/home/aritra/cric/images/img/1000.jpg',
 '/home/aritra/cric/images/img/1005.jpg',
 '/home/aritra/cric/images/img/1005.jpg',
 '/home/aritra/cric/images/img/1005.jpg',
 '/home/aritra/cric/images/img/1008.jpg']

In [36]:
len(imgPathList)

364921

In [37]:
# Column name -> per-row values; all four lists are index-aligned.
listToDictionary = {
    'questions': questionList,
    'labels': labels,
    'scores': scores,
    'images': imgPathList,
}
modified_train_set = Dataset.from_dict(listToDictionary)

In [38]:
# Re-type the "images" column (file-path strings so far) as a `datasets.Image` feature.
# NOTE(review): presumably this makes each row's image get loaded from its path on access — confirm against the datasets docs.

modified_train_set = modified_train_set.cast_column("images", datasets.Image())

In [39]:
modified_train_set

Dataset({
    features: ['questions', 'labels', 'scores', 'images'],
    num_rows: 364921
})

### ------------------------------------------------Extracting Validation Set---------------------------------------------

In [40]:
# Parallel accumulators for the validation rows (question text, answer, image id).
questionList_val, answerList_val, imgList_val = [], [], []

In [41]:
# collecting the indices of validation rows with erroneous images
# (each error file lists one integer index per line)

indexToExcludeVal = []
for errorFile in ('error_validation.txt', 'error_validation2.txt'):
    with open(errorFile, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                # tolerate blank lines instead of crashing in int('')
                continue
            indexToExcludeVal.append(int(line))


In [42]:
# excluding the indices of validation rows with erroneous images

# A set gives O(1) membership tests; the original scanned the whole exclusion
# list once for every validation row.
excludedValIdx = set(indexToExcludeVal)

# Rebuild the lists from scratch so that re-running this cell does not append
# every row a second time.
questionList_val, answerList_val, imgList_val = [], [], []

for i, pointer in enumerate(tqdm(val_json)):

    if i in excludedValIdx:
        continue

    questionList_val.append(pointer['question'])
    answerList_val.append(pointer['answer'])
    imgList_val.append(pointer['image_id'])

100%|██████████████████████████████████████| 43112/43112 [00:02<00:00, 16142.43it/s]


Validation rows: 43112 -> 43068 -> 33175 (after successive exclusions of erroneous indices)

In [43]:
len(questionList_val), len(answerList_val), len(imgList_val)

(33175, 33175, 33175)

In [44]:
# Distinct answers that occur in the filtered validation set
# (set() de-duplicates; the resulting order is not the row order).
uniqueAnswerListVal = list(set(answerList_val))
len(uniqueAnswerListVal)

266

In [45]:
# check if all the uniques answers are present in the mapping

# Row positions whose answer has no entry in the training mapping.
store = [pos for pos, word in enumerate(answerList_val) if word not in mapping]

# n = answers missing from the mapping, y = answers covered by it.
n = len(store)
y = len(answerList_val) - n

In [46]:
y

33175

### --------------------------------------------------------Processing Validation Set-------------------------------------------------------

In [47]:
# Integer class id of every validation answer, in row order.
labels_val = [mapping[answer] for answer in answerList_val]

In [48]:
len(labels_val)

33175

In [49]:
scores_val = []

for answer in tqdm(answerList_val):
    # One-hot row: 1 at the answer's label id, 0 everywhere else.
    oneHot = [0] * (numOfClasses+1)
    oneHot[mapping[answer]] = 1
    scores_val.append(oneHot)

100%|██████████████████████████████████████| 33175/33175 [00:00<00:00, 91852.72it/s]


In [50]:
len(scores_val)

33175

In [51]:
filepath = '/home/aritra/cric/images/img/'

# "<image_id>.jpg" inside the image directory, one path per validation row.
imgPathList_val = [
    os.path.join(filepath, str(imageId) + '.jpg')
    for imageId in tqdm(imgList_val)
]

100%|█████████████████████████████████████| 33175/33175 [00:00<00:00, 773570.73it/s]


In [52]:
imgPathList_val[0:5]

['/home/aritra/cric/images/img/1003.jpg',
 '/home/aritra/cric/images/img/1003.jpg',
 '/home/aritra/cric/images/img/1018.jpg',
 '/home/aritra/cric/images/img/1018.jpg',
 '/home/aritra/cric/images/img/1027.jpg']

In [53]:
# creating HF dataset to map images fast of Val_set

# Column name -> per-row values; all four lists are index-aligned.
listToDictionary = {
    'questions': questionList_val,
    'labels': labels_val,
    'scores': scores_val,
    'images': imgPathList_val,
}
modified_val_set = Dataset.from_dict(listToDictionary)

In [54]:
# Re-type the validation "images" column (file-path strings so far) as a `datasets.Image` feature.
# NOTE(review): presumably this makes each row's image get loaded from its path on access — confirm against the datasets docs.

modified_val_set = modified_val_set.cast_column("images", datasets.Image())

### -------------------------------------------Extracting Test Set-------------------------------------------------


In [55]:
# Parallel accumulators for the test rows (question text, answer, image id).
questionList_test, answerList_test, imgList_test = [], [], []

In [56]:
# Collect the test-row indices to skip. Each error file lists one integer
# index per line.
indexToExcludeTest = []

for errorFile in ('error_testSet1.txt', 'errorTestSet2.txt'):
    with open(errorFile, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                # tolerate blank lines instead of crashing in int('')
                continue
            indexToExcludeTest.append(int(line))

In [57]:
len(indexToExcludeTest)

14150

In [58]:
# A set gives O(1) membership tests; the original scanned the whole exclusion
# list (14150 entries) once for every test row.
excludedTestIdx = set(indexToExcludeTest)

# Rebuild the lists from scratch so that re-running this cell does not append
# every row a second time.
questionList_test, answerList_test, imgList_test = [], [], []

for i, pointer in enumerate(tqdm(test_json)):

    if i in excludedTestIdx:
        continue

    questionList_test.append(pointer['question'])
    answerList_test.append(pointer['answer'])
    imgList_test.append(pointer['image_id'])

100%|██████████████████████████████████████| 86003/86003 [00:07<00:00, 11040.24it/s]


Test rows: 86003 -> 71863 (after excluding the erroneous indices)

### -------------------------------------- Processing Test Set ----------------------------------------------------------------------------

In [59]:
# check if all the uniques answers are present in the mapping

# Row positions whose answer has no entry in the training mapping.
store = [pos for pos, word in enumerate(answerList_test) if word not in mapping]

# n = answers missing from the mapping, y = answers covered by it.
n = len(store)
y = len(answerList_test) - n

In [60]:
y

71863

In [61]:
# Integer class id of every test answer, in row order.
labels_test = [mapping[answer] for answer in answerList_test]

In [62]:
len(labels_test)

71863

In [63]:
scores_test = []

for answer in tqdm(answerList_test):
    # One-hot row: 1 at the answer's label id, 0 everywhere else.
    oneHot = [0] * (numOfClasses+1)
    oneHot[mapping[answer]] = 1
    scores_test.append(oneHot)

100%|██████████████████████████████████████| 71863/71863 [00:00<00:00, 90939.42it/s]


In [64]:
len(scores_test)

71863

In [65]:
filepath = '/home/aritra/cric/images/img/'

# "<image_id>.jpg" inside the image directory, one path per test row.
imgPathList_test = [
    os.path.join(filepath, str(imageId) + '.jpg')
    for imageId in tqdm(imgList_test)
]

100%|█████████████████████████████████████| 71863/71863 [00:00<00:00, 773385.17it/s]


In [66]:
len(imgPathList_test)

71863

In [67]:
imgPathList_test[0:5]

['/home/aritra/cric/images/img/1004.jpg',
 '/home/aritra/cric/images/img/1004.jpg',
 '/home/aritra/cric/images/img/1004.jpg',
 '/home/aritra/cric/images/img/1004.jpg',
 '/home/aritra/cric/images/img/1004.jpg']

In [68]:
# creating HF dataset to map images fast of test_set

# Column name -> per-row values; all four lists are index-aligned.
listToDictionary = {
    'questions': questionList_test,
    'labels': labels_test,
    'scores': scores_test,
    'images': imgPathList_test,
}
modified_test_set = Dataset.from_dict(listToDictionary)

In [69]:
# Re-type the test "images" column (file-path strings so far) as a `datasets.Image` feature.
# NOTE(review): presumably this makes each row's image get loaded from its path on access — confirm against the datasets docs.

modified_test_set = modified_test_set.cast_column("images", datasets.Image())

### -------------------------------End of Processing----------------------------------------------------------------------------

In [70]:
from transformers import ViltProcessor, ViltForQuestionAnswering

In [71]:
from transformers import ViltConfig
config = ViltConfig.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

In [72]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [73]:
pwd

'/home/aritra'

In [74]:
processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-mlm")

In [87]:
# Load the fine-tuned checkpoint and move it to `device`, passing the
# id<->answer tables built earlier as id2label / label2id.
# NOTE(review): this cell's execution count (87) is out of sequence with its
# neighbours (74 and 76) — confirm the notebook still runs top to bottom.
model = ViltForQuestionAnswering.from_pretrained("model_chkpts/test/vilt_mlm_color_e6_cric_trained", id2label = reverse_mapping, label2id = mapping).to(device)
# Alternative checkpoint, kept for reference:
#model = ViltForQuestionAnswering.from_pretrained("model_chkpts/vilt-mlm-classification-model/vilt_mlm_mod_e4_cric_trained/", id2label = reverse_mapping, label2id = mapping).to(device)

In [76]:
from datasets import Dataset

In [77]:
class cric_dataset(torch.utils.data.Dataset):
    """Map-style dataset that encodes one question/image row per access.

    The base class is the torch map-style ``Dataset``. The bare name
    ``Dataset`` in this notebook refers to the Hugging Face ``datasets.Dataset``,
    which is not meant to be subclassed with a custom ``__init__`` that skips
    its own initialisation.

    Args:
        dataset: indexable collection of rows; each row must provide the keys
            "images", "questions" and "scores".
        processor: callable invoked as
            ``processor(images=..., text=..., padding=..., truncation=...,
            return_tensors="pt")`` that returns a mapping of tensors.
    """

    def __init__(self, dataset, processor):
        self.processor = processor
        self.dataset = dataset

    def __len__(self):
        """Number of rows in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Encode row ``idx`` and attach its score vector under "labels".

        Returns:
            dict of tensors from the processor (leading batch dimension
            removed) plus "labels": the row's "scores" as a float32 tensor.
        """
        item = self.dataset[idx]

        encodings = self.processor(
            images=item["images"],
            text=item["questions"],
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # Remove only the leading size-1 batch dimension. A bare squeeze()
        # would also collapse any other dimension that happens to be 1.
        encodings = {k: v.squeeze(0) for k, v in encodings.items()}

        encodings['labels'] = torch.tensor(item['scores'], dtype=torch.float32)

        return encodings

In [78]:
test_dataset_object = cric_dataset(modified_test_set, processor)

In [79]:
val_dataset_object = cric_dataset(modified_val_set, processor)

In [None]:
# This function returns the accuracy (in percent) on the first rows of the Validation Set
# However, accuracy needs to be found out on the whole Test Set using the saved model chkpts

def calculateAccuracyVal(numSamples=1000):
    """Top-1 accuracy of `model` on the first `numSamples` validation rows.

    Args:
        numSamples: how many rows of `val_dataset_object` to evaluate, counted
            from index 0. Values above the dataset size are clamped to it.
            The default of 1000 matches the original hard-coded range.

    Returns:
        Percentage (0-100) of evaluated rows whose predicted answer equals
        `answerList_val[index]`; 0.0 when no row was evaluated.
    """
    # Never index past the end of the dataset.
    numSamples = min(numSamples, len(val_dataset_object))

    matchScore, loopCounter = 0, 0
    model.eval()

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for index in tqdm(range(0, numSamples)):

            loopCounter += 1

            test_example = val_dataset_object[index]
            # re-add the batch dimension and move every tensor to the model's device
            test_example = {k: v.unsqueeze(0).to(device) for k, v in test_example.items()}
            test_outputs = model(**test_example)

            test_logits = test_outputs.logits
            test_predicted_classes = torch.sigmoid(test_logits)
            test_ans = reverse_mapping[torch.argmax(test_predicted_classes).item()]

            # accuracy score
            if answerList_val[index] == test_ans:
                matchScore += 1

    print(matchScore, loopCounter)

    if loopCounter == 0:
        # nothing evaluated: avoid a division by zero
        return 0.0
    return ((matchScore/loopCounter)*100)

In [None]:
calculateAccuracyVal()

### Testing on Color Questions

In [80]:
# extracting the list of colors from the previously stored text files

with open('./text_files/colors.txt', 'r') as file:
    # one color name per line, with surrounding whitespace removed
    colors = [line.strip() for line in file]

In [81]:
colors[0:5]

['white', 'red', 'tan', 'brown', 'gray']

In [82]:
# adding leading and trailing space in the colors

colors_spaces = [f' {color} ' for color in colors]

In [83]:
colors_spaces[0:5]

[' white ', ' red ', ' tan ', ' brown ', ' gray ']

In [84]:
def isContainColor(targetString, colorTokens=None):
    """Tell whether ``targetString`` contains one of the color words.

    Args:
        targetString: the question text to inspect.
        colorTokens: optional iterable of space-padded color tokens such as
            ``' red '``; defaults to the notebook-level ``colors_spaces``.

    Returns:
        True if any token occurs in the question, False otherwise.
    """
    if colorTokens is None:
        colorTokens = colors_spaces

    # The tokens carry a leading and trailing space so they match whole words
    # only. Pad the question the same way; otherwise a color that is its
    # first or last word (e.g. "is the car red") can never match.
    paddedString = ' ' + targetString + ' '

    return any(token in paddedString for token in colorTokens)

In [88]:
# This function identifies the color questions and finds the accuracy on the test set

# Indices (into the test lists) of color questions the model answered wrongly;
# refilled on every call of findAccuracyColorQuestions().
misclassifiedIndex = []

def findAccuracyColorQuestions():
    """Accuracy of `model` on the color-related questions of the test set.

    A question counts as color-related when it contains the substring
    'color' or when isContainColor() reports a color word in it.

    Side effects:
        Empties and refills the module-level `misclassifiedIndex` list and
        prints a summary of the counts.

    Returns:
        Percentage (0-100) of color questions answered correctly; 0.0 when no
        color question was found.
    """
    matchScore, questionCount = 0, 0

    # Start from a clean slate so a second call does not keep indices from the
    # previous run, which would inflate the "Mistakenly Classified" count.
    misclassifiedIndex.clear()

    model.eval()

    print('***** Question About Colors ************')

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        # Cover every test question; the previous hard-coded bound of 71860
        # skipped the last rows of the 71863-row test set.
        for index in tqdm(range(len(questionList_test))):

            currQuestion = questionList_test[index]

            if not (('color' in currQuestion) or isContainColor(currQuestion)):
                continue

            questionCount += 1

            example = test_dataset_object[index]
            # re-add the batch dimension and move every tensor to the model's device
            example = {k: v.unsqueeze(0).to(device) for k, v in example.items()}
            outputs = model(**example)

            logits = outputs.logits
            predicted_classes = torch.sigmoid(logits)
            ans = reverse_mapping[torch.argmax(predicted_classes).item()]

            # accuracy score
            if answerList_test[index] == ans:
                matchScore += 1
            else:
                misclassifiedIndex.append(index)

    print(f'\nTotal {questionCount} questions found')
    print(f'\nCorrectly Classified {matchScore}')
    print(f'\nMistakenly Classified {len(misclassifiedIndex)}')

    if questionCount == 0:
        # no color question found: avoid a division by zero
        return 0.0
    return ((matchScore/questionCount)*100)

In [None]:
findAccuracyColorQuestions()

***** Question About Colors ************


 51%|████████████████████▉                    | 36668/71860 [06:56<07:36, 77.16it/s]