In [1]:
from google.colab import drive

# Mount the user's Google Drive inside the Colab VM; every dataset path used
# further down in this notebook lives under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import json
import numpy as np
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import cv2
import traceback
from PIL import Image
from torchvision import transforms as T



In [3]:
# Fetch the lexicon that SentimentIntensityAnalyzer needs at construction time.
VADER_RESOURCE = 'vader_lexicon'
nltk.download(VADER_RESOURCE)

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [4]:
# Convert the raw JSONL annotation files into CSV files enriched with the four
# VADER sentiment scores of each meme caption.
DATA_DIR = '/content/drive/MyDrive/hateful-memes/'
CSV_COLUMNS = ['id', 'text', 'img', 'label', 'nltk1', 'nltk2', 'nltk3', 'nltk4']

sid = SentimentIntensityAnalyzer()


def jsonl_to_csv(jsonl_name, csv_name, default_label=None):
    """Write one annotation file as CSV with VADER sentiment columns.

    Parameters
    ----------
    jsonl_name : str
        File name of the JSONL input, relative to ``DATA_DIR``.
    csv_name : str
        File name of the CSV output, relative to ``DATA_DIR + 'csv/'``.
    default_label : optional
        When given, this value is written to the ``label`` column for every
        row instead of reading ``label`` from the record (used for the test
        splits, which get a placeholder label).

    The columns nltk1..nltk4 hold the 'neg', 'neu', 'pos' and 'compound'
    entries returned by ``sid.polarity_scores`` for the record's text.
    """
    rows = []
    # `with` guarantees the handle is closed even if a record fails to parse.
    with open(DATA_DIR + jsonl_name, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            jsonobject = json.loads(line)
            nltksen = sid.polarity_scores(jsonobject['text'])
            label = jsonobject['label'] if default_label is None else default_label
            rows.append([jsonobject['id'], jsonobject['text'], jsonobject['img'], label,
                         nltksen['neg'], nltksen['neu'], nltksen['pos'], nltksen['compound']])

    pd.DataFrame(rows, columns=CSV_COLUMNS).to_csv(DATA_DIR + 'csv/' + csv_name, index=None)


jsonl_to_csv('train.jsonl', 'train.csv')
jsonl_to_csv('dev_unseen.jsonl', 'dev1.csv')
# NOTE(review): dev2.csv is generated from the same dev_unseen.jsonl as
# dev1.csv, so the two files are identical. This looks like a copy-paste slip
# (another dev split may have been intended) -- confirm which file belongs here.
jsonl_to_csv('dev_unseen.jsonl', 'dev2.csv')
# The test splits are written with a placeholder label of 1.
jsonl_to_csv('test_seen.jsonl', 'test1.csv', default_label=1)
jsonl_to_csv('test.jsonl', 'test2.csv', default_label=1)

In [5]:
img_size = 256
def resize_to_square(im):
    """Fit `im` into an img_size x img_size canvas without distorting it.

    The image is scaled so that its longer side becomes `img_size`, then
    padded with black borders (split as evenly as possible between the two
    sides) until both dimensions equal `img_size`.

    `im` is indexed as (height, width, ...) and is passed to the cv2 resize
    and border routines; the padded image is returned.
    """
    src_h, src_w = im.shape[0], im.shape[1]
    scale = float(img_size) / max(im.shape[:2])
    scaled_h = int(src_h * scale)
    scaled_w = int(src_w * scale)

    # cv2.resize expects the target size as (width, height).
    resized = cv2.resize(im, (scaled_w, scaled_h))

    pad_h = img_size - scaled_h
    pad_w = img_size - scaled_w
    pad_top = pad_h // 2
    pad_bottom = pad_h - pad_top
    pad_left = pad_w // 2
    pad_right = pad_w - pad_left

    black = [0, 0, 0]
    return cv2.copyMakeBorder(resized, pad_top, pad_bottom, pad_left, pad_right,
                              cv2.BORDER_CONSTANT, value=black)

def load_image(path, name):
    """Load an image file and return it as an img_size x img_size numpy array.

    Parameters
    ----------
    path : str
        Directory prefix; concatenated directly with `name` (so it must end
        with a path separator).
    name : str
        File name (or relative path) of the image.

    Returns
    -------
    numpy.ndarray
        The image resized so its shorter side is `img_size` and then
        centre-cropped to (img_size, img_size). Single-channel images are
        expanded to three identical channels and images with more than three
        channels keep only the first three.
        NOTE(review): two-channel images (e.g. grey + alpha) are returned with
        two channels, exactly as before -- confirm whether callers can see them.
    """
    resize_and_crop = T.Compose([
        # T.Scale was the old name of T.Resize and no longer exists in
        # current torchvision releases.
        T.Resize(img_size),
        T.CenterCrop((img_size, img_size)),
    ])

    # PIL loads lazily, so the pixels must be materialised before the file is
    # closed at the end of the `with` block.
    with Image.open(path + name) as image:
        new_image = np.array(resize_and_crop(image))

    if len(new_image.shape) == 2:
        new_image = np.repeat(new_image.reshape(img_size, img_size, 1), 3, axis=2)

    if new_image.shape[2] > 3:
        new_image = new_image[:, :, :3]
    return new_image

In [6]:
# Load the CSV files produced by the conversion cell.
CSV_DIR = '/content/drive/MyDrive/hateful-memes/csv/'
train = pd.read_csv(CSV_DIR + 'train.csv')
train2 = pd.read_csv(CSV_DIR + 'dev1.csv')
test1 = pd.read_csv(CSV_DIR + 'test1.csv')
train3 = pd.read_csv(CSV_DIR + 'dev2.csv')
test2 = pd.read_csv(CSV_DIR + 'test2.csv')

# DataFrame.append was removed in pandas 2.0; pd.concat is the drop-in
# replacement (row order and index values are the same as with append).
test = pd.concat([test1, test2])
# Merge both dev files, keeping the dev2 row when an id occurs in both.
train2 = pd.concat([train3, train2]).drop_duplicates('id', keep='first')

In [7]:
# Find memes that share the same caption (ignoring quotes and spaces) and
# write every such id pair to text_pairs.csv.
#
# id_pic   : meme id -> caption text (train, dev and test rows)
# id_label : meme id -> label        (train and dev rows only)
id_pic = {}
id_label = {}
for frame in (train, train2):
    frame_ids = frame['id'].tolist()
    id_pic.update(zip(frame_ids, frame['text'].tolist()))
    id_label.update(zip(frame_ids, frame['label'].tolist()))
id_pic.update(zip(test['id'].tolist(), test['text'].tolist()))

isfound = set()     # "a b" and "b a" for every matching pair; reused by later cells
features = {}       # meme id -> normalised caption
ids_by_text = {}    # normalised caption -> ids seen so far, in insertion order
pairs = 0
# count1/count2 are never incremented; they are kept only so the printed
# summary keeps its original first line.
count1 = 0
count2 = 0

with open('/content/drive/MyDrive/hateful-memes/hate/hateful/model/text_pairs.csv', 'w+') as f:
    for meme_id, text in id_pic.items():
        key = text.replace('"', '').replace("'", '').replace(" ", '')
        features[meme_id] = key
        # Grouping by caption yields exactly the pairs (and the order) that a
        # scan over all previously seen ids would, without the quadratic cost.
        earlier_ids = ids_by_text.setdefault(key, [])
        for other_id in earlier_ids:
            pair = str(meme_id) + ' ' + str(other_id)
            print(pair, file=f)
            pairs += 1
            isfound.add(pair)
            isfound.add(str(other_id) + ' ' + str(meme_id))
        earlier_ids.append(meme_id)

print(count1, count2)
print(pairs)
print(len(isfound))

0 0
3707
7414


In [8]:
# Find memes whose images look alike by comparing small pixel patches taken
# from several fixed windows, and write every matching id pair to img_pairs.csv.
#
# id_pic   : meme id -> image path relative to IMG_ROOT (train, dev and test rows)
# id_label : meme id -> label                           (train and dev rows only)
id_pic = {}
id_label = {}
for frame in (train, train2):
    frame_ids = frame['id'].tolist()
    id_pic.update(zip(frame_ids, frame['img'].tolist()))
    id_label.update(zip(frame_ids, frame['label'].tolist()))
id_pic.update(zip(test['id'].tolist(), test['img'].tolist()))
ids_list = list(id_pic.keys())

IMG_ROOT = "/content/drive/MyDrive/hateful-memes/"

# (row window, column window) of the patch compared in each pass.
PATCH_WINDOWS = [
    np.s_[160:188:2, 160:188:2],
    np.s_[175:190:1, 150:165:1],
    np.s_[100:128:2, 100:128:2],
    np.s_[80:100:2, 175:200:2],
]


def find_patch_pairs(ids, id_to_img, window, found, out_file):
    """Run one matching pass over `ids` using the patch selected by `window`.

    Each image is loaded with `load_image`, its patch is stored, and the patch
    is compared against the patches of all ids processed earlier in this pass.
    The first earlier id whose patch differs by less than 2.5 in both the mean
    and the standard deviation of the element-wise difference is reported.

    Parameters
    ----------
    ids : iterable
        Meme ids to process, in order.
    id_to_img : dict
        Maps a meme id to its image path relative to ``IMG_ROOT``.
    window : tuple of slice
        Row and column slices selecting the patch.
    found : set
        Pair strings ("a b" and "b a") already reported; matches are added to
        it in place, and pairs already present are skipped.
    out_file : file object
        Every new pair is written to it as "id other_id".

    Returns
    -------
    int
        Number of new pairs found in this pass.
    """
    patches = {}
    n_found = 0
    for cur_id in ids:
        try:
            image = load_image(IMG_ROOT, id_to_img[cur_id])
            patch = image[window].astype(int)
            patches[cur_id] = patch

            # Only sufficiently textured patches may start a match. This test
            # does not depend on the candidate, so it is done once up front;
            # it also rejects constant patches (their std is 0).
            if not np.std(patch.sum(axis=2)) > 5:
                continue

            for other_id, other_patch in patches.items():
                pair = str(cur_id) + ' ' + str(other_id)
                if cur_id == other_id or pair in found:
                    continue
                diff = other_patch - patch
                if abs(np.mean(diff)) < 2.5 and np.std(diff) < 2.5:
                    print(pair, file=out_file)
                    found.add(pair)
                    found.add(str(other_id) + ' ' + str(cur_id))
                    n_found += 1
                    # At most one match is reported per id and pass.
                    break
        except Exception:
            # Best effort: report the failing image and carry on with the
            # rest. KeyboardInterrupt is deliberately not swallowed.
            print(cur_id, str(traceback.format_exc()))
    return n_found


isfound3 = set()
pairs = 0
with open('/content/drive/MyDrive/hateful-memes/hate/hateful/model/img_pairs.csv', 'w+') as f:
    for window in PATCH_WINDOWS:
        pairs += find_patch_pairs(ids_list, id_pic, window, isfound3, f)
        # Running total after each pass.
        print(pairs)

# NOTE(review): this prints the size of the *text* pair set from the previous
# cell; `isfound3` (the image pair set) was probably intended -- confirm.
print(len(isfound))



1489
1973
2420
2711
7414


In [11]:
# Memes that match on both image (isfound3) and caption (isfound) are treated
# as duplicates. Pairs whose known labels disagree are only printed; all other
# pairs are written to same_id.csv and the larger id is collected in bad_id.
bad_id = set()

# Both sets hold every pair in both orders ("a b" and "b a"), so normalise to
# sorted (small, large) tuples and de-duplicate; otherwise each pair would be
# handled and written twice. Sorting makes the output order reproducible.
common_pairs = sorted({
    tuple(sorted(int(x) for x in pair.split(" ")))
    for pair in isfound3
    if pair in isfound
})

with open('/content/drive/MyDrive/hateful-memes/hate/hateful/model/same_id.csv', 'w+') as f:
    for first_id, second_id in common_pairs:
        # `array` is kept as a name because the next cell prints it.
        array = [first_id, second_id]
        first_label = id_label.get(first_id, -1)     # -1: no label known (e.g. test rows)
        second_label = id_label.get(second_id, -1)
        if first_label != second_label and first_label != -1 and second_label != -1:
            # Same image and caption but contradictory labels: report only.
            print(array, first_label, second_label)
        else:
            print(str(first_id) + " " + str(second_id), file=f)
            bad_id.add(second_id)
print(len(bad_id))

138


In [12]:
# Inspect the pair left in `array` by the loop of the previous cell together
# with the labels recorded for its two ids (-1 when an id has no label).
first_label = id_label.get(int(array[0]),-1)
second_label = id_label.get(int(array[1]),-1)
print(array, first_label, second_label)

[35097, 36804] 0 0
