In [19]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import re
import gensim
from gensim.models.doc2vec import TaggedDocument
from konlpy.tag import Komoran
import cv2
import timm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms, datasets
from PIL import Image
import PIL
import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2
import torchvision.transforms as transforms
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from PIL import UnidentifiedImageError

In [3]:
data = pd.read_csv('./cleaned_data.csv')

In [4]:
data

Unnamed: 0,clean_title,imgp_path,2_way_label,3_way_label,6_way_label
0,my xbox controller says hi,./images/cypw96.jpg,1,0,0
1,new image from the mandalorian,./images/d0bzlq.jpg,1,0,0
2,say hello to my little friend,./images/d2ezoob.jpg,0,2,4
3,watch your step little one,./images/cjqctpw.jpg,0,2,4
4,this tree i found with a solo cup on it,./images/bq3yuk.jpg,1,0,0
...,...,...,...,...,...
16128,owls in the garden,./images/55o9dn.jpg,0,2,2
16129,this plate with mirrored printing,./images/bjmw3x.jpg,1,0,0
16130,shots fired in munich shopping centre german p...,./images/4u3ro5.jpg,1,0,0
16131,cat with no name,./images/dgvbhsw.jpg,0,2,4


In [42]:
label = data[['2_way_label','3_way_label','6_way_label']]

In [45]:
label = label.drop(not_found,axis=0)

In [46]:
label.to_csv('./label.csv',index=False)

# Text Vector Extraction

In [8]:
tagged_corpus_list_tr = []

for index, row in tqdm(data.iterrows(), total=len(data)):
    text = row['clean_title']
    tagged_corpus_list_tr.append(TaggedDocument(tags=['tweet_'+str(index)], words=pos_tag(word_tokenize(text))))

print('문서의 수 :', len(tagged_corpus_list_tr))

100%|███████████████████████████████████████████████████████████████████████████| 16133/16133 [00:16<00:00, 983.65it/s]

문서의 수 : 16133





In [9]:
model_d2v = gensim.models.Doc2Vec(dm=1, # dm = 1 for ‘distributed memory’ model
                                  dm_mean=1, # dm_mean = 1 for using mean of the context word vectors
                                  vector_size=100, # no. of desired features
                                  window=5, # width of the context window                                  
                                  negative=7, # if > 0 then negative sampling will be used
                                  min_count=3, # Ignores all words with total frequency lower than 5.                                  
                                  workers=32, # no. of cores                                  
                                  alpha=0.1, # learning rate                                  
                                  seed = 1992, # for reproducibility
                                 ) 

In [10]:
model_d2v.build_vocab([i for i in tqdm(tagged_corpus_list_tr)])

model_d2v.train(tagged_corpus_list_tr, total_examples= len(data), epochs=100)

100%|███████████████████████████████████████████████████████████████████████| 16133/16133 [00:00<00:00, 4048262.42it/s]


In [11]:
docvec_arrays = np.zeros((len(data), 100)) 
for i in range(0,len(data)):
    docvec_arrays[i,:] = model_d2v.docvecs[i].reshape((1,100))
doc_vectors = pd.DataFrame(docvec_arrays) 
doc_vectors.shape

  docvec_arrays[i,:] = model_d2v.docvecs[i].reshape((1,100))


(16133, 100)

In [32]:
doc_vectors.loc[not_found]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
390,0.169338,-0.804159,-1.183808,0.493504,-0.466136,0.101025,-0.751572,0.017945,-0.049563,-0.534779,...,-0.441961,0.190417,-0.618389,0.19978,-0.704264,-0.220889,0.668152,0.170674,-0.144629,0.316792
1969,-0.008278,-0.095776,-0.232162,0.107413,0.121722,0.343452,-0.240031,0.212005,-0.132265,-0.103311,...,-0.115178,0.110888,0.10243,-0.160627,-0.553909,0.282147,0.25074,0.251109,0.227181,0.053129
3875,-0.03967,-0.249208,-0.33516,0.240082,-0.048741,0.014932,-0.125371,-0.008539,-0.008167,0.258666,...,-0.31733,0.111297,-0.024402,0.136717,-0.310261,0.307727,0.177254,-0.237123,0.05722,0.197534
4706,0.727731,-1.884009,-0.998834,1.204885,0.570737,1.364665,0.121091,-0.23824,-0.927488,0.495775,...,-0.837781,-1.066733,0.60269,0.752943,-0.748363,0.971337,0.210755,-0.979979,1.240245,0.277094
5258,0.098123,-0.040374,-0.098205,0.130279,-0.480844,-0.069436,-0.213159,-0.220203,0.015354,0.055723,...,-0.098882,0.274266,-0.071504,-0.272566,-0.345751,0.130803,0.298907,0.039116,-0.000299,0.05785
7407,-0.016982,-0.023693,-0.067701,0.076316,0.040418,-0.077901,-0.46855,0.191818,-0.28277,-0.129937,...,-0.294649,0.103181,-0.079723,-0.080642,-0.025865,0.217475,0.17719,0.208205,0.033597,0.185633
11656,-0.62456,0.266598,-0.75369,0.314735,0.139749,-0.535464,-1.003245,0.157175,-0.066106,0.091529,...,-0.352933,0.210839,0.479076,-0.331388,-1.374741,-0.129565,0.365035,0.327523,0.630815,0.48667
11764,-0.289868,-0.413709,-0.522897,0.726441,-0.41188,0.354988,0.203507,1.006371,0.017621,0.565907,...,-0.427107,0.080942,-0.532817,0.144673,-0.439478,0.092508,0.145816,-0.173946,0.115219,0.58498
16099,0.469846,-0.203012,-0.55697,0.528917,-0.313523,0.087793,-0.299316,0.232435,-0.255976,-0.339285,...,-0.649238,0.673175,-0.392579,0.573053,-0.482743,0.711357,0.35221,0.681669,-0.175963,0.356458


In [38]:
doc_vectors=doc_vectors.drop(not_found,axis=0)

In [40]:
doc_vectors.reset_index(drop=True,inplace=True)

In [41]:
doc_vectors.to_csv('text_vectors.csv')

# Image Vector Extraction

In [13]:
CFG = {
    'IMG_SIZE':256,
    'EPOCHS':30,
    'LEARNING_RATE':3e-4,
    'BATCH_SIZE':256,
    'SEED':41
}
# 이미지 사이즈, 에폭, 학습률, 배치사이즈, 시드 고정

In [21]:
cnn_extract = timm.create_model('seresnext26d_32x4d', pretrained=True)
cnn_extract=nn.Sequential(*list(cnn_extract.children())[:-1],
                              nn.Linear(2048,1024),
                              nn.ReLU(),
                              nn.Linear(1024,100),
                              nn.ReLU())

In [22]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
])

In [23]:
data

Unnamed: 0,clean_title,imgp_path,2_way_label,3_way_label,6_way_label
0,my xbox controller says hi,./images/cypw96.jpg,1,0,0
1,new image from the mandalorian,./images/d0bzlq.jpg,1,0,0
2,say hello to my little friend,./images/d2ezoob.jpg,0,2,4
3,watch your step little one,./images/cjqctpw.jpg,0,2,4
4,this tree i found with a solo cup on it,./images/bq3yuk.jpg,1,0,0
...,...,...,...,...,...
16128,owls in the garden,./images/55o9dn.jpg,0,2,2
16129,this plate with mirrored printing,./images/bjmw3x.jpg,1,0,0
16130,shots fired in munich shopping centre german p...,./images/4u3ro5.jpg,1,0,0
16131,cat with no name,./images/dgvbhsw.jpg,0,2,4


In [24]:
imag_fetures=[]
img_path_list= data['imgp_path']
not_found=[]
for ind in tqdm(range(0,len(data))):
    try:
        image = PIL.Image.open(data['imgp_path'][ind]).convert('RGB')
        img = transform(image)
        img = img.reshape(1,3,224,224)
        img_feature = cnn_extract(img)
        img_feature = img_feature[0].detach().numpy()
        imag_fetures.append(img_feature)
    except UnidentifiedImageError:
        pass
        not_found.append(ind)

100%|████████████████████████████████████████████████████████████████████████████| 16133/16133 [24:33<00:00, 10.95it/s]


In [26]:
img_vec = pd.DataFrame(imag_fetures)

In [28]:
img_vec.to_csv('./img_vecs.csv')