In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils as utils
import torch.utils.data as data
import torchvision.models as models
import torchvision.utils as v_utils
import torchvision.datasets as datasets
import torchvision.transforms as transforms

from sklearn.manifold import TSNE
# from tsnecuda import TSNE
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib.offsetbox import OffsetImage, AnnotationBbox, TextArea
from matplotlib.cbook import get_sample_data
from PIL import ImageFile
import os
from torch.utils.data import DataLoader
import time
# import cupy as cp
from IPython.display import clear_output
import pandas as pd
import gc

In [2]:
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 2070'

In [3]:
ImageFile.LOAD_TRUNCATED_IMAGES = True
image_size = 256
PATH = "D:CropWebtoons/"

In [None]:
data = datasets.ImageFolder(PATH,transform= transforms.Compose([
        transforms.Resize(image_size),
        transforms.CenterCrop(image_size),
        transforms.ToTensor(),
#         transforms.Normalize((0.5,0.5,0.5),(0.5,0.5,0.5))
        transforms.Grayscale(num_output_channels=3)
        ]))

print(data.class_to_idx)

class_to_idx = data.class_to_idx
idx_to_class = {}
for key, value in enumerate(class_to_idx):
    idx_to_class[key] = value
    
print(idx_to_class)
all_file_idx_to_class = idx_to_class
# img_list = []
# for i in data.imgs:
#     img_list.append(i[0])

#img_list2 = []

#for img in os.listdir('/content/drive/My Drive/dataset/thumnail'):
#    img_list2.append(os.path.join('/content/drive/My Drive/dataset/thumnail',img))
#img_list2.sort()

In [4]:
resnet = models.resnet50(pretrained=True)

class Resnet(nn.Module):
    def __init__(self):
        super(Resnet,self).__init__()
        self.layer0 = nn.Sequential(*list(resnet.children())[0:1])
        self.layer1 = nn.Sequential(*list(resnet.children())[1:4])
        self.layer2 = nn.Sequential(*list(resnet.children())[4:5])
        self.layer3 = nn.Sequential(*list(resnet.children())[5:6])
        #self.layer4 = nn.Sequential(*list(resnet.children())[6:7])
        #self.layer5 = nn.Sequential(*list(resnet.children())[7:8])

    def forward(self,x):
        out_0 = self.layer0(x)
        out_1 = self.layer1(out_0)
        out_2 = self.layer2(out_1)
        out_3 = self.layer3(out_2)
        #out_4 = self.layer4(out_3)
        #out_5 = self.layer5(out_4)

        return out_0, out_1, out_2, out_3, # out_4, out_5

  f"The parameter '{pretrained_param}' is deprecated since 0.13 and may be removed in the future, "


In [5]:
class GramMatrix(nn.Module):
    def forward(self, input):
        b,c,h,w = input.size()
        F = input.view(b, c, h*w)
        G = torch.bmm(F, F.transpose(1,2)) 
        return G

class GramMSELoss(nn.Module):
    def forward(self, input, target):
        out = nn.MSELoss()(GramMatrix()(input), target)
        return out

In [6]:
resnet = Resnet().cuda(0)
for param in resnet.parameters():
    param.requires_grad = False

In [7]:
import pandas as pd

def save_vec(result, label_arr, last_split_data_status):
    image_result_vec_df = pd.DataFrame({'WebToonTitle':[], 'Xmean':[], 'Ymean':[]})
    print(len(label_arr))

    divide_idx = []
    idx = 0
    for i in range(len(label_arr)):
        if((i != 0) & (label_arr[i] != label_arr[i-1])):
            divide_idx.append([idx, i])
            idx = i
    if last_split_data_status:
        divide_idx.append([idx, len(label_arr)])

    print(divide_idx)

    for idx in divide_idx:
        image_result_vec_df = image_result_vec_df.append({'WebToonTitle':[idx_to_class[label_arr[idx[0]]]][0],
                                                          'Xmean':result[idx[0]:idx[1]][0].mean(),
                                                          'Ymean':result[idx[0]:idx[1]][1].mean()},
                                                         ignore_index=True)
    load_image_result_vec_df = pd.read_csv('D:webtoon_vec_mean.csv', encoding='UTF-8', index_col=0)
    load_image_result_vec_df = load_image_result_vec_df.append(image_result_vec_df)
    load_image_result_vec_df.to_csv('D:webtoon_vec_mean.csv', encoding='UTF-8')
# image_result_vec_df.to_csv('D:webtoon_vec_mean.csv', encoding='UTF-8')
# image_result_vec_df

In [8]:
def image_feature_extract(data):
    total_arr = []

    for idx,image in enumerate(data):
        i = image.cuda()
        i = i.view(-1,i.size()[0],i.size()[1],i.size()[2])

        style_target = list(GramMatrix().cuda()(i) for i in resnet(i))

        arr = torch.cat([style_target[0].view(-1),style_target[1].view(-1),style_target[2].view(-1),style_target[3].view(-1)],0)
        gram = arr.cpu().data.numpy().reshape(1,-1)

        total_arr.append(gram.reshape(-1))
        
        if idx % 100 == 0 and idx != 0:
            print(f'{idx} images style feature extracted..[{round(idx / len(data), 2) * 100}%]')
    print('Image style feature extraction done.')
    return total_arr

In [9]:
def get_split_data(data, idx , start_idx):
    data_len = len(data)
    split_image_data = []
    split_label_data = []
    count = 0
    for i in range(start_idx, data_len):
        if count != 5:
            split_label_data.append(data[i][1])
            split_image_data.append(data[i][0].cuda())
        if (count == 5) & (data_len - 1 != i):
            start_idx = i
            break
        if (data_len - 1 == i) | (split_label_data[i-1-start_idx] != split_label_data[i-start_idx]) & (i != start_idx):
            print(f"{count+1}번째 웹툰까지 누적 그림갯수는 {i}개이다")
            count += 1
    print(f"{int((idx+1)/5)}번째 split_data생성 완료")
    return split_image_data, split_label_data, start_idx

In [12]:
torch.cuda.empty_cache()

In [13]:
all_file_len = 552
for file_idx in range(int(all_file_len/30)+1):
    gc.collect()
    torch.cuda.empty_cache()
    if int(all_file_len/30) == file_idx:
        PATH = "D:CropWebtoons"+str(file_idx*30+1)+'-'+str(all_file_len)+'/'
    else:
        PATH = "D:CropWebtoons"+str(file_idx*30+1)+'-'+str((file_idx+1)*30)+'/'
    data = datasets.ImageFolder(PATH,transform= transforms.Compose([
        transforms.Resize(image_size),
        transforms.CenterCrop(image_size),
        transforms.ToTensor(),
#         transforms.Normalize((0.5,0.5,0.5),(0.5,0.5,0.5))
        transforms.Grayscale(num_output_channels=3)
        ]))

    class_to_idx = data.class_to_idx
    idx_to_class = {}
    for key, value in enumerate(class_to_idx):
        idx_to_class[key] = value
    print(idx_to_class)

    start_idx = 0
    titles_len = len(idx_to_class)
    last_split_data_status = False

    for idx in range(4,len(idx_to_class),5):
        start = time.time()
        print(f"완료율:{((0 if idx == 4 else idx+1)/titles_len)*100}%")
        label_arr = []
        split_image_data, label_arr, start_idx = get_split_data(data, idx, start_idx)
        totar_arr = []
        total_arr = image_feature_extract(split_image_data)
        model = TSNE(n_components=2, init='pca',random_state=0, verbose=3, perplexity=100)
        result = model.fit_transform(total_arr)
        if (idx + 1) == len(idx_to_class):
            last_split_data_status = True
        save_vec(result, label_arr, last_split_data_status)
        print("time :", time.time() - start)
        del split_image_data
        gc.collect()
        torch.cuda.empty_cache()
    clear_output(wait=True)
    print(f"{PATH}까지 완료")
print('모든 백터값 추출 완료')

D:CropWebtoons541-552/까지 완료
모든 백터값 추출 완료


In [14]:
mean_vec = pd.read_csv("D:webtoon_vec_mean.csv",encoding='utf-8')
mean_vec

Unnamed: 0.1,Unnamed: 0,WebToonTitle,Xmean,Ymean
0,0,AI 유하,34957.100000,104387.420000
1,1,겨울특강,33205.240000,107172.130000
2,2,기억해줘,-78063.540000,126712.400000
3,3,내일,63022.620000,59304.660000
4,4,노량진 공격대,-28374.607000,107774.200000
...,...,...,...,...
595,0,회귀한 천재 헌터의 슬기로운 청소생활,-100460.500000,176208.562500
596,1,후궁 스캔들,-115110.718750,-78606.601562
597,2,후덜덜덜 남극전자,-62509.250000,-39704.191406
598,3,흑막 여주가 날 새엄마로 만들려고 해,-78394.632812,-35544.019531


In [23]:
mean_vec = mean_vec.loc[52:,['WebToonTitle','Xmean','Ymean']]

In [26]:
mean_vec = pd.read_csv('D:webtoon_vec_mean.csv', encoding='ansi')

In [31]:
mean_vec = mean_vec.iloc[:,1:]

In [32]:
mean_vec.to_csv('D:webtoon_vec_mean.csv', encoding='ansi')

In [None]:
all_file_idx_to_class

In [None]:
print(len(label_arr))
label_arr[3345]

In [None]:
start = time.time()
model = TSNE(n_components=2, init='pca',random_state=0, verbose=3, perplexity=100)
result = model.fit_transform(total_arr)
print("time :", time.time() - start)

In [None]:
def imscatter(x, y, image, ax=None, zoom=1, show_by_thumnail=False, title='webtoon'):
    if ax is None:
        ax = plt.gca()
    try:
        image = plt.imread(image)
    except TypeError:
        # Likely already an array...
        pass
    im = OffsetImage(image, zoom=zoom)

    # Convert inputs to arrays with at least one dimension.
    x, y = np.atleast_1d(x, y)
    
    artists = []
    for x0, y0 in zip(x, y):
        ab = AnnotationBbox(im, (x0, y0), xycoords='data', frameon=False)
        

        if show_by_thumnail:
            offsetbox = TextArea(title, minimumdescent=False)
            ac = AnnotationBbox(offsetbox, (x0, y0),
                        xybox=(20, -40),
                        xycoords='data',
                        boxcoords="offset points")
            artists.append(ax.add_artist(ac))
        artists.append(ax.add_artist(ab))

    ax.update_datalim(np.column_stack([x, y]))
    ax.autoscale()
    return artists

In [None]:
plt.figure(figsize=(20, 12))

for i in range(0,2292):
    img_path = img_list[i]
    plt.scatter(result[i][0], result[i][1], c='red')
for i in range(2292, 3219):
    img_path = img_list[i]
    plt.scatter(result[i][0], result[i][1], c='blue')
# for i in range(3220, 4206):
#     img_path = img_list[i]
#     plt.scatter(result[i][0], result[i][1], c='green')
#     imscatter(result[i,0],result[i,1], image=img_path, zoom=0.05)
plt.show()

In [None]:
plt.figure(figsize=(20, 12))

for i in range(0,2292):
    img_path = img_list[i]
    plt.scatter(result[i][0], result[i][1], c='red')
# for i in range(2292, 3219):
#     img_path = img_list[i]
#     plt.scatter(result[i][0], result[i][1], c='blue')
for i in range(3220, 4206):
    img_path = img_list[i]
    plt.scatter(result[i][0], result[i][1], c='green')
#     imscatter(result[i,0],result[i,1], image=img_path, zoom=0.05)
plt.show()

In [None]:
plt.figure(figsize=(20, 12))

# for i in range(0,2292):
#     img_path = img_list[i]
#     plt.scatter(result[i][0], result[i][1], c='red')
for i in range(2292, 3219):
    img_path = img_list[i]
    plt.scatter(result[i][0], result[i][1], c='blue')
for i in range(3220, 4206):
    img_path = img_list[i]
    plt.scatter(result[i][0], result[i][1], c='green')
#     imscatter(result[i,0],result[i,1], image=img_path, zoom=0.05)
plt.show()

In [None]:
plt.figure(figsize=(20, 12))

for i in range(0,2292):
    img_path = img_list[i]
    plt.scatter(result[i][0], result[i][1], c='red')
for i in range(2292, 3219):
    img_path = img_list[i]
    plt.scatter(result[i][0], result[i][1], c='blue')
# for i in range(3220, 4206):
#     img_path = img_list[i]
#     plt.scatter(result[i][0], result[i][1], c='green')
#     imscatter(result[i,0],result[i,1], image=img_path, zoom=0.05)
plt.show()

In [None]:
len(result)

In [None]:
plt.scatter(result[:2292][0].mean(), result[:2292][1].mean())
plt.scatter(result[2292:3219][0].mean(),result[2292:3219][1].mean())
plt.scatter(result[3219:][0].mean(),result[3219:][1].mean())
plt.show()

In [None]:
avg_list = []
scatter_x = result[:, 0]
scatter_y = result[:, 1]
group = np.array(label_arr)

for g in np.unique(group):
    i = np.where(group==g)
    x_avg = np.mean(scatter_x[i])
    y_avg = np.mean(scatter_y[i])
    avg_list.append((x_avg, y_avg))

In [None]:
plt.figure(figsize=(20, 12))

for i in range(len(avg_list)):
    img_path = img_list2[i]
    imscatter(avg_list[i][0],avg_list[i][1], image=img_path,zoom=0.6, show_by_thumnail=True, title=idx_to_class[i])
plt.show()

In [None]:
import json
import os
import numpy as np

def configInfo(file):
    with open(file, 'r', encoding='utf-8') as f:
        config = json.load(f)
    return config

def imgList(data, config='config.json'):

    thumnail_path = configInfo(config)["path"]["thumnail"]
    img_list, img_list2 = [], []

    for img in data.imgs:
        img_list.append(img[0])

    for img in os.listdir(thumnail_path):
        img_list2.append(os.path.join(thumnail_path,img))
    img_list2.sort()

    return img_list, img_list2

def avgList(result, label_arr):

    avg_list = []
    scatter_x = result[:, 0]
    scatter_y = result[:, 1]
    group = np.array(label_arr)

    for g in np.unique(group):
        i = np.where(group == g)
        x_avg = np.mean(scatter_x[i])
        y_avg = np.mean(scatter_y[i])
        avg_list.append((x_avg, y_avg))

    return 

In [None]:
import os
import numpy as np
import glob
from PIL import Image

Image.MAX_IMAGE_PIXELS = 1000000000


def getCoord(image, height, width, std):

    mid = width // 2
    coor = []
    final = []
    cnt = 0

    for i in range(height):
        if image[i, mid, 0] == 255 and image[i, mid, 1] == 255 and image[i, mid, 2] == 255:
            continue
        if coor:
            if abs(coor[-1] - i) > std:
                final.append((coor[0], coor[-1]))
                cnt += 1
                coor = []
            else:
                coor.append(i)
        else:
            coor.append(i)

        if i == height-1:
            start, end = coor[0], coor[-1]
            final.append((start, end))
            cnt += 1

    return cnt, final


def Croptoon(path, save_dir, std=150):
    hap = 0
    i = 0

    for file in glob.glob(path + '/*'):
        try:
            image = np.asarray(Image.open(file))
            cnt, final = getCoord(image, image.shape[0], image.shape[1], std)
            hap += cnt
            for (start, end) in final:
                cropped = image[start:end, :]
                if cropped.shape[0] < 250:
                    hap -= 1
                    continue
                cropped = Image.fromarray(cropped)
                cropped.save(save_dir + '/' + str(i) + ".jpg")
                i += 1
            print(f'{file} cropped => {cnt} images')
        except:
            continue

    print(f'Total {hap} images cropped')

if __name__ == '__main__':

    webtoon_path = "D:webtoons/"
    cropped_path = "D:CropWebtoons/"

    if 'cropped' not in os.listdir('D:webtoons/'):
        os.mkdir(cropped_path)

    for dir in os.listdir(webtoon_path):
        os.makedirs(os.path.join(os.getcwd(), cropped_path, dir))

    for toon in os.listdir(webtoon_path):
        try:
            Croptoon(webtoon_path + toon, cropped_path + toon)
        except:
            continue

In [None]:
from PIL import Image
import pandas as pd
import re
import os

wt_info = pd.read_csv('D:webtoons코드 파일/네이버 웹툰 정보ansi.csv', encoding="CP949")
wt_name = wt_info['title']

for i in range(len(wt_name)):
    folder_title = re.sub("[-=+,#/\?:^.@*\"※~ㆍ!』‘|\(\)\[\]`\'…》\”\“\’·]",'_',wt_name[i])
    path1 = 'D:webtoons/'+ folder_title +'/1'
    try:
        len_files = len(os.listdir(path1))
        sum_img_height = 0
        for j in range(1, len_files + 1):
            img1 = Image.open(path1+'/'+str(j)+'.jpg')
            sum_img_height = sum_img_height + (img1.size)[1]
        sum_img = Image.new('RGB', ((img1.size)[0], sum_img_height),(255,255,255))
        sum_paste_img_height = 0
        for j in range(1, len_files + 1):
            img1 = Image.open(path1+'/'+str(j)+'.jpg')
            sum_img.paste(img1, (0, sum_paste_img_height))
            sum_paste_img_height=sum_paste_img_height + (img1.size)[1]
        os.mkdir('D:webtoon_sum_episode1/'+folder_title)
        sum_img.save('D:webtoon_sum_episode1/'+folder_title+'/'+wt_name[i]+'.png', 'PNG')
        print(wt_name[i],"완료")
    except:
        continue

In [None]:
import os
import shutil

before_path = 'D:CropWebtoons/'
after1_path = 'D:CropWebtoons50/'
after2_path = 'D:CropWebtoons20/'

# wt_copy_list = ['연애혁명',
#                 '외모지상주의',
#                 '프리드로우',
#                 '소녀의 세계',
#                 '윌유메리미',
#                 '쿠베라',
#                 '호랑이형님',
#                 '윈드브레이커',
#                 '뷰티풀 군바리',
#                 '연놈',
#                 '세라는 망돌',
#                 '하나는 적고 둘은 너무 많아',
#                 '겨울특강',
#                 'AI 유하',
#                 '독거미',
#                 '보고 있지_',
#                 '원수가 나를 유혹할 때',
#                 '산의 시간',
#                 '천년간 노려왔습니다',
#                 '노량진 공격대']


# for dir in os.listdir(before_path):
#     os.makedirs(os.path.join(os.getcwd(), after2_path, dir))

start_i = 0
for i in range(30,len(idx_to_class),30):
#     f_path = 'D:CropWebtoons'+str(i-29)+'-'+str(i)
#     os.mkdir(f_path)
#     for j in range(start_i, i):
#         os.mkdir(f_path+'/'+idx_to_class[j])
#         for k in range(len(os.listdir(before_path+idx_to_class[j]+'/'))):
#             origin_file = before_path+idx_to_class[j]+'/'+str(k)+'.jpg'
#             copy_file = f_path+'/'+idx_to_class[j]+'/'+str(k)+'.jpg'
#             shutil.copy(origin_file, copy_file)
#     start_i += 30
#     print(f_path,'완료')
    if i+30 > len(idx_to_class):
        start_i = int(len(idx_to_class)/30)*30
        f_path = 'D:CropWebtoons'+str(i+1)+'-'+str(len(idx_to_class))
        os.mkdir(f_path)
        for j in range(start_i, len(idx_to_class)):
            os.mkdir(f_path+'/'+idx_to_class[j])
            for k in range(len(os.listdir(before_path+idx_to_class[j]+'/'))):
                origin_file = before_path+idx_to_class[j]+'/'+str(k)+'.jpg'
                copy_file = f_path+'/'+idx_to_class[j]+'/'+str(k)+'.jpg'
                shutil.copy(origin_file, copy_file)
        print(f_path,'완료')
    

# for title in wt_copy_list:
#     os.mkdir(after2_path+title)

# for path in os.listdir(after2_path):
#     for i in range(len(os.listdir(before_path+path))):
#         origin_file = before_path+path+'/'+str(i)+'.jpg'
#         copy_file = after2_path+path+'/'+str(i)+'.jpg'
#         shutil.copy(origin_file, copy_file)
#     print(path,'완료')

In [None]:
os.listdir('D:CropWebtoons/')[0]
int(559/30)

In [None]:
print(len(wt_name))
print(wt_name[560])
# len(os.listdir("D:webtoons/참교육/1/"))

In [None]:
import sys
import pymysql

u='four'
pw='clover'
h='project-db-stu.ddns.net'
p=3307
d='four'

try:
    pymysql.connect(user=u, password=pw, host=h, port=p, database=d)
    print("DB Connetion Success:{0}".format(h))
except pymysql.Error as e:
    print("Error conneting to MySQL Platform : {}".format(e))
    sys.exit(1)

In [None]:
!pip install pymysql