## Drive Mount

In [None]:

# Mount Google Drive at /content/drive so the dataset folder used below is reachable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
cd "/content/drive/MyDrive/2023_CJOliveNetworks_Winter_Internship/fashion_dataset"

## Importing

In [None]:
import numpy as np
import pandas as pd
import os
import re
import tensorflow as tf
from threading import Thread
import time
from tqdm import tqdm
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.offline import init_notebook_mode
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import Sequence
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D, Activation, Dropout, Flatten, Dense, Input, Layer
from tensorflow.keras.applications import VGG16, ResNet50, DenseNet201, Xception
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

from urllib import request 
from PIL import Image
from io import BytesIO
import requests
from google.colab.patches import cv2_imshow
import cv2
import matplotlib.pyplot as plt

## 데이터 불러오기

In [None]:
# Load the dataset index CSV; the relative path is resolved against the current working directory.
data = pd.read_csv("./fashion_dataset.csv")

In [None]:
# Keep only the identifier, title, category, tag and link columns, then display the frame.
kept_columns = ['ID', 'TITLE', 'CATEGORY', 'TAG', 'LINK']
data = data[kept_columns]
data

In [None]:
# Build the absolute image path of every row as <root>/<CATEGORY>/<CATEGORY>_<ID suffix>.jpg.
# The first three characters of ID are dropped, exactly as the per-row loop did
# (presumably a fixed-width prefix in front of the image number; verify against the CSV).
image_root = '/content/drive/MyDrive/2023_CJOliveNetworks_Winter_Internship/fashion_dataset/'
category = data['CATEGORY']
image_number = data['ID'].str[3:]

# Vectorised string concatenation replaces the per-row data.iloc[i] lookups.
fname = (image_root + category + '/' + category + '_' + image_number + '.jpg').tolist()
data['filename'] = fname

In [None]:
# Preview the first rows, which now include the derived 'filename' column.
data.head()

In [None]:
# Inspect the LINK value of the first row.
data.iloc[0]['LINK']

### 배경 제거하기

In [None]:
# !pip install rembg

In [None]:
from rembg.bg import remove
import io

In [None]:
def show_rembg(i_path, save_path=None):
    """Show an image next to its background-removed version and save the result.

    The left subplot shows the image read from ``i_path``; the right subplot
    shows the output of ``remove`` for that image.

    Args:
        i_path: Path of the image to process.
        save_path: Where to write the background-removed image. Defaults to
            ``i_path``, i.e. the source file is overwritten as before.
    """
    fig = plt.figure(figsize=(10, 10))

    # Close the source file before writing: the default target is the same path.
    with Image.open(i_path) as orig_img:
        fig.add_subplot(1, 2, 1)
        plt.imshow(orig_img)

        output = remove(orig_img)

    fig.add_subplot(1, 2, 2)
    plt.imshow(output)

    target = i_path if save_path is None else save_path
    # JPEG has no alpha channel, so Pillow refuses to write an RGBA image to a
    # .jpg/.jpeg file; flatten the cut-out onto a white background for those targets.
    is_jpeg_target = os.path.splitext(str(target))[1].lower() in ('.jpg', '.jpeg')
    if is_jpeg_target and output.mode == 'RGBA':
        flattened = Image.new('RGB', output.size, (255, 255, 255))
        flattened.paste(output, mask=output.getchannel('A'))
        output = flattened
    output.save(target)


In [None]:
# Try the background removal on one sample image (path relative to the dataset folder).
# Note: show_rembg saves its result back to the path it is given.
input_path = './americancasual/americancasual_001.jpg'
show_rembg(input_path)

### 주요 컬러 색상 메타데이터 추가하기

In [None]:
# ! pip install colorthief

In [None]:
from colorthief import ColorThief

In [None]:
# Empty list, presumably meant to collect the main colour of each image;
# nothing in the cells shown here fills it yet.
main_color = []

In [None]:
pwd

In [None]:
from rembg.bg import remove

## EDA

### Checking for Null Values

In [None]:
# Count the missing values in each column.
data.isnull().sum()

- 결측치 확인 결과, 굉장히 깔끔한 DataFrame이다.

### Visualizations

- Main Categories Count
    - 카테고리별로 데이터 개수 알아보기

In [None]:
# Bar chart of the ID count per CATEGORY over the whole dataset, largest category first.
category_counts = data.groupby('CATEGORY').count().reset_index()
fig = px.bar(category_counts, x='CATEGORY', y='ID', title='Count for fashion Category')
fig.update_layout(barmode='stack', xaxis=dict(categoryorder='total descending'))

## Train-Val Split

In [None]:
# Shuffle with a fixed seed so the 80/20 split (and everything derived from it)
# is the same on every Restart & Run All.
SPLIT_SEED = 42
data = data.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)

n = len(data)
n_train = int(n * 0.8)  # the first 80% of the shuffled rows go to training
train = data.iloc[:n_train, :]
val = data.iloc[n_train:, :].reset_index(drop=True)

In [None]:
# Report the size of each split: training rows first, validation rows second.
for split in (train, val):
    print(len(split))

In [None]:
# Bar chart of the ID count per CATEGORY in the training split, largest category first.
train_category_counts = train.groupby('CATEGORY').count().reset_index()
fig = px.bar(train_category_counts, x='CATEGORY', y='ID',
             title='Count for fashion Category of Train Set')
fig.update_layout(barmode='stack', xaxis=dict(categoryorder='total descending'))

In [None]:
# Bar chart of the ID count per CATEGORY in the validation split, largest category first.
val_category_counts = val.groupby('CATEGORY').count().reset_index()
fig = px.bar(val_category_counts, x='CATEGORY', y='ID',
             title='Count for fashion Category of Validation Set')
fig.update_layout(barmode='stack', xaxis=dict(categoryorder='total descending'))

## Data Generator

In [None]:
# Spot-check one generated image path (row label 0 of the shuffled, re-indexed frame).
data['filename'][0]

In [None]:
# Dataset root on Drive, plus a listing of the entries directly inside it.
data_dir = '/content/drive/MyDrive/2023_CJOliveNetworks_Winter_Internship/fashion_dataset'
data_filenames = os.listdir(data_dir)

In [None]:
# The images feed a VGG16 loaded with its default ImageNet weights, which expect
# tf.keras.applications.vgg16.preprocess_input (RGB -> BGR plus per-channel mean
# subtraction, no scaling) rather than pixels rescaled to [0, 1].
datagen = ImageDataGenerator(
    preprocessing_function=tf.keras.applications.vgg16.preprocess_input)

In [None]:
# Both generators read image paths from 'filename' and labels from 'CATEGORY';
# shuffle=False keeps the rows in dataframe order instead of shuffling them.
flow_options = dict(directory=data_dir,
                    x_col='filename',
                    y_col='CATEGORY',
                    target_size=(256, 256),
                    class_mode='categorical',
                    batch_size=32,
                    shuffle=False)

train_generator = datagen.flow_from_dataframe(dataframe=train, **flow_options)
val_generator = datagen.flow_from_dataframe(dataframe=val, **flow_options)

## 특징 추출 : VGG16

In [None]:
# Feature extractor: the layers of VGG16 without its classifier head, followed by
# global average pooling over the final feature maps.
base_model = VGG16(include_top=False, input_shape=(256, 256, 3))

model = Sequential(list(base_model.layers) + [GlobalAveragePooling2D()])
model.summary()

### Train / Valid set 특징 추출

In [None]:
# Run each split through the extractor, training split first, then validation.
train_features, val_features = [
    model.predict(split_generator, verbose=1)
    for split_generator in (train_generator, val_generator)
]

## 주성분 분석

In [None]:
# Display the first rows of the shuffled frame again before the PCA step.
data.head()

In [None]:
# PCA is used by the projection cells below. Import the estimator class itself:
# the lower-case name `pca` is not the class those cells call.
from sklearn.decomposition import PCA

In [None]:
# Fit a 2-component PCA on the training features, then project those same features.
pca = PCA(n_components=2)
train_pca = pca.fit(train_features).transform(train_features)

In [None]:
# Project the validation features with the PCA already fitted on the training
# features. Refitting here would replace those components and place the two
# splits in different coordinate systems.
test_pca = pca.transform(val_features)

In [None]:
# Attach the PCA coordinates to the training rows as integer-named columns (0, 1).
train_pca = pd.DataFrame(train_pca)
if len(train_pca) != len(train):
    # Joining on row position is only meaningful with exactly one PCA row per training row.
    raise ValueError(f"{len(train_pca)} PCA rows for {len(train)} training rows")

# Drop coordinates left by an earlier run of this cell so that re-running it does
# not produce suffixed duplicates (0_x, 0_y, ...) that the scatter plot cannot find.
train = train.drop(columns=list(train_pca.columns), errors='ignore')
train = train.merge(train_pca, how='left', left_index=True, right_index=True)

In [None]:
# Scatter the training rows over PCA columns 0 and 1, coloured by CATEGORY.
axis_titles = {
    "0": "Principal Component 1",
    "1": "Principal Component 2",
}
fig = px.scatter(train, x=0, y=1, color="CATEGORY", title='CATEGORY',
                 height=600, labels=axis_titles)
fig.show()