In [19]:
import warnings
warnings.filterwarnings("ignore")
import os
import shutil
from utils import train_test_names
import re

from PIL import Image

import pandas as pd
import numpy as np
pd.options.display.float_format = '{:.4f}'.format

In [28]:
# Load the two metadata tables from ../data (relative to the working directory).
# The hyped table uses its first CSV column as the index; the non-hyped table
# keeps the default positional index.
data_dir = os.path.join('..', 'data')
hyped_info_df = pd.read_csv(os.path.join(data_dir, 'hyped_info_df.csv'), index_col=0)
non_hyped_info_df = pd.read_csv(os.path.join(data_dir, 'non_hyped_info_df.csv'))

### Train test split for hyped

In [33]:
# Split the hyped shoes into train and test sets. Per the original note the
# split is grouped by similar names (done inside train_test_names).
# Index by shoe name unless the frame is already indexed that way, so this cell
# can be re-run (or fed a CSV whose first column is already 'name') without a
# KeyError on the missing 'name' column.
if hyped_info_df.index.name != 'name':
    hyped_info_df = hyped_info_df.set_index('name')
hypetrain, hypetest = train_test_names(hyped_info_df.index.tolist(),test_size=0.1,seed=24)
# Flag every row as train (1), then clear the flag for all held-out names in a
# single vectorized assignment. Unlike a per-name `.loc[name, col] = 0`, a
# boolean mask can never append a new row when a name is absent from the index.
hyped_info_df['hypetrain'] = 1
hyped_info_df.loc[hyped_info_df.index.isin(list(hypetest)),'hypetrain'] = 0

In [36]:
# sanity check: share of hyped rows flagged as train (mean of the 0/1 column)
hyped_info_df['hypetrain'].mean()

0.9004462753175421

In [30]:
# Simpler split for non-hyped shoes: hold out a random 10% of the rows,
# selected by index label.
n_test = int(non_hyped_info_df.shape[0]*0.1)
non_hyped_names = non_hyped_info_df.index.tolist()
# Shuffle with a dedicated, seeded generator so the split is reproducible
# across kernel restarts (same seed value as the hyped split) and does not
# depend on, or disturb, NumPy's global random state.
np.random.RandomState(24).shuffle(non_hyped_names)
train_non_hyped_names = non_hyped_names[n_test:]
test_non_hyped_names = non_hyped_names[:n_test]

# Flag everything as train (1), then clear the flag for the held-out labels
# in one vectorized assignment.
non_hyped_info_df['hypetrain'] = 1
non_hyped_info_df.loc[non_hyped_info_df.index.isin(test_non_hyped_names),'hypetrain'] = 0
# I want the 'special' image to be in the test set.
# Select it with a boolean mask: `.loc['special', col] = 0` on a frame whose
# index does not contain that label silently appends a new, otherwise-empty
# row instead of flagging the intended image.
# NOTE(review): assumes 'special' is a value of the 'name' column -- confirm.
if 'name' in non_hyped_info_df.columns:
    special_mask = non_hyped_info_df['name'] == 'special'
else:
    special_mask = non_hyped_info_df.index == 'special'
non_hyped_info_df.loc[special_mask,'hypetrain'] = 0
# reset_index() moves the current index into a regular column and restores a
# fresh positional index.
non_hyped_info_df = non_hyped_info_df.reset_index()

### Train test split for Brands

In [35]:
# Number of hyped rows whose brand is not one of the three main brands.
len(hyped_info_df.loc[~hyped_info_df['brand'].isin(['nike','jordan','adidas'])])

218

In [37]:
# Collapse every brand outside the main three into a single 'other' class;
# rows whose brand is one of the main brands keep their value unchanged.
main_brands = ['nike','jordan','adidas']
hyped_info_df['classification_brands'] = hyped_info_df['brand'].where(
    hyped_info_df['brand'].isin(main_brands),
    other='other',
)

In [47]:
# Train/test split for the brand task, done separately inside each brand class
# so every class contributes its own held-out names.
classification_brands = hyped_info_df.classification_brands.unique()
hyped_info_df['brandtrain'] = 1
for brand in classification_brands:
    brand_sub = hyped_info_df[hyped_info_df.classification_brands==brand]
    # Call the helper the same way the hype split does (plain list, explicit
    # test_size and seed) so this split does not depend on the helper's
    # defaults. NOTE(review): the defaults of train_test_names are not visible
    # here -- confirm 0.1 / 24 match the intended brand split.
    brandtrain, brandtest = train_test_names(brand_sub.index.tolist(),test_size=0.1,seed=24)
    # Vectorized flagging: a boolean mask cannot append rows for unknown names,
    # which a per-name `.loc[name, col] = 0` would do.
    hyped_info_df.loc[hyped_info_df.index.isin(list(brandtest)),'brandtrain'] = 0
# Move the index back into a regular column for the cells below.
hyped_info_df = hyped_info_df.reset_index()

In [49]:
# sanity check: share of hyped rows flagged as train for the brand task
hyped_info_df['brandtrain'].mean()

0.9004462753175421

### Make new directories and copy files

In [46]:
# save new meta df (left disabled: uncommenting these lines overwrites the input CSVs in ../data)
# hyped_info_df.to_csv(os.path.join(os.getcwd(),'..','data','hyped_info_df.csv'),index=False)
# non_hyped_info_df.to_csv(os.path.join(os.getcwd(),'..','data','non_hyped_info_df.csv'),index=False)

In [12]:
def train_test_image_copy(row,train_path,test_path,task):
    """Re-save one image into the train or test folder chosen by a split flag.

    Parameters
    ----------
    row : pandas.Series
        One metadata row, as passed by ``DataFrame.apply(..., axis=1)``.
        Reads ``row['path']`` (source image file), ``row['name']``,
        ``row['source']``, ``row[task]`` and the row label ``row.name``.
    train_path, test_path : str
        Destination directories; they are not created here.
    task : str
        Name of the flag column. A value of 1 sends the image to
        ``train_path``; any other value sends it to ``test_path``.

    Returns
    -------
    None

    Notes
    -----
    The output file is named ``<name><source><row label><ext>``. The image is
    opened and saved again through PIL rather than copied byte-for-byte.
    """
    source_path = row['path']
    # The dot is escaped so only a literal ".jpg"/".png" matches; the previous
    # pattern let "." match any character (e.g. "/jpg" inside a folder name).
    ext_match = re.search(r'\.(?:jpg|png)', source_path)
    # Fall back to the file's own extension (".jpeg", ".JPG", ...) instead of
    # failing with AttributeError on a missing match.
    ext = ext_match.group() if ext_match else os.path.splitext(source_path)[1]
    target_dir = train_path if row[task]==1 else test_path
    filename = str(row['name'])+str(row['source'])+str(row.name)+str(ext)
    # The context manager closes the source file handle once the image is saved.
    with Image.open(source_path) as image:
        image.save(os.path.join(target_dir,filename))
    return

In [157]:
# Start from empty output folders: any existing brand_data / hype_data tree
# under ../data is deleted and recreated.
train_test_folders = ['brand_data','hype_data']
data_root = os.path.join(os.getcwd(),'..','data')
output_roots = {folder: os.path.join(data_root,folder) for folder in train_test_folders}
for output_root in output_roots.values():
    if os.path.exists(output_root):
        shutil.rmtree(output_root)
    os.makedirs(output_root)
# Root folders used by the copy cells below.
hype_path = output_roots['hype_data']
brand_path = output_roots['brand_data']

In [158]:
# Brand task: build <brand_path>/train/<brand> and <brand_path>/test/<brand>
# for every brand class, then write each image of that class into the folder
# selected by its 'brandtrain' flag.
brand_labels = hyped_info_df['classification_brands']
for brand in brand_labels.unique():
    train_path = os.path.join(brand_path,'train',brand)
    test_path = os.path.join(brand_path,'test',brand)
    for split_dir in (train_path,test_path):
        # recreate the folder from scratch
        if os.path.exists(split_dir):
            shutil.rmtree(split_dir)
        os.makedirs(split_dir)
    for _, row in hyped_info_df[brand_labels==brand].iterrows():
        train_test_image_copy(row,train_path,test_path,'brandtrain')

In [48]:
# Hype task: one class folder per label under <hype_path>/train and
# <hype_path>/test. Each label is paired with the frame that supplies its
# images; every image goes to the folder selected by its 'hypetrain' flag.
frames_by_label = (('hype',hyped_info_df),('non_hype',non_hyped_info_df))
for ishype, info_df in frames_by_label:
    train_path = os.path.join(hype_path,'train',ishype)
    test_path = os.path.join(hype_path,'test',ishype)
    for split_dir in (train_path,test_path):
        # recreate the folder from scratch
        if os.path.exists(split_dir):
            shutil.rmtree(split_dir)
        os.makedirs(split_dir)
    info_df.apply(lambda row: train_test_image_copy(row,train_path,test_path,'hypetrain'),axis=1)