In [1]:
# for loading/processing the images
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input

# models
from keras.applications.vgg16 import VGG16
from keras.models import Model

# clustering and dimension reduction
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# for everything else
import os
import pickle
import shutil
from random import randint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
# Load the per-image colour labels and discard the "Unnamed: 0" column
# (presumably an index that was written out when the CSV was saved -- confirm).
df = pd.read_csv("./ogclass.csv").drop("Unnamed: 0", axis=1)
df

Unnamed: 0,name,color
0,0,yellow
1,1,pink
2,2,gray
3,3,orange
4,4,white
...,...,...
9995,9995,blue
9996,9996,gray
9997,9997,white
9998,9998,gray


In [5]:
# Number of rows per colour label; the smallest count is the size of the
# rarest class.
class_counts = df['color'].value_counts()
min_samples = class_counts.min()
min_samples

952

In [6]:
from sklearn.utils import resample

# Balance the classes: draw `min_samples` rows (without replacement) from every
# colour so each class contributes the same number of rows.
#
# A seeded RandomState is shared across all classes so that the selection is
# reproducible after "Restart Kernel -> Run All" while each class still gets an
# independent draw. Without a seed every run picks a different subset.
balance_rng = np.random.RandomState(42)

balanced_df = pd.concat(
    [
        resample(
            df[df['color'] == cls],
            replace=False,
            n_samples=min_samples,
            random_state=balance_rng,
        )
        for cls in df['color'].unique()
    ]
)

balanced_df


Unnamed: 0,name,color
2401,2401,yellow
8423,8423,yellow
9731,9731,yellow
5732,5732,yellow
8894,8894,yellow
...,...,...
3215,3215,purple
5216,5216,purple
5634,5634,purple
8823,8823,purple


In [7]:
# Draw 10% of the rows of every colour class from the balanced frame.
#
# DataFrameGroupBy.sample is used instead of groupby.apply(lambda x: x.sample(...)):
#   * it keeps the 'color' column in the result on every pandas version
#     (groupby.apply on the grouping column is deprecated since pandas 2.2), and
#   * the fixed random_state makes the draw reproducible on a fresh kernel.
# Groups come back in sorted key order, as they did with apply().
grouped = balanced_df.groupby('color')
sampled_df = grouped.sample(frac=0.1, random_state=42).reset_index(drop=True)
sampled_df

Unnamed: 0,name,color
0,4120,black
1,2447,black
2,6213,black
3,4005,black
4,3112,black
...,...,...
945,187,yellow
946,9374,yellow
947,1333,yellow
948,613,yellow


In [9]:
# Move the image of every sampled row from ./data into ./stdtestset.
#
# Pure-Python file operations replace os.system("mv ..."): values read from the
# CSV are no longer interpolated into a shell command, the cell does not need a
# POSIX shell, and problems are reported instead of being silently ignored.
test_dir = "./stdtestset"
os.makedirs(test_dir, exist_ok=True)  # `mv` failed for every file when the folder was missing

missing_test_images = []
for fname in sampled_df["name"].to_list():
    src_path = f"./data/{fname}.png"
    if os.path.exists(src_path):
        # Full destination file path: an existing copy is replaced, like `mv` did.
        shutil.move(src_path, f"{test_dir}/{fname}.png")
    else:
        # Keep going (the old loop did too), but remember what could not be moved.
        missing_test_images.append(src_path)

if missing_test_images:
    print(
        f"{len(missing_test_images)} image(s) not found in ./data "
        f"(already moved?), e.g. {missing_test_images[:5]}"
    )

In [11]:
# Anti-join: keep the rows of balanced_df that have no counterpart in sampled_df.
# The merge uses all columns the two frames share as keys; the indicator column
# `_merge` marks where each row came from and is removed again at the end.
balanced_minus_sampled = (
    balanced_df
    .merge(sampled_df, how='outer', indicator=True)
    .pipe(lambda merged: merged[merged['_merge'] == 'left_only'])
    .drop(columns='_merge')
)
balanced_minus_sampled

Unnamed: 0,name,color
0,2401,yellow
1,8423,yellow
3,5732,yellow
4,8894,yellow
5,896,yellow
...,...,...
9515,3215,purple
9516,5216,purple
9517,5634,purple
9518,8823,purple


In [12]:
# Draw 10% of every colour class from the rows that remain after the previous
# sample was removed.
#
# DataFrameGroupBy.sample is used instead of groupby.apply(lambda x: x.sample(...)):
#   * it keeps the 'color' column in the result on every pandas version
#     (groupby.apply on the grouping column is deprecated since pandas 2.2), and
#   * the fixed random_state makes the draw reproducible on a fresh kernel.
# Groups come back in sorted key order, as they did with apply().
grouped_for_init = balanced_minus_sampled.groupby('color')
init_df = grouped_for_init.sample(frac=0.1, random_state=42).reset_index(drop=True)
init_df

Unnamed: 0,name,color
0,1242,black
1,2008,black
2,7483,black
3,6631,black
4,8363,black
...,...,...
855,568,yellow
856,6604,yellow
857,8491,yellow
858,649,yellow


In [13]:
# Move the image of every row in init_df from ./data into ./init_population.
#
# Pure-Python file operations replace os.system("mv ..."): values read from the
# CSV are no longer interpolated into a shell command, the cell does not need a
# POSIX shell, and problems are reported instead of being silently ignored.
init_dir = "./init_population"
os.makedirs(init_dir, exist_ok=True)  # `mv` failed for every file when the folder was missing

missing_init_images = []
for fname in init_df["name"].to_list():
    src_path = f"./data/{fname}.png"
    if os.path.exists(src_path):
        # Full destination file path: an existing copy is replaced, like `mv` did.
        shutil.move(src_path, f"{init_dir}/{fname}.png")
    else:
        # Keep going (the old loop did too), but remember what could not be moved.
        missing_init_images.append(src_path)

if missing_init_images:
    print(
        f"{len(missing_init_images)} image(s) not found in ./data "
        f"(already moved?), e.g. {missing_init_images[:5]}"
    )

In [14]:
# Create the output folder for the CSV exports. exist_ok=True makes the cell
# safe to re-run (a plain `mkdir` errors once the folder exists) and avoids
# depending on a shell.
os.makedirs("./segregated_files", exist_ok=True)

In [16]:
# Persist every intermediate frame of the split so it can be reloaded later.
# (min_samples is a single number, not a frame, so it is not exported.)
csv_exports = [
    ("./segregated_files/df.csv", df),
    ("./segregated_files/balanced_df.csv", balanced_df),
    ("./segregated_files/sampled_df.csv", sampled_df),
    ("./segregated_files/balanced_minus_sampled.csv", balanced_minus_sampled),
    ("./segregated_files/init_df.csv", init_df),
]
for csv_path, frame in csv_exports:
    frame.to_csv(csv_path)