In [1]:
import pandas as pd
import numpy as np
import cv2

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import os
import sys
from tqdm import tqdm
import shutil

# Data Preparation

In [27]:
# Load the full label table and report the raw class balance.
# NOTE(review): unpickling can execute arbitrary code; only load this file if
# it comes from a trusted source.
labels = pd.read_pickle('Label/29k_labels.pickle')
print(labels.Condition.value_counts())
# Sort before sampling so the seeded .sample() calls below pick the same rows
# regardless of the order in which the pickle stored them.
labels = labels.sort_values('filename')

# Dividing the data into balanced chunks: 6000 rows per class.
normal = labels.loc[(labels.Condition == 'Normal')].sample(6000, random_state=5)
faulty = labels.loc[(labels.Condition == 'Faulty')].sample(6000, random_state=5)
vfaulty = labels.loc[(labels.Condition == 'VFaulty')].sample(6000, random_state=5)
labels = pd.concat([normal, faulty, vfaulty])
labels = labels.sort_values('filename')
# Seed the shuffle too: an unseeded shuffle produces a different row order on
# every run, so the seeded train/test split performed on `labels` later would
# still select different rows each time.
labels = shuffle(labels, random_state=5)
print(labels.Condition.value_counts())

Condition
VFaulty    11343
Normal      9209
Faulty      8577
Name: count, dtype: int64
Condition
VFaulty    6000
Normal     6000
Faulty     6000
Name: count, dtype: int64


In [28]:
# Read the complete label table again; it is the starting point for the
# evaluation set.
labels_eval = pd.read_pickle('Label/29k_labels.pickle')
print(labels_eval['Condition'].value_counts())
# Order the rows by filename.
labels_eval = labels_eval.sort_values(by='filename')

Condition
VFaulty    11343
Normal      9209
Faulty      8577
Name: count, dtype: int64


In [29]:
# Collect every index label of labels_eval that does not appear in the index
# of labels, preserving the order of labels_eval.
already_used = labels.index
to_eval = [candidate for candidate in labels_eval.index if candidate not in already_used]

In [30]:
# Keep only the rows whose index labels were collected in to_eval.
labels_eval = labels_eval.loc[to_eval, :]

In [31]:
# Sort before sampling so the seeded .sample() calls below pick the same rows
# regardless of the incoming row order.
labels_eval = labels_eval.sort_values('filename')
print(labels_eval.Condition.value_counts())
# Dividing the data into balanced chunks: 2000 rows per class.
normal = labels_eval.loc[(labels_eval.Condition == 'Normal')].sample(2000, random_state=5)
faulty = labels_eval.loc[(labels_eval.Condition == 'Faulty')].sample(2000, random_state=5)
vfaulty = labels_eval.loc[(labels_eval.Condition == 'VFaulty')].sample(2000, random_state=5)
labels_eval = pd.concat([normal, faulty, vfaulty])
labels_eval = labels_eval.sort_values('filename')
# Seed the shuffle so the final row order is reproducible across runs.
labels_eval = shuffle(labels_eval, random_state=5)
print(labels_eval.Condition.value_counts())

Condition
VFaulty    5343
Normal     3209
Faulty     2577
Name: count, dtype: int64
Condition
Faulty     2000
Normal     2000
VFaulty    2000
Name: count, dtype: int64


In [3]:
# Hold out 20% of the rows as test data, stratified on the class column so
# both parts keep the same class proportions.
y = labels['Condition']
train_data, test_data = train_test_split(
    labels,
    stratify=y,
    test_size=0.2,
    random_state=5,
)

In [4]:
# Report the shape of each split first, then the per-class row counts.
for split_frame in (train_data, test_data):
    print(split_frame.shape)
for split_frame in (train_data, test_data):
    print(split_frame['Condition'].value_counts())

(14400, 2)
(3600, 2)
Condition
VFaulty    4800
Faulty     4800
Normal     4800
Name: count, dtype: int64
Condition
Normal     1200
VFaulty    1200
Faulty     1200
Name: count, dtype: int64


In [13]:
# Create the image folder tree for the training and test splits.
#
# Image_Data
#   train_dir
#     -> faulty
#     -> vfaulty
#     -> normal
#   test_dir
#     -> faulty
#     -> vfaulty
#     -> normal
#
# os.makedirs(..., exist_ok=True) is used instead of os.mkdir so the cell can
# be re-run without raising FileExistsError when the folders already exist.
base_dir = 'Image_Data'
train_dir = os.path.join(base_dir, 'train_dir')
test_dir = os.path.join(base_dir, 'test_dir')

# Inside each split folder we create a separate folder for each class.
for split_dir in (train_dir, test_dir):
    for class_folder in ('faulty', 'vfaulty', 'normal'):
        # makedirs also creates base_dir and split_dir when they are missing.
        os.makedirs(os.path.join(split_dir, class_folder), exist_ok=True)

# Kept for backward compatibility: the original cell left these names bound
# to the class folders inside test_dir.
faulty_dir = os.path.join(test_dir, 'faulty')
vfaulty_dir = os.path.join(test_dir, 'vfaulty')
normal_dir = os.path.join(test_dir, 'normal')

In [33]:
# Evaluation data: create the folder tree for the evaluation split.
#
# eval_dir
#   -> faulty
#   -> vfaulty
#   -> normal
#
# os.makedirs(..., exist_ok=True) is used instead of os.mkdir so the cell can
# be re-run without raising FileExistsError when the folders already exist.
eval_dir = os.path.join(base_dir, 'eval_dir')

faulty_dir = os.path.join(eval_dir, 'faulty')
vfaulty_dir = os.path.join(eval_dir, 'vfaulty')
normal_dir = os.path.join(eval_dir, 'normal')

for class_dir in (faulty_dir, vfaulty_dir, normal_dir):
    # makedirs also creates eval_dir itself when it is missing.
    os.makedirs(class_dir, exist_ok=True)

In [19]:
# train and test images
train_list = list(train_data.index)
test_list = list(test_data.index)

# Class label -> class folder name. The class folders are created in
# lowercase ('faulty', 'vfaulty', 'normal'), so the capitalised label must be
# mapped to the real folder name; using the label directly only works on
# case-insensitive filesystems.
class_folders = {'Faulty': 'faulty', 'VFaulty': 'vfaulty', 'Normal': 'normal'}

for image in tqdm(train_list):
    fname = image
    # get the label for a certain image
    target = labels.loc[image, 'Condition']

    # An unexpected class raises KeyError here instead of silently reusing
    # the folder chosen for the previous image.
    label = class_folders[target]

    # source path to image
    src = os.path.join('Images_mfcc/Data/', fname)
    # destination path to image
    dst = os.path.join(train_dir, label, fname)
    # copy the image from the source to the destination
    shutil.copyfile(src, dst)

100%|████████████████████████████████████████████████████████████████████████████| 14400/14400 [13:08<00:00, 18.26it/s]


In [20]:
# Class label -> class folder name. The class folders are created in
# lowercase ('faulty', 'vfaulty', 'normal'), so the capitalised label must be
# mapped to the real folder name; using the label directly only works on
# case-insensitive filesystems.
class_folders = {'Faulty': 'faulty', 'VFaulty': 'vfaulty', 'Normal': 'normal'}

for image in tqdm(test_list):
    fname = image
    # get the label for a certain image
    target = labels.loc[image, 'Condition']

    # An unexpected class raises KeyError here instead of silently reusing
    # the folder chosen for the previous image.
    label = class_folders[target]

    # source path to image
    src = os.path.join('Images_mfcc/Data/', fname)
    # destination path to image
    dst = os.path.join(test_dir, label, fname)
    # copy the image from the source to the destination
    shutil.copyfile(src, dst)

100%|██████████████████████████████████████████████████████████████████████████████| 3600/3600 [01:36<00:00, 37.27it/s]


In [36]:
eval_list = list(labels_eval.index)

# Class label -> class folder name. The class folders are created in
# lowercase ('faulty', 'vfaulty', 'normal'), so the capitalised label must be
# mapped to the real folder name; using the label directly only works on
# case-insensitive filesystems.
class_folders = {'Faulty': 'faulty', 'VFaulty': 'vfaulty', 'Normal': 'normal'}

for image in tqdm(eval_list):
    fname = image
    # get the label for a certain image
    target = labels_eval.loc[image, 'Condition']

    # An unexpected class raises KeyError here instead of silently reusing
    # the folder chosen for the previous image.
    label = class_folders[target]

    # source path to image
    src = os.path.join('Images_mfcc/Data/', fname)
    # destination path to image
    dst = os.path.join(eval_dir, label, fname)
    # copy the image from the source to the destination
    shutil.copyfile(src, dst)

100%|██████████████████████████████████████████████████████████████████████████████| 6000/6000 [07:39<00:00, 13.06it/s]


In [22]:
# Print, per split, a heading followed by the number of directory entries in
# each class folder (faulty, vfaulty, normal).
folder_report = (
    ('Lengths of train', (
        'Image_Data/train_dir/faulty',
        'Image_Data/train_dir/vfaulty',
        'Image_Data/train_dir/normal',
    )),
    ('Lengths of test', (
        'Image_Data/test_dir/faulty',
        'Image_Data/test_dir/vfaulty',
        'Image_Data/test_dir/normal',
    )),
)
for heading, class_paths in folder_report:
    print(heading)
    for class_path in class_paths:
        print(len(os.listdir(class_path)))

Lengths of train
4800
4800
4800
Lengths of test
1200
1200
1200


In [37]:
# Print a heading followed by the number of directory entries in each
# evaluation class folder (faulty, vfaulty, normal).
print('Lengths of eval')
eval_class_paths = (
    'Image_Data/eval_dir/faulty',
    'Image_Data/eval_dir/vfaulty',
    'Image_Data/eval_dir/normal',
)
for class_path in eval_class_paths:
    print(len(os.listdir(class_path)))

Lengths of eval
2000
2000
2000
