# <font color="dark">Make Clean data</font>

In [13]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import glob
import cv2
import os
%matplotlib inline

In [14]:
# Global paramter
DATA_train_dir = "../data/train"
DATA_test_dir = "../data/test"
DATA_val_dir = "../data/validation"

## <font color='dark'>Step 1 : Clean garbage data</font>

In [3]:
# train data
train_data_0 = glob.glob(os.path.sep.join([DATA_train_dir, "data_0/*"]))
train_data_1 = glob.glob(os.path.sep.join([DATA_train_dir, "data_1/*"]))
# test data
test_data_0 = glob.glob(os.path.sep.join([DATA_test_dir, "data_0/*"]))
test_data_1 = glob.glob(os.path.sep.join([DATA_test_dir, "data_1/*"]))
# validation data
val_data_0 = glob.glob(os.path.sep.join([DATA_val_dir, "data_0/*"]))
val_data_1 = glob.glob(os.path.sep.join([DATA_val_dir, "data_1/*"]))

print("The length of data_0 for training : %d" % len(train_data_0))
print("The length of data_1 for training : %d" % len(train_data_1))
print("The length of data_0 for test : %d" % len(test_data_0))
print("The length of data_1 for test : %d" % len(test_data_1))
print("The length of data_0 for val : %d" % len(val_data_0))
print("The length of data_1 for val : %d" % len(val_data_1))

The length of data_0 for training : 447
The length of data_1 for training : 3044
The length of data_0 for test : 56
The length of data_1 for test : 380
The length of data_0 for val : 56
The length of data_1 for val : 380


In [4]:
whole_train_data = train_data_0+train_data_1
whole_test_data = test_data_0+test_data_1
whole_val_data = val_data_0+val_data_1

print("whole train data length : %d" % len(whole_train_data))
print("whole test data length : %d" % len(whole_test_data))
print("whole val data length : %d" % len(whole_val_data))

whole train data length : 3491
whole test data length : 436
whole val data length : 436


##  <font color='dark'>Step 2 : See the data distribution</font>

In [5]:
def make_row_dataframe(whole_data):
    width=[]
    height = []
    file_name = []
    label = []

    for id_num,file in enumerate(whole_data):
        # Add the label
        label_val = lambda m,n: 0 if m<n else 1
        label.append(label_val(id_num,len(train_data_0)))

        # Read the image, Add the width and height
        img = cv2.imread(file)
        file_name.append(file)
        width.append(img.shape[0]); height.append(img.shape[1])

    data = pd.DataFrame({"width":width,"height":height,"file":file_name,"label":label})
    return data

train_df = make_row_dataframe(whole_train_data)
test_df = make_row_dataframe(whole_test_data)
val_df = make_row_dataframe(whole_val_data)

In [6]:
train_width_height = train_df.groupby(["width","height"])
print("training data")
train_width_height.size()

training data


width  height
828    1170         1
1288   1936        16
1869   2800        69
2000   2992         3
2304   3456         3
2592   2592      3399
dtype: int64

In [7]:
test_width_height = test_df.groupby(["width","height"])
print("testing data")
test_width_height.size()

testing data


width  height
1288   1936        4
1869   2800       23
2304   3456        2
2592   2592      407
dtype: int64

In [8]:
val_width_height = val_df.groupby(["width","height"])
print("testing data")
val_width_height.size()

testing data


width  height
1288   1936        7
1632   2464        1
1824   2736        1
1869   2800       26
2304   3456        6
2592   2592      395
dtype: int64

In [9]:
def make_clean_dataframe(train_df,data_type):
    # Only image which width is 2592px;
    mask = train_df.width==2592
    clean_data = train_df[mask].drop(columns=["width","height"])
    print("The shape of the clean data for ",data_type,": ",clean_data.shape)
    print("The label destribution of the data for ",data_type,": \n",clean_data.groupby("label").size(),'\n')
    return clean_data
    
train_clean_df = make_clean_dataframe(train_df, "training")
test_clean_df = make_clean_dataframe(test_df, "testing")
val_clean_df = make_clean_dataframe(val_df, "validation")

The shape of the clean data for  training :  (3399, 2)
The label destribution of the data for  training : 
 label
0     435
1    2964
dtype: int64 

The shape of the clean data for  testing :  (407, 2)
The label destribution of the data for  testing : 
 label
0    407
dtype: int64 

The shape of the clean data for  validation :  (395, 2)
The label destribution of the data for  validation : 
 label
0    395
dtype: int64 



## <font color='dark'>Step 3 : Balance the data</font>

In [10]:
# Use train data to split
train_clean_df = train_clean_df.reset_index(drop=True)
# For test
test_clean_df = test_clean_df.append(train_clean_df[-400:]).reset_index(drop=True)
train_clean_df = train_clean_df[:-400]
# For validation
val_clean_df = val_clean_df.append(train_clean_df[-400:]).reset_index(drop=True)
train_clean_df = train_clean_df[:-400]

print("The label destribution of the data for ","train",": \n",train_clean_df.groupby("label").size(),'\n')
print("The label destribution of the data for ","test",": \n",test_clean_df.groupby("label").size(),'\n')
print("The label destribution of the data for ","val",": \n",val_clean_df.groupby("label").size(),'\n')

The label destribution of the data for  train : 
 label
0     435
1    2164
dtype: int64 

The label destribution of the data for  test : 
 label
0    407
1    400
dtype: int64 

The label destribution of the data for  val : 
 label
0    395
1    400
dtype: int64 



## <font color='dark'>Step 4 : Saved as CSV</font>

In [11]:
train_clean_df.to_csv("../data/train_clean_df.csv", index=None)
test_clean_df.to_csv("../data/test_clean_df.csv", index=None)
val_clean_df.to_csv("../data/val_clean_df.csv", index=None)

In [15]:
# Check

train_clean_df=pd.read_csv("../data/train_clean_df.csv")
test_clean_df=pd.read_csv("../data/test_clean_df.csv")
val_clean_df=pd.read_csv("../data/val_clean_df.csv")

print("The label destribution of the data for ","train",": \n",train_clean_df.groupby("label").size(),'\n')
print("The label destribution of the data for ","test",": \n",test_clean_df.groupby("label").size(),'\n')
print("The label destribution of the data for ","val",": \n",val_clean_df.groupby("label").size(),'\n')

The label destribution of the data for  train : 
 label
0     435
1    2164
dtype: int64 

The label destribution of the data for  test : 
 label
0    407
1    400
dtype: int64 

The label destribution of the data for  val : 
 label
0    395
1    400
dtype: int64 

