### Info:
* Not all available images are used (because of storage limitations), so the dataframes containing the image information are filtered down to the images that were actually uploaded
* CSV files are produced with each line containing: filename,xmin,ymin,xmax,ymax,class
* Based on the CSV files, TFRecord files for training, evaluation and testing are produced

In [1]:
import pandas as pd
import numpy as np
import os
import pathlib
from sklearn.model_selection import train_test_split

In [2]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem.
drive.mount('/content/gdrive')

# Project folder on Drive, as a relative path; used with Python APIs (os, pandas).
root_path = 'gdrive/My Drive/MLforphysicist'
# Same folder with the space backslash-escaped, for interpolation into `!` shell commands.
# Written as a raw string because "\ " is not a valid Python escape sequence; the value
# (backslash followed by a space) is the same as before.
root_path2 = r'gdrive/My\ Drive/MLforphysicist'

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
# Clone the tensorflow models repository if it doesn't already exist
if "models" in pathlib.Path.cwd().parts:
  while "models" in pathlib.Path.cwd().parts:
    os.chdir('..')
elif not pathlib.Path('models').exists():
  !git clone --depth 1 https://github.com/Blue-EyesChaosMaxDragon/models

In [4]:
%%bash
# Install the Object Detection API from the cloned models repository.
# The %%bash cell magic has to be the first line of the cell to be recognised by IPython,
# so this comment lives below it (as a bash comment).

# Stop at the first failing step instead of continuing in a half-prepared state.
set -e

cd models/research/
# Compile the protobuf definitions into Python modules.
protoc object_detection/protos/*.proto --python_out=.
# Use the TF2 packaging script as setup.py and install the package with pip.
cp object_detection/packages/tf2/setup.py .
python -m pip install .

Processing /content/models/research
Building wheels for collected packages: object-detection
  Building wheel for object-detection (setup.py): started
  Building wheel for object-detection (setup.py): finished with status 'done'
  Created wheel for object-detection: filename=object_detection-0.1-cp36-none-any.whl size=1533329 sha256=da782e1da462bc997416ad89f5196cb91381ac43e9488a356b3fff8dba125e0a
  Stored in directory: /tmp/pip-ephem-wheel-cache-5cl08l41/wheels/94/49/4b/39b051683087a22ef7e80ec52152a27249d1a644ccf4e442ea
Successfully built object-detection
Installing collected packages: object-detection
  Found existing installation: object-detection 0.1
    Uninstalling object-detection-0.1:
      Successfully uninstalled object-detection-0.1
Successfully installed object-detection-0.1




In [5]:
# HDF5 file path
store_path = "{}/image_dataset/frsign_v1.0.h5".format(root_path)

# Read both dataframes while the file is open. The context manager closes the file even
# if a read fails; `store` stays bound afterwards, so a later store.close() is a harmless
# no-op on the already-closed store.
with pd.HDFStore(store_path, mode='r') as store:
  df = store.select('dataframe')   # indexed by sequence
  im_df = store.select('images')   # indexed by (sequence, image)

In [6]:
# list containing all video folders uploaded to gdrive
folder_list = os.listdir("{}/image_dataset/images".format(root_path))

# Split every image path on "/"; column 0 is the first path component, which is compared
# against the uploaded folder names below.
splitted_path = im_df["fullpath"].str.split("/", expand=True)

# Keep only the images that live in one of the uploaded folders. The per-folder slices are
# collected first and concatenated once (folder_list order is preserved); growing a frame
# with DataFrame.append inside a loop copies all rows on every iteration and is no longer
# available in pandas >= 2.0.
images_per_folder = [im_df[splitted_path[0] == folder] for folder in folder_list]
new_im_df = pd.concat(images_per_folder) if images_per_folder else pd.DataFrame()

print(im_df)
print(new_im_df)

                                                         fullpath    x  ...   w    h
sequence image                                                          ...         
83       0      RecFile_1_20181011_153137_pointgrey_flycapture...  882  ...  15   21
         1      RecFile_1_20181011_153137_pointgrey_flycapture...  882  ...  15   21
         2      RecFile_1_20181011_153137_pointgrey_flycapture...  882  ...  15   21
         3      RecFile_1_20181011_153137_pointgrey_flycapture...  882  ...  15   21
         4      RecFile_1_20181011_153137_pointgrey_flycapture...  882  ...  15   21
...                                                           ...  ...  ...  ..  ...
1149     677    RecFile_1_20190124_135313_pointgrey_flycapture...  583  ...  75  177
         678    RecFile_1_20190124_135313_pointgrey_flycapture...  583  ...  75  177
         679    RecFile_1_20190124_135313_pointgrey_flycapture...  583  ...  75  177
         680    RecFile_1_20190124_135313_pointgrey_flycapture...

In [7]:
# Video file names as they appear in df["video_name"]: folder name + ".avi".
# Stored under a new name so folder_list is not overwritten; re-running this cell
# therefore never produces names ending in ".avi.avi".
video_names = [folder + ".avi" for folder in folder_list]

# Keep only the video sequences whose folder was uploaded. Slices are concatenated once
# (video_names order is preserved) instead of appending to a frame inside the loop.
sequences_per_video = [df[df["video_name"] == name] for name in video_names]
new_df = pd.concat(sequences_per_video) if sequences_per_video else pd.DataFrame()

print(df)
print(new_df)

         CameraInfo_bayerTileFormat  ... image_format
sequence                             ...             
83                             RGGB  ...         PNG8
124                            RGGB  ...         PNG8
128                            RGGB  ...         PNG8
129                            RGGB  ...         PNG8
164                            RGGB  ...         PNG8
...                             ...  ...          ...
3829                           RGGB  ...         PNG8
4132                           RGGB  ...         PNG8
3887                           GBRG  ...         PNG8
2208                           GBRG  ...         PNG8
1149                           RGGB  ...         PNG8

[393 rows x 14 columns]
         CameraInfo_bayerTileFormat  ... image_format
sequence                             ...             
380                            RGGB  ...         PNG8
382                            RGGB  ...         PNG8
352                            RGGB  ...         PNG8
355

In [8]:
# Shorten the target labels (column "type"): each verbose chassis description is
# replaced by its short code.
short_type_names = {
  "Chassis A (3 feux verticaux)": "A",
  "Chassis C (5 feux verticaux)": "C",
  "Chassis H (6 & 3 feux en S)": "H",
  "Chassis F (6 & 1 feux en L inverse)": "F",
  "Chassis R (6 feux dans un disque)": "R",
  "Chassis ID3 (3 feux horizontaux)": "ID3",
  "Chassis ID2 (2 feux horizontaux)": "ID2",
}

# Show the distinct labels before and after the replacement.
print(new_df["type"].unique())
new_df["type"] = new_df["type"].replace(short_type_names)
print(new_df["type"].unique())

['Chassis A (3 feux verticaux)' 'Chassis ID3 (3 feux horizontaux)'
 'Chassis H (6 & 3 feux en S)' 'Chassis C (5 feux verticaux)'
 'Chassis F (6 & 1 feux en L inverse)' 'Chassis ID2 (2 feux horizontaux)'
 'Chassis R (6 feux dans un disque)']
['A' 'ID3' 'H' 'C' 'F' 'ID2' 'R']


In [9]:
# Convert the boxes to PASCAL VOC style corner columns (xmin, ymin, xmax, ymax).
# "w" and "h" are turned into the far corner by adding the origin "x" / "y".
# Note: the values are not rescaled or normalized in this cell.
new_im_df["w"] = new_im_df["x"] + new_im_df["w"]
new_im_df["h"] = new_im_df["y"] + new_im_df["h"]

# Target variable for each image: look up the "type" of the sequence the image belongs to.
# A single label-based lookup replaces the per-row Python loop; like the loop, it raises a
# KeyError if a sequence is missing from new_df.
sequence = new_im_df.index.get_level_values("sequence")
target = new_df.loc[sequence, "type"].tolist()
new_im_df["class"] = target

# Rename all columns in one call to the names written to the label CSV files.
new_im_df = new_im_df.rename(columns={
  'fullpath': 'filename',
  'x': 'xmin',
  'y': 'ymin',
  'w': 'xmax',
  'h': 'ymax',
})


In [10]:
# Display the image table after the column conversion (filename, box corners, class).
new_im_df

Unnamed: 0_level_0,Unnamed: 1_level_0,filename,xmin,ymin,xmax,ymax,class
sequence,image,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
380,0,RecFile_1_20181115_092834_pointgrey_flycapture...,854,642,874,668,A
380,1,RecFile_1_20181115_092834_pointgrey_flycapture...,856,641,876,667,A
380,2,RecFile_1_20181115_092834_pointgrey_flycapture...,857,640,877,667,A
380,3,RecFile_1_20181115_092834_pointgrey_flycapture...,859,640,879,667,A
380,4,RecFile_1_20181115_092834_pointgrey_flycapture...,861,639,882,667,A
...,...,...,...,...,...,...,...
2643,208,RecFile_7_20180111_100948_pointgrey_flycapture...,1850,571,1945,684,A
2643,209,RecFile_7_20180111_100948_pointgrey_flycapture...,1856,569,1951,682,A
2643,210,RecFile_7_20180111_100948_pointgrey_flycapture...,1862,567,1957,680,A
2643,211,RecFile_7_20180111_100948_pointgrey_flycapture...,1862,567,1957,680,A


In [11]:
# train, eval, test split
N_SAMPLES = 6000   # number of images drawn from new_im_df
N_HOLDOUT = 500    # size of the eval set and of the test set
SEED = 42

# Seed the subsampling as well: without random_state a different subset is drawn on every
# run, so the split would not be reproducible even though train_test_split is seeded.
sampled_images = new_im_df.sample(n=N_SAMPLES, random_state=SEED)

# NOTE(review): rows are split per image, so frames of the same sequence can end up in
# different splits -- confirm this is intended.
traineval_sample, test_sample = train_test_split(sampled_images, test_size=N_HOLDOUT, random_state=SEED, shuffle=True)
train_sample, eval_sample = train_test_split(traineval_sample, test_size=N_HOLDOUT, random_state=SEED, shuffle=True)

# Write one label file per split; the (sequence, image) index is not written.
train_sample.to_csv("{}/image_dataset/train_labels.csv".format(root_path),index=False)
eval_sample.to_csv("{}/image_dataset/eval_labels.csv".format(root_path),index=False)
test_sample.to_csv("{}/image_dataset/test_labels.csv".format(root_path),index=False)

In [12]:
# Confirm that the label file of every split exists; stop with a ValueError at the first
# one that is missing.
for split_name in ("train", "eval", "test"):
  labels_csv = "{}/image_dataset/{}_labels.csv".format(root_path, split_name)
  if not os.path.isfile(labels_csv):
    raise ValueError("{} CSV file was not saved".format(split_name))
  print("{} CSV file was saved".format(split_name))

train CSV file was saved
eval CSV file was saved
test CSV file was saved


In [13]:
generator_file = root_path2+"/MLforphysicist/generate_tfrecord.py"
image_dir_path = root_path2+"/image_dataset/images"
train_label_path = root_path2+"/image_dataset/train_labels.csv"
train_record_path = root_path2+"/data/train.record"

!python {generator_file} --csv_input={train_label_path} --output_path={train_record_path} --image_dir={image_dir_path}

2020-07-20 19:28:57.407853: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
Successfully created the TFRecords: /content/gdrive/My Drive/MLforphysicist/data/train.record


In [14]:
eval_label_path = root_path2+"/image_dataset/eval_labels.csv"
eval_record_path = root_path2+"/data/eval.record"

!python {generator_file} --csv_input={eval_label_path} --output_path={eval_record_path} --image_dir={image_dir_path}

2020-07-20 20:43:55.367489: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
Successfully created the TFRecords: /content/gdrive/My Drive/MLforphysicist/data/eval.record


In [15]:
test_label_path = root_path2+"/image_dataset/test_labels.csv"
test_record_path = root_path2+"/data/test.record"

!python {generator_file} --csv_input={test_label_path} --output_path={test_record_path} --image_dir={image_dir_path}

2020-07-20 20:51:14.301505: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
Successfully created the TFRecords: /content/gdrive/My Drive/MLforphysicist/data/test.record


In [16]:

# Show the current working directory; the relative paths root_path / root_path2 are
# resolved against it.
os.getcwd()

'/content'

In [17]:
# Close the HDF5 store that the dataframes `df` and `im_df` were read from.
store.close()