In [None]:
# Importing necessary libraries
import os
import pandas as pd

FULL_DATASET_PATH = '/data_volume/dataset/images/'
EXISTING_DATASET_PATH = '/data_volume/dataset/sample_classifier_images/'

In [169]:
def generate_full_dataset_df(path):
    '''
    Generates dataframe with make. model, file name and original path of the training dataset
    '''
    df = pd.DataFrame(columns=['make', 'model', 'file_name', 'original_path'])
    full_data = []
    for root, dirs, files in os.walk(path):        
        for file in files:
            row = {}          
            row['make']=((root.split('/')[-1]).split('_')[0])
            row['model']=((root.split('/')[-1]).split('_')[1])
            row['file_name']=(file)
            row['original_path']=(os.path.join(root, file))
            full_data.append(row)
    df= pd.DataFrame.from_dict(full_data)
    return df

In [170]:
# Passing the function to the images
full_dataset_df = generate_full_dataset_df(FULL_DATASET_PATH)

In [171]:
# Showing the data created
full_dataset_df.head()

Unnamed: 0,make,model,file_name,original_path
0,abarth,124-spider,892_11_d68b8307-2005-415e-9b5a-1e0029736fee_51...,/data_volume/dataset/images/abarth_124-spider_...
1,abarth,124-spider,889_2_a278a24b-3424-cf46-e053-e250040a4900_bf2...,/data_volume/dataset/images/abarth_124-spider_...
2,abarth,124-spider,890_5_997aaade-0ccc-49d5-8193-199e294c9a5c_b52...,/data_volume/dataset/images/abarth_124-spider_...
3,abarth,124-spider,889_10_a278a24b-3424-cf46-e053-e250040a4900_d5...,/data_volume/dataset/images/abarth_124-spider_...
4,abarth,124-spider,890_2_997aaade-0ccc-49d5-8193-199e294c9a5c_c11...,/data_volume/dataset/images/abarth_124-spider_...


In [173]:
# Downloading pickled images
pickles_df = pd.read_csv("pickled_images.csv", index_col = 0)

In [174]:
# Showing the data created
pickles_df.head()

Unnamed: 0_level_0,pickle_path,make,model,original_path
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,/data_volume/dataset/pickled_images/abarth/124...,abarth,124-spider,/data_volume/dataset/sample_dataset_images/aba...
2,/data_volume/dataset/pickled_images/abarth/124...,abarth,124-spider,/data_volume/dataset/sample_dataset_images/aba...
3,/data_volume/dataset/pickled_images/abarth/124...,abarth,124-spider,/data_volume/dataset/sample_dataset_images/aba...
4,/data_volume/dataset/pickled_images/abarth/124...,abarth,124-spider,/data_volume/dataset/sample_dataset_images/aba...
5,/data_volume/dataset/pickled_images/abarth/124...,abarth,124-spider,/data_volume/dataset/sample_dataset_images/aba...


In [175]:
# Creating a new column with the file name
pickles_df['file_name'] = pickles_df['original_path'].str.split("/").str.get(-1)
pickles_df.head()

Unnamed: 0_level_0,pickle_path,make,model,original_path,file_name
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,/data_volume/dataset/pickled_images/abarth/124...,abarth,124-spider,/data_volume/dataset/sample_dataset_images/aba...,851_4_c65c63c0-9318-4c26-9410-479799f0bab9_dcd...
2,/data_volume/dataset/pickled_images/abarth/124...,abarth,124-spider,/data_volume/dataset/sample_dataset_images/aba...,888_15_a34f742d-359b-4244-977c-135d8a35282a_44...
3,/data_volume/dataset/pickled_images/abarth/124...,abarth,124-spider,/data_volume/dataset/sample_dataset_images/aba...,905_1_7d71c74b-ed3d-c603-e053-e250040a6889_63f...
4,/data_volume/dataset/pickled_images/abarth/124...,abarth,124-spider,/data_volume/dataset/sample_dataset_images/aba...,874_9_c0ada72f-870f-4f1b-8344-63e8e4b3adaf_360...
5,/data_volume/dataset/pickled_images/abarth/124...,abarth,124-spider,/data_volume/dataset/sample_dataset_images/aba...,874_7_c0ada72f-870f-4f1b-8344-63e8e4b3adaf_08e...


In [176]:
# Merging the datasets
unprocessed_images_df = full_dataset_df.merge(pickles_df, how='left', left_on='file_name', right_on='file_name', 
                sort=False, suffixes=('_x', '_y'), copy=True, indicator=False, validate=None)

In [177]:
# Showing the data created
unprocessed_images_df.head()

Unnamed: 0,make_x,model_x,file_name,original_path_x,pickle_path,make_y,model_y,original_path_y
0,abarth,124-spider,892_11_d68b8307-2005-415e-9b5a-1e0029736fee_51...,/data_volume/dataset/images/abarth_124-spider_...,,,,
1,abarth,124-spider,889_2_a278a24b-3424-cf46-e053-e250040a4900_bf2...,/data_volume/dataset/images/abarth_124-spider_...,,,,
2,abarth,124-spider,890_5_997aaade-0ccc-49d5-8193-199e294c9a5c_b52...,/data_volume/dataset/images/abarth_124-spider_...,,,,
3,abarth,124-spider,889_10_a278a24b-3424-cf46-e053-e250040a4900_d5...,/data_volume/dataset/images/abarth_124-spider_...,,,,
4,abarth,124-spider,890_2_997aaade-0ccc-49d5-8193-199e294c9a5c_c11...,/data_volume/dataset/images/abarth_124-spider_...,,,,


In [178]:
# Checking hte amount of NaNs of unprocessed_images_df
unprocessed_images_df.pickle_path.isna().sum()

1405130

In [179]:
# Creating a dataframe of images that still have not been used for training or testing
unprocessed_images_df2= unprocessed_images_df[unprocessed_images_df['pickle_path'].isnull()]
len(unprocessed_images_df2)

1405130

In [180]:
# Showing the data created
unprocessed_images_df2.head()

Unnamed: 0,make_x,model_x,file_name,original_path_x,pickle_path,make_y,model_y,original_path_y
0,abarth,124-spider,892_11_d68b8307-2005-415e-9b5a-1e0029736fee_51...,/data_volume/dataset/images/abarth_124-spider_...,,,,
1,abarth,124-spider,889_2_a278a24b-3424-cf46-e053-e250040a4900_bf2...,/data_volume/dataset/images/abarth_124-spider_...,,,,
2,abarth,124-spider,890_5_997aaade-0ccc-49d5-8193-199e294c9a5c_b52...,/data_volume/dataset/images/abarth_124-spider_...,,,,
3,abarth,124-spider,889_10_a278a24b-3424-cf46-e053-e250040a4900_d5...,/data_volume/dataset/images/abarth_124-spider_...,,,,
4,abarth,124-spider,890_2_997aaade-0ccc-49d5-8193-199e294c9a5c_c11...,/data_volume/dataset/images/abarth_124-spider_...,,,,


In [183]:
# Dropping columns and renaming other remaining to clean the dataset
unprocessed_images_df2.drop(columns=['model_y','make_y', 'original_path_y'], inplace = True)
unprocessed_images_df2.rename(columns={'model_x':'model', 'original_path_x':'original_path', 'make_x':'make'}, inplace = True)
unprocessed_images_df2.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unprocessed_images_df2.rename(columns={'model_x':'model', 'original_path_x':'original_path', 'make_x':'make'}, inplace = True)


Unnamed: 0,make,model,file_name,original_path,pickle_path
0,abarth,124-spider,892_11_d68b8307-2005-415e-9b5a-1e0029736fee_51...,/data_volume/dataset/images/abarth_124-spider_...,
1,abarth,124-spider,889_2_a278a24b-3424-cf46-e053-e250040a4900_bf2...,/data_volume/dataset/images/abarth_124-spider_...,
2,abarth,124-spider,890_5_997aaade-0ccc-49d5-8193-199e294c9a5c_b52...,/data_volume/dataset/images/abarth_124-spider_...,
3,abarth,124-spider,889_10_a278a24b-3424-cf46-e053-e250040a4900_d5...,/data_volume/dataset/images/abarth_124-spider_...,
4,abarth,124-spider,890_2_997aaade-0ccc-49d5-8193-199e294c9a5c_c11...,/data_volume/dataset/images/abarth_124-spider_...,


In [184]:
# Saving the new dataframe to a .csv file
unprocessed_images_df2.to_csv('unprocessed_files.csv')