In [1]:
import pandas as pd
import numpy as np
import csv
import os
import glob
import shutil

## Steps
1. Remove the old csv files from `csv` using the provided function `move_to_old_csv`
2. Place the new csv files into `csv`
3. Concatenate all the csv's into a single dataframe `master_df`
4. Perform data cleaning on the `master_df`
5. Write `master_df` to `master.csv`. This  will overwrite the old version.


In [5]:
csv_path=os.getcwd() + os.sep+"csv"
old_csv_path=os.getcwd() + os.sep+"old_csv"
image_path=os.getcwd() + os.sep+"images"
column_names=["Filename","Sorted"]
# Get master.csv which contains the training dataset 
master_df=pd.read_csv('master.csv')

## Before You Begin Move old csv files out of csv (Optional Step)
---
- Run this step only if you want to move the old csv files out of `csv`
1. Move all the old files out of the folder `csv` and into folder `old_csv`

In [3]:
def move_to_old_csv(src,dst):
    """Moves all the csv files from src to dst"""
    for file in glob.glob1(csv_path+os.sep,"*csv"):
        print(file)
        file_src=os.path.join(src,file)
        file_dst=os.path.join(dst,file)
        # Overwrites the existing file
        shutil.move(file_src,file_dst)


In [4]:
move_to_old_csv(csv_path,old_csv_path)

---

## Combine all the CSVs into a single dataframe
1. Read in the csv files as dataframes
2. Append each dataframe to the master dataframe
3. Return the master dataframe

In [6]:
def clean_df(df):
    """Takes in the name of the csv file to clean (ex. 'master.csv') and a list labels to one hot encode.
        Returns the modified dataframe.
    """
    if "index" in df.columns:
        print("Dropping column index")
        df.drop(['index'],axis=1,inplace=True)
    if "Index" in df.columns:
        print("Dropping column Index")
        df.drop(['Index'],axis=1,inplace=True)
    if "Unnamed: 0"in df.columns:
        print("Dropping column Unnamed: 0")
        df.drop(['Unnamed: 0'],axis=1,inplace=True)
    if "Unnamed: 0.1"in df.columns:
        print("Dropping column Unnamed: 0.1")
        df.drop(['Unnamed: 0.1'],axis=1,inplace=True)
    return df

In [7]:
def removed_duplicates(df,image_path:str):
    """Removes the duplicated filenames from the df as well as delete the duplicated images from the provided path.
    Returns the modified dataframe"""
    if True in master_df.duplicated("Filename") :
        print("list of duplicate filenames:")
        print(master_df[master_df.duplicated("Filename")]["Filename"])
        # Get names of all the duplicated filenames
        duplicates=master_df[master_df.duplicated("Filename",keep=False)]["Filename"]
        # Drop the duplicated filenames from the master df
        master_df.drop_duplicates(subset='Filename',keep=False, inplace = True)
        # Delete all the jpgs with duplicate file names
        for image in duplicates:
            img=image_path+os.sep+image
            print(f"{img} ")
            if os.path.exists(img):
                print(f"Removing duplicated {img} ")
                os.remove(img) 
    return master_df 

In [8]:
def create_master_csv(csv_path):
    # Get master.csv which contains the training dataset 
    master_df=pd.read_csv('master.csv')
    # Gather all the data from all the csv's into append_df
    append_df=pd.DataFrame()
    for file in glob.glob(csv_path+os.sep+"*csv"):
        df=pd.read_csv(file)
        append_df=append_df.append(df,ignore_index = True)
    # Append the data from all the csv files to the masterdf
    master_df =pd.concat([master_df,append_df],axis=0,ignore_index=True)
    return master_df

In [11]:
master_df=create_master_csv(csv_path)
master_df=clean_df(master_df)
master_df=removed_duplicates(master_df,image_path)
master_df

Dropping column Index
Dropping column Unnamed: 0
list of duplicate filenames:
Series([], Name: Filename, dtype: object)


Unnamed: 0,Filename,Sorted
0,2000-01-07-18-21-07_L5_rgb.jpg,bad
1,2000-04-12-18-20-29_L5_rgb.jpg,bad
2,2000-04-28-18-21-24_L5_rgb.jpg,good
3,2000-08-02-18-23-18_L5_rgb.jpg,bad
4,2000-08-18-18-23-46_L5_rgb.jpg,good
...,...,...
1511,2018-12-31-16-35-16_S2_ID12022-05-09.jpg,bad
1512,2018-12-31-16-35-16_S2_ID52022-05-09.jpg,bad
1513,2018-12-31-16-35-16_S2_ID32022-05-09.jpg,bad
1514,2018-12-31-16-35-16_S2_ID42022-05-09.jpg,bad


In [12]:
# Overwrite the old version of master.csv
master_df.to_csv("master.csv")