In [1]:
import pandas as pd
import numpy as np
import csv
import os
import glob
import shutil

## Steps
1. Remove the old csv files from `csv` using the provided function `move_to_old_csv`
2. Place the new csv files into `csv`
3. Concatenate all the csv's into a single dataframe `master_df`
4. Perform data cleaning on the `master_df`
5. Write `master_df` to `master.csv`. This  will overwrite the old version.


In [2]:
csv_path=os.getcwd() + os.sep+"csv"
old_csv_path=os.getcwd() + os.sep+"old_csv"
image_path=os.getcwd() + os.sep+"images"
duplicate_path=r"C:\1_USGS\CoastSeg\repos\6_sniffer-classifer\duplicates2"
column_names=["Filename","Sorted"]
# Get master.csv which contains the training dataset 
# master_df=pd.read_csv('master.csv')

## Before You Begin Move old csv files out of csv (Optional Step)
---
- Run this step only if you want to move the old csv files out of `csv`
1. Move all the old files out of the folder `csv` and into folder `old_csv`

In [3]:
def move_to_old_csv(src,dst):
    """Moves all the csv files from src to dst"""
    for file in glob.glob1(csv_path+os.sep,"*csv"):
        print(file)
        file_src=os.path.join(src,file)
        file_dst=os.path.join(dst,file)
        # Overwrites the existing file
        shutil.move(file_src,file_dst)


In [5]:
# move_to_old_csv(csv_path,old_csv_path)

---

## Combine all the CSVs into a single dataframe
1. Read in the csv files as dataframes
2. Append each dataframe to the master dataframe
3. Return the master dataframe

In [6]:
def clean_df(df):
    """Takes in the name of the csv file to clean (ex. 'master.csv') and a list labels to one hot encode.
        Returns the modified dataframe.
    """
    if "index" in df.columns:
        print("Dropping column index")
        df.drop(['index'],axis=1,inplace=True)
    if "Index" in df.columns:
        print("Dropping column Index")
        df.drop(['Index'],axis=1,inplace=True)
    if "Unnamed: 0"in df.columns:
        print("Dropping column Unnamed: 0")
        df.drop(['Unnamed: 0'],axis=1,inplace=True)
    if "Unnamed: 0.1"in df.columns:
        print("Dropping column Unnamed: 0.1")
        df.drop(['Unnamed: 0.1'],axis=1,inplace=True)
    return df

In [7]:
def remove_duplicates(df,image_path:str):
    """Returns a df with removed rows with duplicate Filenames with mismatching "Sorted" values """
    if True in df.duplicated("Filename") :
        # Get names of all the duplicated filenames
        duplicates=df[df.duplicated("Filename",keep=False)]["Filename"]
        unequal_duplicates = [x for x in duplicates if len(set(df[df["Filename"] == x]["Sorted"])) > 1]
        # Keep rows with matching "Sorted" values
        mask = np.logical_not(df["Filename"].isin(unequal_duplicates))
        # Drop duplicated "Filename" rows with matching "Sorted" but keep the first occurrence
        new_df=df[mask].drop_duplicates(subset=["Filename"], keep='first')
        print(new_df)
    return new_df

In [8]:
def get_unequal_duplicates(df,image_path:str, duplicate_path:str):
    """Identifies the duplicated filenames from the df as well as moves the duplicated images from the provided path to the duplicate_path.
    Returns the a dataframe containing the duplicated images"""
    if True in df.duplicated("Filename") :
        # Get names of all the duplicated filenames
        duplicates=df[df.duplicated("Filename",keep=False)]["Filename"]
        unequal_duplicates = [x for x in duplicates if len(set(df[df["Filename"] == x]["Sorted"])) > 1]
        # Moves all the jpgs with duplicate file names and mismatching "Sorted" to the duplicate_path
        for image in set(unequal_duplicates):
            img=image_path+os.sep+image
            if os.path.exists(img):
                print(f"Moving duplicated {img} ")
                shutil.move(img, duplicate_path)
    return unequal_duplicates

In [9]:
def binary_encode_labels(df:'pandas.core.frame.DataFrame',labels=["bad","good"]):
    """ Returns the modified dataframe with the labels encoded as binary labels of 0 and 1"""
    mapping = {}
    df["Filename"]=df["Filename"].astype(str)
    df["Sorted"]=df["Sorted"].astype(str)
    if df["Sorted"].str.contains('|'.join( ["good","bad"])).any():
        for x in range(len(labels)):
            mapping[labels[x]] = x
        # outputs {'bad': 0, 'good': 1}
        # Replace each label in sorted with corresponding one hot encoded label
        for x in range(len(df['Sorted'])):
            df['Sorted'][x] = mapping[df['Sorted'][x]]
        df["Sorted"]=df["Sorted"].astype(str)
    return df

In [10]:
def create_master_csv(csv_path,create_master_csv=False):
    # Get master.csv which contains the training dataset 
    if create_master_csv:
        master_df=pd.DataFrame(columns=["Filename","Sorted"])
    else:
        master_df=pd.read_csv('transfer_master.csv')
    # Gather all the data from all the csv's into append_df
    append_df=pd.DataFrame()
    if os.path.exists(csv_path):
        for file in glob.glob(csv_path+os.sep+"*csv"):
            df=pd.read_csv(file)
            # append_df=append_df.append(df,ignore_index = True)
            append_df=pd.concat([append_df,df],axis=0,ignore_index=True)
        # Append the data from all the csv files to the masterdf
        master_df =pd.concat([master_df,append_df],axis=0,ignore_index=True)
    return master_df

In [11]:
master_df=create_master_csv(csv_path,create_master_csv=True)
master_df=clean_df(master_df)
duplicates=get_unequal_duplicates(master_df,image_path, duplicate_path)
print(duplicates)
master_df

Dropping column Unnamed: 0
Moving duplicated c:\1_USGS\CoastSeg\repos\6_sniffer-classifer\Sniffer-Classifier\images\2017-11-21-18-46-17_L8_rgb_img.jpg 
['2017-11-21-18-46-17_L8_rgb_img.jpg', '2017-11-21-18-46-17_L8_rgb_img.jpg']


Unnamed: 0,Filename,Sorted
0,2016-10-15-15-41-47_L8_rgb.jpg,good
1,2016-10-17-18-46-24_L8_rgb.jpg,good
2,2016-10-22-18-54-48_S2.jpg,good
3,2016-11-01-19-03-46_S2.jpg,good
4,2016-11-02-18-46-25_L8_rgb.jpg,good
...,...,...
2012,2017-11-12-18-52-31_L8_rgb.jpg,bad
2013,2017-11-20-15-37-50_L7_rgb.jpg,good
2014,2017-11-21-18-46-17_L8_rgb_img.jpg,bad
2015,2017-11-28-18-52-24_L8_rgb.jpg,good


In [12]:
master_df=remove_duplicates(master_df,image_path)
master_df

                            Filename Sorted
0     2016-10-15-15-41-47_L8_rgb.jpg   good
1     2016-10-17-18-46-24_L8_rgb.jpg   good
2         2016-10-22-18-54-48_S2.jpg   good
3         2016-11-01-19-03-46_S2.jpg   good
4     2016-11-02-18-46-25_L8_rgb.jpg   good
...                              ...    ...
1960  2016-09-22-18-52-29_L8_rgb.jpg   good
1961      2016-09-22-19-03-44_S2.jpg   good
1962  2016-10-01-18-46-19_L8_rgb.jpg   good
1963      2016-10-02-19-02-32_S2.jpg   good
1964  2016-10-08-18-52-33_L8_rgb.jpg   good

[1964 rows x 2 columns]


Unnamed: 0,Filename,Sorted
0,2016-10-15-15-41-47_L8_rgb.jpg,good
1,2016-10-17-18-46-24_L8_rgb.jpg,good
2,2016-10-22-18-54-48_S2.jpg,good
3,2016-11-01-19-03-46_S2.jpg,good
4,2016-11-02-18-46-25_L8_rgb.jpg,good
...,...,...
1960,2016-09-22-18-52-29_L8_rgb.jpg,good
1961,2016-09-22-19-03-44_S2.jpg,good
1962,2016-10-01-18-46-19_L8_rgb.jpg,good
1963,2016-10-02-19-02-32_S2.jpg,good


In [13]:
# Overwrite the old version of master.csv
master_df.to_csv("new_master.csv",index=False)

## Modify Existing Records in Master CSV
---

The following functions are for the senario where you need to modify an existing dataframe by replacing old rows with new rows.

In [None]:
# old csv is the csv file with the sorted column you want to modify
old_csv="test_dataset.csv"
# new csv is the csv file with the sorted column you want to want to replace old_csv with
new_csv="mod_test_dataset.csv"
# This is the name of csv file that the new dataframe will be saved to
new_csv_file_name="test_dataset.csv"
df=pd.read_csv(old_csv)
records=pd.read_csv(new_csv)

In [None]:
# What happens there are not matching enteries in the left dataframe
def modify_df(df:"pandas.core.frame.DataFrame",new_df:"pandas.core.frame.DataFrame"):
    """Returns a new dataframe with the sorted column values replaced by new_df's sorted values

    Args:
        df (pandas.core.frame.DataFrame): dataframe to have its sort values changed
        new_df (pandas.core.frame.DataFrame):  dataframe with the correct sort values

    Returns:
        pandas.core.frame.DataFrame: new dataframe containing the sort values in new_df
    """
    # Replace the rows df with the sort values in new_df.
    new_df=df.merge(new_df,on="Filename")
    if "index" in new_df.columns:
        print("Dropping column index")
        new_df.drop(['index'],axis=1,inplace=True)
    if "Index" in new_df.columns:
        print("Dropping column Index")
        new_df.drop(['Index'],axis=1,inplace=True)
    if "Unnamed: 0"in new_df.columns:
        print("Dropping column Unnamed: 0")
        new_df.drop(['Unnamed: 0'],axis=1,inplace=True)
    # Merge the sorted_x and sorted_y column into the sorted column. Sorted_y will overwrite sorted_x values
    new_df['Sorted']=new_df.loc[new_df['Sorted_y'].isnull(),'Sorted_y'] = new_df['Sorted_x']
    new_df.drop(['Sorted_y','Sorted_x'],axis=1,inplace=True)
    return new_df


In [None]:
new_df=modify_df(df,records)
new_df.to_csv(new_csv_file_name,index=False)
new_df

## Add the Filenames of all images in a folder to a csv

In [None]:
def create_csv_from_folder(folder_path:str,name:str,sort_type:str):
    """create_csv_from_folder creates a csv file with the provided name with filenames
    of the images in location specified by folder_path
    
    Args:
        folder_path (str): path to the location of the folder containing the images
        name (str): name of the csv file to create
    Returns:
        pandas.core.frame.DataFrame: new dataframe containing the sort values in new_df
    """
    df=pd.DataFrame(columns=["Filename","Sorted"])
    filenames=glob.glob1(folder_path,"*.jpg")
    df["Filename"]=filenames
    df["Sort"]=sort_type
    df.to_csv(name)
    print(f"Saved data to csv file {name}")
    

In [None]:
folder_path=r"C:\Users\Sharon\Downloads\hatteras_bad"
csv_name="hatteras_bad.csv"
sort_type="bad"
create_csv_from_folder(folder_path,csv_name,sort_type)