In [1]:
import pandas as pd
import numpy as np
import os
import warnings
from itertools import cycle
import math


In [2]:


## function 1: counts how many times each unique value occurs in a given column of a csv file
# @param[in] infilename: name of csv file
# @param[in] col: name of column of interest
# @param[in] outFile: exported file listing the unique values in column "col" and how many times each has occurred
from numpy import ceil


def exportUniqueValuesInColumn(infilename, col, outFile):
    """Count the occurrences of every distinct value in one CSV column.

    The input is read as latin-1 text with every column typed as pandas
    "string"; the per-value row counts are written to ``outFile``.

    :param infilename: path of the CSV file to read.
    :param col: name of the column whose values are counted.
    :param outFile: path of the CSV file that receives value / count pairs.
    """
    table = pd.read_csv(infilename, dtype="string", encoding='latin-1')
    occurrences = table.groupby(col).size()
    occurrences.to_csv(outFile)
    print ("EXIT SUCCESS")


def exportUniqueValuesInColumnOfPlots(infilename, colSpeciesAll, colPoltsS, infilename_nleve, colPoltsP, outFile_nleve):
    """Count rows per species, restricted to plots listed in a second file.

    :param infilename: CSV (latin-1) with one row per record; must contain
        ``colSpeciesAll`` and ``colPoltsS``.
    :param colSpeciesAll: column of ``infilename`` that is counted.
    :param colPoltsS: plot-ID column of ``infilename``.
    :param infilename_nleve: CSV listing the plots to keep.
    :param colPoltsP: plot-ID column of ``infilename_nleve``.
    :param outFile_nleve: output CSV with the species column and a 'Count' column.
    """
    records = pd.read_csv(infilename, encoding='latin-1')
    # Only the plot-ID column of the second file takes part in the join.
    wanted_plots = pd.read_csv(infilename_nleve)[[colPoltsP]]

    # Inner join: rows whose plot ID is absent from the second file are dropped.
    records_in_plots = records.merge(
        wanted_plots, how='inner', left_on=colPoltsS, right_on=colPoltsP
    )

    per_species = records_in_plots.groupby(colSpeciesAll).size()
    per_species = per_species.reset_index(name='Count')

    per_species.to_csv(outFile_nleve, index=False)
    print("File exported in ", outFile_nleve)

## function 1.5, finds if plots in tree csv really exist and then counts how many trees exist for each specie/genus
def groupAndExportCountToCSV(dfinput, col1, col2, plotsFile, plotsIDinPlotsFile, csv_filename):
    """Export the total number of rows per ``col2`` value for plots that exist.

    Rows of ``dfinput`` whose ``col1`` value does not appear in the
    ``plotsIDinPlotsFile`` column of ``plotsFile`` are discarded. The remaining
    rows are counted per (``col1``, ``col2``) pair and then summed per ``col2``.

    The previous version first wrote the per-pair counts to ``csv_filename``
    and immediately overwrote that file with the summed counts; only the final
    (summed) table is written now, so the resulting file is unchanged.

    :param dfinput: path of the CSV (latin-1) to count.
    :param col1: plot-ID column of ``dfinput``.
    :param col2: column of ``dfinput`` whose values are counted (e.g. species).
    :param plotsFile: path of the CSV (latin-1) listing the valid plots.
    :param plotsIDinPlotsFile: plot-ID column of ``plotsFile``.
    :param csv_filename: output CSV with columns ``col2`` and 'TotalCount'.
    """
    dfplots = pd.read_csv(plotsFile, encoding='latin-1', dtype=str)
    existingPlots = dfplots[plotsIDinPlotsFile].tolist()

    df = pd.read_csv(dfinput, encoding='latin-1', dtype="string")
    df = df[df[col1].isin(existingPlots)]

    # Count per (plot, class) first, then sum per class, exactly as before.
    per_plot_counts = df.groupby([col1, col2]).size().reset_index(name='Count')
    totals = per_plot_counts.groupby(col2)['Count'].sum().reset_index(name='TotalCount')
    totals.to_csv(csv_filename, index=False)


## function 1.7, reads every csv file in a directory and exports the number of data rows per file (one file per class)
def readFilesAndExportCountPerClassToCSV(inDir, outCsv):
    """Export the number of data rows of every CSV file found in ``inDir``.

    Each ``<name>.csv`` file contributes one output row: 'Class' is the file
    name without its extension and 'TotalCount' is the number of rows pandas
    reads from it (the header line is not counted).

    Fixes over the previous version:
      * no longer uses ``DataFrame.append`` (removed in pandas 2.0);
      * counts are written to the declared 'TotalCount' column instead of an
        extra 'LineCount' column that left 'TotalCount' empty;
      * the output file itself is skipped if it lives inside ``inDir``;
      * files are processed in sorted order so the output is reproducible.

    :param inDir: directory that is scanned (non-recursively) for '.csv' files.
    :param outCsv: path of the summary CSV to write.
    """
    out_abs = os.path.abspath(outCsv)
    records = []
    for file_name in sorted(os.listdir(inDir)):
        if not file_name.endswith('.csv'):
            continue
        full_file_path = os.path.join(inDir, file_name)
        if os.path.abspath(full_file_path) == out_abs:
            # A summary left over from a previous run is not a class file.
            continue
        records.append({
            'Class': os.path.splitext(file_name)[0],
            'TotalCount': len(pd.read_csv(full_file_path)),
        })

    # Passing the columns explicitly keeps the header even when no file matched.
    result_df = pd.DataFrame(records, columns=['Class', 'TotalCount'])
    result_df.to_csv(outCsv, index=False)


## function 2 finds the number of occurrences of each tree species per plot
# @param[in] inCsvPlotsFile: name of csv file containing plot data
# @param[in] colPlots: name of column containing the IDs of the plots
# @param[in] colSpecies: name of column containing the type of species
# @param[in] outFile: name of the exported csv file with the number of trees of each species per plot
# @param[in] plotsFile: the csv file containing the plot information (used to check if the plots noted in inCsvPlotsFile exist)
# @param[in] plotsIDinPlotsFile: the column in plotsFile containing the PlotID information
def exportPlotsWithDominantSpecies(inCsvPlotsFile, colPlots, plotsFile, plotsIDinPlotsFile, colSpecies, outFile):
    """Export a plot-by-species table of tree counts.

    Rows of ``inCsvPlotsFile`` whose plot ID is not listed in ``plotsFile`` are
    ignored. The output has one row per remaining plot, the plot-ID column
    first, then one column per value of ``colSpecies`` holding the number of
    rows for that plot/species pair (0 where the pair never occurs).

    :param inCsvPlotsFile: CSV (latin-1) with one row per tree.
    :param colPlots: plot-ID column of ``inCsvPlotsFile``.
    :param plotsFile: CSV (latin-1) listing the plots that exist.
    :param plotsIDinPlotsFile: plot-ID column of ``plotsFile``.
    :param colSpecies: species/genus column of ``inCsvPlotsFile``.
    :param outFile: path of the CSV to write.
    """
    known_plot_ids = pd.read_csv(plotsFile, encoding='latin-1', dtype=str)[plotsIDinPlotsFile].tolist()

    trees = pd.read_csv(inCsvPlotsFile, encoding='latin-1', dtype="string")
    trees_in_known_plots = trees[trees[colPlots].isin(known_plot_ids)]

    # Long format: one row per (plot, species) pair with its count.
    counts_long = (
        trees_in_known_plots
        .groupby([colPlots, colSpecies])
        .size()
        .reset_index(name='Count')
    )
    # Wide format: species become columns; missing pairs are filled with 0.
    counts_wide = (
        counts_long
        .pivot(index=colPlots, columns=colSpecies, values='Count')
        .fillna(0)
        .reset_index()
    )

    counts_wide.to_csv(outFile, index=False)
    print("File saved in: ", outFile)


## function 3 takes as input the output of function 2 and adds a column name "sum", which
#  contains the sum of trees per plot and then calculates the percentage of each specie/genera per plot
def getPercentageOfSpeciesPerPlot(inCsv, outCsv):
    """Convert per-plot species counts into fractions of the plot total.

    The first column of ``inCsv`` is treated as the plot identifier and every
    other column as a species count. Each count is divided by the row total
    and the result is written to ``outCsv`` together with a 'sumOfAllTrees'
    column holding that total.

    Fix over the previous version: 'sumOfAllTrees' was itself part of the
    columns being normalised, so it was divided by itself and exported as 1.0
    for every plot; it now keeps the real number of trees. A 'sumOfAllTrees'
    column already present in the input is not treated as a species.

    Plots whose total is 0 get NaN fractions (0/0), as before.

    :param inCsv: CSV (latin-1) with the plot ID first, then species counts.
    :param outCsv: path of the CSV to write.
    """
    df = pd.read_csv(inCsv, encoding='latin-1', low_memory=False)
    labels = list(df.columns)
    print(labels)

    species_columns = [name for name in labels[1:] if name != 'sumOfAllTrees']
    totals = df[species_columns].sum(axis=1)

    fractions = df.copy()
    if species_columns:
        # Row-wise division: every species column is divided by its plot total.
        fractions[species_columns] = df[species_columns].div(totals, axis=0)
    fractions['sumOfAllTrees'] = totals

    fractions.to_csv(outCsv, index=False)

    print("   *** getPercentageOfSpeciesPerPlot: exit success!   ***")


## function 4 takes as input the output of function 3 and creates one folder per purity threshold;
#  within each folder it stores one file per class listing the matching plot IDs:
#  (1) all the plots that contain only this class,
#  (2) all the plots that contain 75% or more of the specified class (plots containing only this class included),
#  (3) all the plots that contain 50% or more of this class (plots from (1) and (2) included)
#  param[in] inCsv the output of Function 3 containing the how many of each tree class exist in each plot in percentages
#  param[in] colPlots the column in inCsv defining the Plot ID 
#  param[in] outDir the directory where the new folders will be stored
def getPlotIDsPerClass(inCsv, colPlots, outDir):
    """Export, per class, the IDs of plots dominated by that class.

    Three sub-folders are created under ``outDir``: "100", "75" and "50".
    Each receives one ``<class>.csv`` per species column of ``inCsv``, with a
    single column (named after the class) listing the plot IDs whose fraction
    for that class is at least ~1.0, 0.75 and 0.5 respectively. The
    thresholds are cumulative: a pure plot is listed in all three folders.

    Fix over the previous version: the 75% folder path was built with "100",
    so the 75% lists were written into the "100" folder and then overwritten
    by the 100% lists; no "75" folder was ever produced. Folder creation and
    the final message are also done once instead of once per class.

    :param inCsv: CSV (latin-1) of per-plot class fractions; a
        'sumOfAllTrees' column, if present, is ignored.
    :param colPlots: plot-ID column of ``inCsv``.
    :param outDir: directory under which the threshold folders are created.
    """
    df = pd.read_csv(inCsv, encoding='latin-1', low_memory=False)
    if 'sumOfAllTrees' in df.columns:
        df = df.drop(columns=['sumOfAllTrees'])
    df = df.set_index(colPlots)

    # (folder name, minimum fraction). "100" tolerates floating-point rounding.
    thresholds = (("100", 0.99999999999), ("75", 0.75), ("50", 0.5))

    base_dir = outDir.rstrip(os.path.sep)
    out_dirs = {}
    for folder_name, _ in thresholds:
        out_dirs[folder_name] = os.path.join(base_dir, folder_name)
        os.makedirs(out_dirs[folder_name], exist_ok=True)

    for species_column in df.columns:
        for folder_name, minimum_fraction in thresholds:
            selected_plots = df.index[df[species_column] >= minimum_fraction].tolist()
            output_file = os.path.join(out_dirs[folder_name], f"{species_column}.csv")
            pd.DataFrame({species_column: selected_plots}).to_csv(output_file, index=False)

    print("files exported in directories: \n", out_dirs["50"], "\n", out_dirs["75"], "\n", out_dirs["100"])


## Function 5 used to merge Quercus - takes as input multiple csv files and merges them, replacing the header line with a new label
# - should work with multiple columns, just give as input the new labels comma separated
def mergeCsvFiles(directory_path, output_file, new_column_label):
    """Concatenate the data rows of all CSV files in a directory.

    ``output_file`` starts with ``new_column_label`` as its header line,
    followed by every line except the first (the header) of each '.csv' file
    in ``directory_path``.

    Fixes over the previous version:
      * a source file whose last line has no trailing newline no longer has
        that row fused with the first row of the next file;
      * ``output_file`` itself is skipped if it lives inside
        ``directory_path`` (e.g. left over from a previous run);
      * files are merged in sorted name order so the result is reproducible.

    :param directory_path: directory scanned (non-recursively) for '.csv' files.
    :param output_file: path of the merged CSV to write.
    :param new_column_label: header line for the merged file, without newline.
    """
    output_abs = os.path.abspath(output_file)
    files = sorted(
        file for file in os.listdir(directory_path)
        if file.endswith('.csv')
        and os.path.abspath(os.path.join(directory_path, file)) != output_abs
    )
    with open(output_file, 'w') as combined_file:
        combined_file.write(new_column_label+"\n")
        for file_name in files:
            file_path = os.path.join(directory_path, file_name)
            with open(file_path, 'r') as file:
                for line in file.readlines()[1:]:
                    # Guarantee one row per line even for an unterminated last row.
                    combined_file.write(line if line.endswith("\n") else line + "\n")
    print("New File saved in ", output_file)
    


## Function 6, takes as input a directory and reads all the csv files within that directory 
# and counts how many rows exist in each file minus one (for the header line)
def exportCsvSummary(directoryPath, outputFile):
    """Write a summary CSV with the number of data lines of each CSV file.

    For every '.csv' file directly inside ``directoryPath`` the summary holds
    the file name without extension ('File Name') and the number of lines
    after the first one ('Row Count').

    :param directoryPath: directory whose CSV files are summarised.
    :param outputFile: path of the summary CSV to write.
    """
    summary = {'File Name': [], 'Row Count': []}

    for entry in os.listdir(directoryPath):
        if not entry.endswith('.csv'):
            continue
        summary['File Name'].append(os.path.splitext(entry)[0])
        with open(os.path.join(directoryPath, entry), 'r') as handle:
            # Everything after the header line counts as a row.
            data_lines = handle.readlines()[1:]
        summary['Row Count'].append(len(data_lines))

    pd.DataFrame(summary).to_csv(outputFile, index=False)


## Function 7 takes as input two csv files (1) the list of the plots per genus/species
# (2) the csv with the features e.g., NDVI time-series with the PlotID data
# exports a subset of the 2nd file, only the rows whose PlotID is listed in the first file
def filter_rows_by_plot_ids(directoryPath, second_file_path, colPlotID, outFolder):
    """Split a feature table into one file per class using plot-ID lists.

    For each '.csv' file in ``directoryPath`` (a list of plot IDs below a
    header line) a file with the same name is written to ``outFolder``. It
    contains only the rows of ``second_file_path`` whose ``colPlotID`` value
    appears in that list. The feature table is read with every column as text,
    so IDs are compared as strings.

    Fixes over the previous version:
      * the comma-split IDs were computed but never used, so a list file with
        several comma-separated IDs per line matched nothing; every ID on a
        line is now used (single-column files behave exactly as before);
      * the same-folder guard compares normalised absolute paths, so a
        trailing separator can no longer bypass it.

    :param directoryPath: directory holding the per-class plot-ID lists.
    :param second_file_path: CSV with the features and a plot-ID column.
    :param colPlotID: plot-ID column of ``second_file_path``.
    :param outFolder: directory that receives the filtered files (created if
        missing); must differ from ``directoryPath``.
    :raises ValueError: if ``directoryPath`` and ``outFolder`` are the same
        directory, because the inputs would be overwritten.
    """
    if os.path.abspath(directoryPath) == os.path.abspath(outFolder):
        raise ValueError("Input and Outpuf files must be different!")

    files = [file for file in os.listdir(directoryPath) if file.endswith('.csv')]
    # Plot IDs with features
    data_df = pd.read_csv(second_file_path, dtype=str)

    os.makedirs(outFolder, exist_ok=True)

    for file_name in files:
        full_file_path = os.path.join(directoryPath, file_name)
        with open(full_file_path, 'r') as file:
            lines = [line.strip() for line in file.readlines()[1:]]
        # Flatten "a,b,c" lines into individual IDs and drop empty fragments.
        plot_ids = {
            plot_id.strip()
            for line in lines
            for plot_id in line.split(",")
            if plot_id.strip()
        }
        filtered_data_df = data_df[data_df[colPlotID].isin(plot_ids)]
        new_file_path = os.path.join(outFolder, file_name)
        filtered_data_df.to_csv(new_file_path, index=False)
    print("Files saved in ", outFolder)


## function 8 selects samples for training 
#  @param[in] inDir takes as input a directory with the .csv files containing the NDVI time-series and the plotIDs - each file has the name of the class (e.g. genus) + .csv 
#  @param[in] classList is a list of the classes of interest
#  @param[in] noOfSamples number of random samples to select per class of interest
#  @param[in] outCsv exports one csv file that contains all the randomly selected samples with their features (e.g., NDVI time-series); the column "class" stores the class (e.g., genus) each row belongs to 
def selectSamples(inDir,classList,noOfSamples,outCsv):
    """Draw a fixed-size random sample from each class file and merge them.

    For every class in ``classList`` the file ``<class>.csv`` in ``inDir`` is
    read, a 'class' column with the class name is added, and up to
    ``noOfSamples`` rows are drawn without replacement (seed 42). All samples
    are written to ``outCsv``.

    Fixes over the previous version:
      * a class with fewer than ``noOfSamples`` rows made
        ``DataFrame.sample`` raise, so the "has less than" warning could
        never be reached; such classes now contribute all their rows and the
        warning is emitted;
      * the unreachable "not found in classList" check is replaced by a
        warning for classes that have no file in ``inDir``;
      * samples are concatenated once, in sorted file order, instead of
        growing a DataFrame inside the loop.

    :param inDir: directory with one ``<class>.csv`` per class.
    :param classList: names of the classes to sample.
    :param noOfSamples: number of rows to draw per class.
    :param outCsv: path of the combined CSV to write.
    """
    wanted_files = {class_name + '.csv' for class_name in classList}
    available_files = sorted(file for file in os.listdir(inDir) if file in wanted_files)

    for missing_file in sorted(wanted_files.difference(available_files)):
        warnings.warn(f"Warning: no file {missing_file} found in {inDir}.")

    sampled_frames = []
    for file_name in available_files:
        class_name = file_name[:-len('.csv')]
        current_df = pd.read_csv(os.path.join(inDir, file_name))
        current_df['class'] = class_name
        print(file_name)

        rows_available = len(current_df)
        if rows_available < noOfSamples:
            warnings.warn(f"Warning: Class {class_name} has less than {noOfSamples} rows.")
        # Never request more rows than exist; sampling is without replacement.
        sampled_frames.append(
            current_df.sample(n=min(noOfSamples, rows_available), replace=False, random_state=42)
        )

    if sampled_frames:
        combined_df = pd.concat(sampled_frames, ignore_index=True)
    else:
        combined_df = pd.DataFrame()
    combined_df.to_csv(outCsv, index=False)
    print("File exported in ", outCsv)





## function 9 adds co-ordinates to the csv files according to the plotFile
def addCoordinates(inFile, plotsFile, plotsIDinPlotsFile, CX, CY, outfile):
    """Attach plot coordinates to every row of a CSV file.

    Each row of ``inFile`` is matched to ``plotsFile`` through the
    ``plotsIDinPlotsFile`` column, which must exist in both files. The ``CX``
    and ``CY`` values of the first matching plot are stored in columns of the
    same names (any existing columns with those names are replaced); rows
    without a match are left empty.

    Improvement over the previous version: the plots table was scanned once
    per input row (O(rows x plots)); a first-match lookup table is now built
    once and each row is resolved with a dictionary lookup. The written
    output is the same.

    :param inFile: CSV whose rows receive coordinates.
    :param plotsFile: CSV with plot IDs and coordinates.
    :param plotsIDinPlotsFile: plot-ID column name, shared by both files.
    :param CX: name of the first coordinate column in ``plotsFile``.
    :param CY: name of the second coordinate column in ``plotsFile``.
    :param outfile: path of the CSV to write.
    :raises ValueError: if ``CX`` or ``CY`` is missing from ``plotsFile``.
    """
    in_df = pd.read_csv(inFile)
    plots_df = pd.read_csv(plotsFile)

    if CX not in plots_df.columns or CY not in plots_df.columns:
        raise ValueError(f"Columns {CX} and {CY} not found in {plotsFile}")

    # First occurrence of every plot ID wins; missing IDs can never match.
    first_match = (
        plots_df
        .dropna(subset=[plotsIDinPlotsFile])
        .drop_duplicates(subset=[plotsIDinPlotsFile], keep='first')
        .set_index(plotsIDinPlotsFile)
    )
    cx_by_plot = first_match[CX].to_dict()
    cy_by_plot = first_match[CY].to_dict()

    # Read the IDs before the coordinate columns are (re)assigned.
    plot_ids = in_df[plotsIDinPlotsFile].tolist()
    # dtype=object keeps unmatched rows as None without upcasting integers.
    in_df[CX] = pd.Series([cx_by_plot.get(plot_id) for plot_id in plot_ids],
                          index=in_df.index, dtype=object)
    in_df[CY] = pd.Series([cy_by_plot.get(plot_id) for plot_id in plot_ids],
                          index=in_df.index, dtype=object)

    in_df.to_csv(outfile, index=False)

    print ("New file exported in ", outfile)

## function 10 takes as input a file and creates multiple subfiles according to a given class
def divideToSubfiles(inFile, classCol, outDir):
    """Split a CSV file into one CSV per distinct value of a column.

    Every distinct value of ``classCol`` produces ``<value>.csv`` inside
    ``outDir`` containing the rows that carry this value.

    :param inFile: path of the CSV to split.
    :param classCol: column whose values define the sub-files.
    :param outDir: directory for the sub-files; created when missing.
    """
    table = pd.read_csv(inFile)
    class_values = table[classCol].unique()

    if not os.path.exists(outDir):
        os.makedirs(outDir)

    for value in class_values:
        target_path = os.path.join(outDir, f"{value}.csv")
        rows_of_class = table.loc[table[classCol] == value]
        rows_of_class.to_csv(target_path, index=False)

    print("Subfiles exported in ", outDir)


## function 11 divides my test data into three datasets for cross validation (rows are assigned by their row index modulo 3, not randomly)
def divideToThreeForCrossValidation(inCsv, outDir):
    """Split a labelled CSV into three files for 3-fold cross validation.

    Rows are assigned by their position in ``inCsv``: row index modulo 3
    equal to 0, 1 or 2 goes to 'file_1.csv', 'file_2.csv' or 'file_3.csv'
    respectively. Within each output file the rows are grouped by class, in
    order of first appearance of the class. Rows with a missing 'class' value
    are not exported. The split is deterministic, not random.

    Fix over the previous version: rows were added one at a time with
    ``DataFrame.append``, which was removed in pandas 2.0 (and was quadratic);
    the folds are now selected with boolean masks, keeping the same row
    assignment and ordering.

    :param inCsv: CSV with a 'class' column.
    :param outDir: directory for the three output files; created when missing.
    """
    df = pd.read_csv(inCsv)
    unique_classes = df['class'].unique()
    os.makedirs(outDir, exist_ok=True)

    # Group rows class by class so each fold keeps the original output order.
    per_class = [df[df['class'] == unique_class] for unique_class in unique_classes]
    ordered = pd.concat(per_class) if per_class else df.iloc[0:0]

    # The original row index (not the position within the class) picks the fold.
    fold_of_row = ordered.index % 3
    for fold in range(3):
        fold_df = ordered[fold_of_row == fold]
        fold_df.to_csv(os.path.join(outDir, f'file_{fold + 1}.csv'), index=False)


## function 12, calculates KNN for the NDVI time-series
def classifyKNN (train1, train2, test1, test1_results, k, feature_columns=None):
    """Classify test rows with a distance-weighted k-nearest-neighbour vote.

    The two training files are concatenated. For every test row the distance
    to each training row is the mean absolute difference over the features
    that are present (not NaN) in both rows. The ``k`` nearest training rows
    vote with weight ``1 / distance`` (10000 when the distance is
    effectively zero); the class with the largest summed weight is stored in
    a new 'Results' column and the test table is written to ``test1_results``.

    Fixes over the previous version:
      * the feature loop started at index 1, so the first feature ('0_NDVI')
        was silently ignored;
      * a training row sharing no valid feature with the test row kept
        distance 0 and was therefore treated as a perfect match with weight
        10000; such rows now get an infinite distance and never vote;
      * distances are computed with numpy instead of growing a DataFrame
        inside a nested loop;
      * training rows without a class label are ignored.

    :param train1: path of the first training CSV.
    :param train2: path of the second training CSV.
    :param test1: path of the CSV to classify.
    :param test1_results: path of the output CSV (test rows plus 'Results').
    :param k: number of nearest neighbours that vote; must be at least 1.
    :param feature_columns: names of the numeric feature columns; defaults to
        '0_NDVI' ... '11_NDVI'.
    :raises ValueError: if ``k`` is smaller than 1 or no labelled training
        row is available.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    class_column = "class"
    if feature_columns is None:
        labels_of_interest = [f"{month}_NDVI" for month in range(12)]
    else:
        labels_of_interest = list(feature_columns)

    trainingData = pd.concat([pd.read_csv(train1), pd.read_csv(train2)], ignore_index=True)
    trainingData = trainingData.dropna(subset=[class_column])
    if len(trainingData) == 0:
        raise ValueError("Training data contains no labelled rows; cannot classify.")
    testingData = pd.read_csv(test1)

    train_features = trainingData[labels_of_interest].to_numpy(dtype=float)
    train_classes = trainingData[class_column].tolist()
    test_features = testingData[labels_of_interest].to_numpy(dtype=float)

    predictions = []
    for test_row in test_features:
        # NaN in either row propagates to the difference and marks it unusable.
        abs_diff = np.abs(train_features - test_row)
        usable = ~np.isnan(abs_diff)
        usable_count = usable.sum(axis=1)
        diff_sum = np.where(usable, abs_diff, 0.0).sum(axis=1)

        # Rows with no shared feature stay at +inf so they sort last.
        distances = np.full(len(train_classes), np.inf)
        has_overlap = usable_count > 0
        distances[has_overlap] = diff_sum[has_overlap] / usable_count[has_overlap]

        # Stable sort: ties keep training-file order, making results reproducible.
        nearest = np.argsort(distances, kind="stable")[:k]

        votes = {}
        for train_pos in nearest:
            distance = distances[train_pos]
            if not np.isfinite(distance):
                continue
            weight = 10000 if distance <= 0.000000000001 else 1.0 / distance
            label = train_classes[train_pos]
            votes[label] = votes.get(label, 0.0) + weight

        # No comparable neighbour at all -> leave the prediction empty.
        predictions.append(max(votes, key=votes.get) if votes else None)

    testingData["Results"] = predictions
    testingData.to_csv(test1_results, index=False)

    print("Classified results are given in ", test1_results)




#### FOREST TYPES #######
## method that saves a csv file for each unique label in column column_name
def group_rows_by_label(input_csv, column_name,outDir):
    """Write one CSV per distinct label of a column into ``outDir``.

    Every distinct value of ``column_name`` produces ``<label>.csv`` inside
    ``outDir`` containing the rows that carry this label.

    Fix over the previous version: the output path was built as
    ``outDir + "<label>.csv"`` with no path separator, so unless the caller
    passed a trailing slash the files were written next to ``outDir`` (as
    e.g. 'outPinus.csv') instead of inside it. Callers that do pass a
    trailing separator get the same paths as before.

    :param input_csv: path of the CSV to split.
    :param column_name: column whose values define the output files.
    :param outDir: directory for the output files; created when missing.
    """
    os.makedirs(outDir, exist_ok=True)
    df = pd.read_csv(input_csv)
    for label in df[column_name].unique():
        label_df = df[df[column_name] == label]
        output_filename = os.path.join(outDir, f"{label}.csv")
        print(output_filename)
        label_df.to_csv(output_filename, index=False)
    print("files saved in ", outDir)





def calculate_and_save_metrics(input_file, class_column, results_column, output_file):
    """Compute one-vs-rest classification metrics per class and save them.

    For each distinct value of ``class_column`` the confusion counts
    (TP, TN, FP, FN) against ``results_column`` are derived, followed by
    precision, recall and F1 score (each 0 when its denominator is 0). The
    table is printed and written to ``output_file``.

    :param input_file: CSV holding the true and the predicted class per row.
    :param class_column: column with the true class.
    :param results_column: column with the predicted class.
    :param output_file: path of the metrics CSV to write.
    """
    table = pd.read_csv(input_file)
    truth = table[class_column]

    rows = []
    for label in truth.unique():
        predicted = table[results_column]
        is_actual = truth == label
        not_actual = truth != label
        is_predicted = predicted == label
        not_predicted = predicted != label

        tp = (is_actual & is_predicted).sum()
        tn = (not_actual & not_predicted).sum()
        fp = (not_actual & is_predicted).sum()
        fn = (is_actual & not_predicted).sum()

        # Guard every ratio against an empty denominator.
        if (tp + fp) > 0:
            precision = tp / (tp + fp)
        else:
            precision = 0
        if (tp + fn) > 0:
            recall = tp / (tp + fn)
        else:
            recall = 0
        if (precision + recall) > 0:
            f1_score = 2 * (precision * recall) / (precision + recall)
        else:
            f1_score = 0

        rows.append(dict([
            ('Class', label),
            ('TP', tp),
            ('TN', tn),
            ('FP', fp),
            ('FN', fn),
            ('Precision', precision),
            ('Recall', recall),
            ('F1 Score', f1_score),
        ]))

    metrics_df = pd.DataFrame(rows)

    print (metrics_df)
    metrics_df.to_csv(output_file, index=False)
    print ("File saved in ", output_file)
