In [None]:
import myServices as ms
import models as md
import os
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,roc_curve, auc, roc_auc_score, f1_score
import joblib
import umap

In [None]:
# To measure execution time, wrap the code in the context manager:
# with timeit():
#     # your code here
class timeit():
    """Context manager that prints the wall-clock runtime of its ``with`` body.

    On exit it prints ``runtime: <timedelta>`` and keeps the same value in
    ``self.elapsed``. Exceptions raised inside the body are not suppressed.
    """
    # Imported in the class body, so the class is reachable as ``self.datetime``.
    from datetime import datetime

    def __enter__(self):
        """Record the start time and return the timer instance."""
        self.tic = self.datetime.now()
        # Returning self lets `with timeit() as t:` bind the timer object.
        return self

    def __exit__(self, *args, **kwargs):
        """Store and print the time elapsed since ``__enter__``."""
        self.elapsed = self.datetime.now() - self.tic
        print('runtime: {}'.format(self.elapsed))

## Importing and manipulating datasets

In [None]:
# List the names of the scorers available in scikit-learn.
sklearn.metrics.get_scorer_names()

## Reading hydra logs

In [None]:

def isKeyInLog(log, key:str=''):
    """Tell whether ``key`` appears as a dash-delimited field in a log file.

    Every line of ``log`` is split on '-'. The key is considered present when
    a resulting field equals ``key`` exactly or equals ``key`` surrounded by
    one space on each side.

    Args:
        log: path of the log file to read.
        key: field text to look for.

    Returns:
        True if a matching field exists, False otherwise.
    """
    padded = ' ' + key + ' '
    # The `with` block closes the file, including on the early exit of any().
    with open(log, 'r') as handle:
        return any(
            field in (key, padded)
            for line in handle
            for field in line.split('-')
        )

def getKeyContentFromLog(log, key:str=''):
    """Return the field that follows ``key`` in a ' - ' delimited log file.

    Each line is stripped and split on ' - '. The first field that equals
    ``key`` (or ``key`` padded with one space on each side) and has a
    following field determines the result.

    The file is scanned once with a single tokenisation rule, so a key at the
    start of a line is found too, and a key that is the last field of a line
    is skipped instead of raising IndexError.

    Args:
        log: path of the log file to read.
        key: field text to look for.

    Returns:
        The text of the field right after the key, or '' when no such field exists.
    """
    padded = ' ' + key + ' '
    with open(log, 'r') as f:
        for line in f:
            fields = line.strip().split(' - ')
            # Only positions that have a successor can yield a value.
            for i in range(len(fields) - 1):
                if fields[i] == key or fields[i] == padded:
                    return fields[i + 1]
    return ''

def scrapeLog(log,keyList:[]):
    """Look up every key of ``keyList`` in one log file.

    Args:
        log: path of the log file.
        keyList: keys to look up.

    Returns:
        Dict mapping each key to the result of ``getKeyContentFromLog(log, key)``.
    """
    return {key: getKeyContentFromLog(log, key) for key in keyList}

def add_row_to_excel(dictionary, filename):
    '''
    Append one row to an Excel file, creating the file when needed.

    The existing sheet is read with pd.read_excel, the dictionary is appended as a
    new row, and the whole table is written back with df.to_excel(). If the file is
    missing, or cannot be parsed as an Excel workbook (damaged), a new one is
    created in its place, so any content of a damaged file is overwritten.

    Args:
        dictionary: mapping of column name -> value for the new row.
        filename: path of the Excel file to update.
    '''
    import zipfile  # local import: only needed to recognise a corrupt .xlsx archive

    # Open the Excel file
    try:
        df = pd.read_excel(filename)
    except FileNotFoundError:
        # If the file does not exist, create an empty DataFrame
        df = pd.DataFrame()
    except (ValueError, zipfile.BadZipFile) as err:
        # Damaged or unrecognisable workbook: start over, as the docstring promises.
        print('Could not read {} ({}); a new file will be written.'.format(filename, err))
        df = pd.DataFrame()

    # Convert the dictionary to a one-row DataFrame
    new_row = pd.DataFrame([dictionary])

    # Append the new row to the DataFrame
    df = pd.concat([df, new_row], ignore_index=True)

    # Write the DataFrame back to the Excel file
    df.to_excel(filename, index=False)


In [None]:
# Keys whose values are extracted from every log file.
keyList = ['model','model name','Epochs','Batch Size','metric','Test metric','Data Set']
# Folder that is searched for log files (absolute path on the author's machine).
folderToScrape= r'C:\Users\abfernan\CrossCanFloodMapping\FloodProbabRNCanAbd\multirun'

# NOTE(review): subStr is not used anywhere in this cell.
subStr = 'executeModels.log'
# Excel workbook that receives one row per scraped log.
excellPath = os.path.join(folderToScrape,'logSummaryMultirun.xlsx') 
# presumably the full paths of all '.log' files under the folder — verify in myServices
listOfLosgs = ms.listALLFilesInDirByExt_fullPath(folderToScrape,'.log')

# add_row_to_excel re-reads and re-writes the workbook on every iteration.
for log in listOfLosgs:
    ditToWrite = scrapeLog(log,keyList)
    add_row_to_excel(ditToWrite,excellPath)



### Data Analysis

In [None]:
# CSV with the class-5 training table (absolute path on the author's machine).
class1_Tr = r'C:\Users\abfernan\CrossCanFloodMapping\FloodMappingProjData\HRDTMByAOI\A_DatasetsForMLP\RegionalModelingExplorationDatasets\TrainingDataset_RastComb_class5.csv'
# class1_Val = r'C:\Users\abfernan\CrossCanFloodMapping\FloodMappingProjData\HRDTMByAOI\A_DatasetsForMLP\RegionalModelingExplorationDatasets\ValidationSet_RastComb_Normalized_class1.csv'

class1_tr_DS =  pd.read_csv(class1_Tr,index_col=None)
# class1_Val_DS =  pd.read_csv(class1_Val,index_col=None)

# reader = pd.concat([class1_tr_DS,class1_Val_DS])#reader = class1_tr_DS  # 
# Summary statistics produced by DataFrame.describe().
descriptor = class1_tr_DS.describe()

print(descriptor)

# dfDescriptorXclx = r'C:\Users\abfernan\CrossCanFloodMapping\FloodProbabRNCanAbd\outputs\Descriptor_class5_Full_Normalized.xlsx'

# descriptor.to_excel(dfDescriptorXclx, index=True)


In [None]:
# Bounds used by the optional min-max scaling of 'Cilp' below
# (presumably the global elevation range — TODO confirm).
# Named ELEV_MAX / ELEV_MIN so the builtins max() and min() are not shadowed.
ELEV_MAX = 2238.
ELEV_MIN = 1.0

# Start from a copy of the training table so that this cell runs on a fresh
# kernel; `reader` was previously only assigned by a cell further down.
reader = class1_tr_DS.copy()
# reader['Elev'] = (class1_tr_DS['Cilp']-ELEV_MIN)/(ELEV_MAX-ELEV_MIN)

# Expose the raw 'Cilp' values under the name 'Elev'.
reader['Elev'] = class1_tr_DS['Cilp']

listeNames = ['Elev','GMorph','FloodOrd','Slope','d8fllowAcc','HAND','proximity','Labels']

# Keep only the columns used by the plots that follow.
reader = reader[listeNames]
reader.describe()

In [None]:
#### Correlation matrix (DataFrame.corr() returns correlations, not covariances)
fig, ax = plt.subplots(figsize=(15, 10))
# sns.set(font_scale=1.5)
matrix = reader.corr().round(2)
# Draw on the axes created above so the figure size is honoured explicitly.
sns.heatmap(matrix, annot=True, linewidth=1, ax=ax)
ax.set_title('Correlation matrix');

In [None]:
### Pairplot ###
# Pairwise scatter plots of the columns of `reader`, coloured by 'Labels',
# with KDE curves on the diagonal; corner=True draws only the lower triangle.
sns.set(font_scale=1.5)
sns.pairplot(reader, hue = 'Labels', diag_kind = 'kde', 
             plot_kws = {'alpha': 0.8, 's': 100},
             height = 4, corner=True, palette = "Set2")# vars = ['life_exp', 'log_pop', 'log_gdp_per_cap'],

# sns.pairplot(DS, hue="percentage")

In [None]:
# listeNames = ['Elev','RelElev','GMorph','FloodOrd','Slope','d8fllowAcc','HAND','proximity','Labels', 'Aoi_Id']

# `reader` is reduced to a column list without 'RelElev' in an earlier cell, so
# plot from the source table instead and expose 'Cilp' under the 'Elev' label.
# (assumes class1_tr_DS carries 'RelElev' and 'Labels' — TODO confirm)
elevVsRelElev = class1_tr_DS.rename(columns={'Cilp': 'Elev'})

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=elevVsRelElev, x='Elev', y='RelElev', hue='Labels', palette='Set2', ax=ax)

In [None]:
# Load the class-5 stratified sample; this rebinds `reader` for the cells below.
class1_Tr = r'C:\Users\abfernan\CrossCanFloodMapping\FloodMappingProjData\HRDTMByAOI\A_DatasetsForMLP\StratifiedSampling\class5_Full.csv'
reader =  pd.read_csv(class1_Tr,index_col=None)
# Total number of rows.
count = reader.shape[0]
print(count)
# Rows with Labels == 1 and their fraction of the table.
positives = reader[reader['Labels'] == 1].shape[0]
posPercent = positives/count

# Rows with Labels == 0 and their fraction of the table.
negatives = reader[reader['Labels'] == 0].shape[0]
# NOTE(review): this prints the total row count a second time.
print(count)
negPercent = negatives/count

print(positives, posPercent)
print(negatives, negPercent)


In [None]:
def draw_umap(data, n_neighbors=15, min_dist=0.1, n_components=2, metric='euclidean', title='', c=None):
    """Fit a UMAP embedding of ``data`` and scatter-plot it.

    Args:
        data: 2D array-like passed to ``umap.UMAP.fit_transform``.
        n_neighbors, min_dist, n_components, metric: forwarded to ``umap.UMAP``.
        title: figure title.
        c: optional colours passed to ``scatter`` (one value or colour per row).
            When omitted, ``data`` itself is used as the colour argument only if
            it has 1, 3 or 4 columns (the shapes matplotlib can map to one colour
            per point); otherwise matplotlib's default colour is used instead of
            failing on an incompatible colour array.

    Returns:
        None. The figure is created through pyplot; a 1-, 2- or 3-component
        embedding is drawn, any other ``n_components`` leaves the axes empty.
    """
    fit = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        metric=metric
    )
    u = fit.fit_transform(data)

    if c is None:
        # Keep the previous default (colour by `data`) where it can work.
        asArray = np.asarray(data)
        if asArray.ndim == 2 and asArray.shape[1] in (1, 3, 4):
            c = asArray

    fig = plt.figure()
    if n_components == 1:
        # One component: plot the embedding value against the row position.
        ax = fig.add_subplot(111)
        ax.scatter(u[:,0], range(len(u)), c=c)
    elif n_components == 2:
        ax = fig.add_subplot(111)
        ax.scatter(u[:,0], u[:,1], c=c)
    elif n_components == 3:
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(u[:,0], u[:,1], u[:,2], c=c, s=100)
    plt.title(title, fontsize=18)

In [None]:
# dataNames = ['RelElev','GMorph','FloodOrd','Slope','d8fllowAcc','HAND','proximity','Labels','Aoi_Id']

# Feature matrix used for the embedding.
data = np.array(reader[['RelElev','GMorph','Slope','HAND','proximity']])
contains_nan = np.isnan(data).any()
print('data contains_nan? :', contains_nan)

labels = np.array(reader['Labels'])

# The labels are passed as `y`, so the embedding is fitted with target information.
# NOTE(review): no random_state is set, so the embedding may differ between runs.
embedding = umap.UMAP(n_neighbors = 15,
                     metric='euclidean',
                     min_dist=0.99,
                     n_components=2,
                     ).fit_transform(data,y=labels)

In [None]:
# Gather the two embedding coordinates plus label and AOI id into one table.
# np.empty allocates a float array, so 'Labels' and 'Aoi_Id' are stored as floats.
dataset = np.empty((embedding.shape[0],4))
dataset[:,:2]= embedding
dataset[:,2]= np.array(reader['Labels'])
dataset[:,3]= np.array(reader['Aoi_Id'])
dataFrame = pd.DataFrame(dataset, columns=['x','y','Labels','Aoi_Id'])

In [None]:

# Positive samples carry Labels == 1 (same convention as the class-balance cell);
# the previous `== 0` filter selected the negatives under the name `positives`.
positives = dataFrame[dataFrame['Labels'] == 1]

print(positives.shape)

In [None]:

# AOI ids present in the data.
classes = reader['Aoi_Id']
classesUnique = np.unique(classes)
print(classesUnique)

# Scatter of the embedding, coloured by AOI id.
fig, ax = plt.subplots(1, figsize=(10, 8))
plt.scatter(dataFrame['x'],dataFrame['y'], s=0.1, c=dataFrame['Aoi_Id'], cmap='Spectral')
plt.setp(ax, xticks=[], yticks=[])
# Colour bar with one bin per integer value from 0 to 39.
cbar = plt.colorbar(boundaries=np.arange(41)-0.5)
cbar.set_ticks(np.arange(40))
# cbar.set_ticklabels(classesUnique)
# NOTE(review): the parameters and zone list in the title are hard-coded text — keep them in sync with the embedding cell.
plt.title('Testing UMAP in flood modeling \n n_neighbors=15, metric=euclidean, min_dist=0.99 \n Class5 \n zones = [ 1  2  3  4  5  6  7 10 13 22 23 24 25 26 28 29 30]')# n_neighbors=5, metric=euclidean, min_dist=0.2

In [None]:
fig = plt.figure()
if 'z' in positives.columns:
    # Three embedding coordinates available: 3D scatter coloured by AOI id.
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(positives['x'], positives['y'], positives['z'], c=positives['Aoi_Id'], s=1)
else:
    # The embedding table built above only has 'x' and 'y' (n_components=2),
    # so fall back to a 2D scatter instead of raising KeyError on 'z'.
    ax = fig.add_subplot(111)
    ax.scatter(positives['x'], positives['y'], c=positives['Aoi_Id'], s=1)

In [None]:
### Import DataSet 
readSetPath = 'datasets/datasets4MLP/'
importName = 'MLP_basin5_Test.csv'
saveDatasetPath = 'datasets/datasets4MLP_Binary/'
basinDataSet = pd.read_csv((readSetPath+importName), index_col = None)

# presumably recodes the 'percentage' column into two classes — verify in myServices
basinDataSet = ms.makeBinary(basinDataSet,'percentage',0,1)

exportName = 'MLPBinary_basin5_Test.csv'
basinDataSet.to_csv((saveDatasetPath+exportName), index=None)

# NOTE(review): only the last expression of a cell is displayed, so this describe() result is not shown.
basinDataSet.describe()
basinDataSet.head()

In [None]:
colNames = ['percentage','DLSOL5R200', 'DLSOL4R150', 'DLSOL5R150']
# Assign the filled columns back to the frame. Calling fillna(inplace=True) on a
# column selection acts on an intermediate object and is not guaranteed to
# update basinDataSet itself.
basinDataSet[colNames] = basinDataSet[colNames].fillna(0)

In [None]:
## Transform a column datatype
repalcer  = basinDataSet['percentage'].to_numpy().astype('int32')
print(repalcer[0:10],repalcer.dtype)
# Plain column assignment replaces the column together with its dtype, whereas
# `.loc[:, col] = values` writes into the existing column and may keep the old dtype.
basinDataSet['percentage'] = repalcer

In [None]:
### Make binary Dataset ###
# keep class_0 and replace with 1 all other classes. 
# NOTE(review): makeBinary is also applied in the import cell — confirm that applying it twice is harmless.
basinDataSet = ms.makeBinary(basinDataSet,'percentage',0,1)


In [None]:
# Summary statistics of the current basinDataSet.
basinDataSet.describe()

In [None]:
# First rows of the current basinDataSet.
basinDataSet.head()

In [None]:
# Per column: does it contain at least one missing value?
basinDataSet.isna().any()

In [None]:
# NOTE(review): the import cell loads 'MLP_basin5_Test.csv', but the export name here refers to basin3 training — confirm.
exportName = 'MLPBinary_basin3_Training.csv'
basinDataSet.to_csv((saveDatasetPath+exportName), index=None)

In [None]:
### Replacing QGIS NoData value(-9999) with 0 
repalcer  = basinDataSet['FAProx_01'].to_numpy()
# Vectorised replacement: one NumPy pass instead of a Python-level loop over every row.
basinDataSet['FAProx_01'] = np.where(repalcer == -9999, 0, repalcer)
                                                                                                                          

In [None]:
# Drop the rows whose 'slope' is missing (modifies basinDataSet in place).
basinDataSet.dropna(subset=['slope'],inplace=True)

In [None]:
# Remove the 'fid' column in place; re-running this cell raises KeyError once the column is gone.
basinDataSet.drop(['fid'], axis =1, inplace=True)

In [None]:
# Number of missing values per column.
basinDataSet.isna().sum()

In [None]:
#### Normalize Flow Accumulation
# Min-max scaling of 'FAcc'. NOTE(review): the denominator is zero when the column is constant.
basinDataSet['FAcc'] = (basinDataSet['FAcc']- basinDataSet['FAcc'].min())/(basinDataSet['FAcc'].max()-basinDataSet['FAcc'].min())


In [None]:
# Scratch cell: wraps the first five rows of DS in a dict and prints it.
# NOTE(review): DS is only assigned further down the notebook, so this fails on a fresh kernel.
ds = DS.head(5)
s = {}
s['Datas'] = ds
print(s)

## Proportional Splitting

In [None]:
## Stratified Split
from sklearn.model_selection import StratifiedShuffleSplit

# presumably returns the feature table and the 'percentage' target — verify in myServices
X,Y = ms.importDataSet('datasets/basin1_FirstFeatureSet_Clean.csv', 'percentage')
# A single stratified split with 20 % of the rows held out and a fixed seed.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=50)
# n_splits=1, so the loop body runs exactly once.
for train_index, test_index in sss.split(X, Y):
    print("TRAIN:", train_index.size, "TEST:", test_index.size)
    X_train = X.iloc[train_index]
    y_train = Y.iloc[train_index]
    X_test = X.iloc[test_index]
    y_test = Y.iloc[test_index]

In [None]:
## Describing training set
# Sizes of the features and the target, then the number of samples per class.
print(len(X_train['elevation']), len(y_train) )
trainCount = Counter(y_train)
print(trainCount)

In [None]:
#####    Creating training set     #####
# Attach the target as a 'percentage' column of the training features.
X_train.loc[:,'percentage'] = y_train
X_train.head()


In [None]:
## Removing coordinates from training set
# In-place drop: re-running this cell raises KeyError once the columns are gone.
X_train.drop(['x_coord','y_coord'], axis =1, inplace=True)
X_train.head()

In [None]:
# Write X_train to CSV without the index column.
X_train.to_csv('datasets/basin1_FirstFeatureSet_Clean_Training.csv', index=None)

In [None]:
#####. Creating Test set
print(X_test.head())
# Attach the target as a 'percentage' column of the test features.
X_test.loc[:,'percentage'] = y_test
print(X_test.head())
print(X_test.info())
# Number of test samples per class.
testCount = Counter(X_test['percentage'])
print(f"testCount:  {testCount}")


In [None]:
# Write X_test to CSV without the index column.
X_test.to_csv('datasets/basin1_FirstFeatureSet_Clean_Test.csv', index=None)

In [None]:
## These proportions are the reason why a sample_weight of 0.01 for the majority class gives the best results for regression
# Only the classes 0, 1 and 5 are counted in the totals.
totalTrain = sum([trainCount[0], trainCount[1], trainCount[5]]) 
totalValidation = sum([testCount[0], testCount[1], testCount[5]])
print(f"total Train samples: {totalTrain},  total Validation samples: {totalValidation}")
print("Summary of traning and test dataset class balance")
# Fraction of each class within the training and the test set.
print(f"Training Set:", '\n', "Class 0: %.3f" %(trainCount[0]/totalTrain), " Class 1: %.4f" %(trainCount[1]/totalTrain), "Class 5: %.4f"%(trainCount[5]/totalTrain))
print("Testing Set:", '\n', "Class 0: %.3f" %(testCount[0]/totalValidation)," Class 1: %.4f" %(testCount[1]/totalValidation),  "Class 5: %.4f"%(testCount[5]/totalValidation))



In [None]:
# presumably loads a previously saved model from the pickle file — verify in myServices
model = ms.loadModel('./outputs/2022-08-05/00-35-58/2208050035.pkl')
# NOTE(review): csvName is not defined in any visible cell — confirm where it comes from.
dataSetToSave = ms.makePredictionToImportAsSHP(csvName, model, X, Y, 'percentage')
print(dataSetToSave.head())

## Redistribute dataset for individual basin modeling into Regional Modeling

In [6]:
# Read the full dataset. Its last column ('Aoi_Id') holds the basin ID; a separate CSV relates basin IDs to basin names.
datsetPath = r'C:\Users\abfernan\CrossCanFloodMapping\FloodMappingProjData\HRDTMByAOI\A_DatasetsForMLP\StratifiedSampling_RasterizeCombined\class5_Full.csv'
fullDataset = pd.read_csv(datsetPath, index_col=None)
print(fullDataset.info())
uniques = np.unique(np.array(fullDataset['Aoi_Id']))
print(uniques)

basinIDNamesListPath = r'C:\Users\abfernan\CrossCanFloodMapping\FloodMappingProjData\HRDTMByAOI\A_DatasetsForMLP\StratifiedSampling_RasterizeCombined\Aoi_ID_Name_List.csv'
# Rows are indexed as [basin id, basin name] below.
basinIDNamesList = ms.createListFromCSV(basinIDNamesListPath)

outFolder = r'C:\Users\abfernan\CrossCanFloodMapping\FloodMappingProjData\HRDTMByAOI\A_DatasetsForMLP\RMA_IndividualBasins\RasterCombined\Class_5'
outDatasetList=[]
outDatasetList_path = os.path.join(outFolder,'DatasetList_RasterMode_Class_5.csv')

# Columns that are rescaled.
colList = ['RelElev','GMorph', 'FloodOrd','Slope','d8fllowAcc','HAND','proximity']

# `basinId` instead of `id`, so the builtin id() is not shadowed.
for basinRow in basinIDNamesList:
    basinId = basinRow[0]
    name = basinRow[1]
    if basinId in uniques:
        ## Make out dir for new pair train-validation
        outPath = os.path.join(outFolder,name)
        ms.ensureDirectory(outPath)

        ## Extract training set (rows of this basin) and rescale to Min-Max
        trainingSet = fullDataset[fullDataset['Aoi_Id']== basinId]
        trainingOutputScaled = ms.datasetMinMaxScaler_ByListOfColName_inputDataframe(trainingSet,trainingSet,colList)
        trainingOutput = os.path.join(outPath,name+'_Scaled_train.csv')
        print(trainingOutputScaled.describe())

        ## Extract validation set (rows of every other basin) and rescale to Min-Max
        # presumably scaled with the statistics of trainingSet (first argument) — verify in myServices
        validationSet = fullDataset[fullDataset['Aoi_Id']!= basinId]
        ValidationOutputScaled = ms.datasetMinMaxScaler_ByListOfColName_inputDataframe(trainingSet,validationSet,colList)
        validationOutput = os.path.join(outPath,name+'_Scaled_valid.csv')
        print(ValidationOutputScaled.describe())
        # NOTE(review): the recorded output shows 'FloodOrd' with count 0 for some basins —
        # presumably a constant column in the training basin; confirm how the scaler handles it.

        ## Save both files and add the pair to the list
        trainingOutputScaled.to_csv(trainingOutput,index=None)
        ValidationOutputScaled.to_csv(validationOutput,index=None)
        outDatasetList.append(trainingOutput+','+validationOutput)

ms.createCSVFromList(outDatasetList_path,outDatasetList)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160034 entries, 0 to 160033
Data columns (total 12 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   x_coord     160034 non-null  float64
 1   y_coord     160034 non-null  float64
 2   Cilp        160034 non-null  float64
 3   RelElev     160034 non-null  float64
 4   GMorph      160034 non-null  float64
 5   FloodOrd    160034 non-null  float64
 6   Slope       160034 non-null  float64
 7   d8fllowAcc  160034 non-null  float64
 8   HAND        160034 non-null  float64
 9   proximity   160034 non-null  float64
 10  Labels      160034 non-null  int64  
 11  Aoi_Id      160034 non-null  int64  
dtypes: float64(10), int64(2)
memory usage: 14.7 MB
None
[ 1  2  3  4  5  6  7 10 13 22 23 24 25 26 28 29 30]
Confirmed directory at: C:\Users\abfernan\CrossCanFloodMapping\FloodMappingProjData\HRDTMByAOI\A_DatasetsForMLP\RMA_IndividualBasins\RasterCombined\Class_5\AL_Lethbridge_FullBasin_Cilp_FullDa

  diff_b_a = subtract(b, a)


Confirmed directory at: C:\Users\abfernan\CrossCanFloodMapping\FloodMappingProjData\HRDTMByAOI\A_DatasetsForMLP\RMA_IndividualBasins\RasterCombined\Class_5\BC_Kootenay_Creston_EffectiveBasin_Cilp_FullDataset_Clean_RelElev 
            x_coord        y_coord          Cilp       RelElev        GMorph  \
count  1.023600e+04   10236.000000  10236.000000  10236.000000  10236.000000   
mean  -1.541335e+06  272879.046503    592.438453      0.084899      0.557457   
std    2.258308e+03    2008.075657    109.075831      0.143710      0.304858   
min   -1.545368e+06  267800.000000    528.000000      0.000000      0.000000   
25%   -1.543272e+06  271496.000000    538.000000      0.013175      0.375000   
50%   -1.541688e+06  272952.000000    542.000000      0.018445      0.500000   
75%   -1.539396e+06  274104.000000    618.000000      0.118577      0.875000   
max   -1.536056e+06  277640.000000   1287.000000      1.000000      1.000000   

       FloodOrd         Slope    d8fllowAcc          HAN

  diff_b_a = subtract(b, a)


Confirmed directory at: C:\Users\abfernan\CrossCanFloodMapping\FloodMappingProjData\HRDTMByAOI\A_DatasetsForMLP\RMA_IndividualBasins\RasterCombined\Class_5\BC_Quesnel_FullBasin_Cilp_FullDataset_Clean_RelElev 
            x_coord        y_coord         Cilp      RelElev       GMorph  \
count  6.002000e+03    6002.000000  6002.000000  6002.000000  6002.000000   
mean  -1.767735e+06  826071.776075   568.151450     0.246743     0.671505   
std    3.304034e+03    2008.855033   124.714853     0.301244     0.286752   
min   -1.776648e+06  820456.000000   466.000000     0.000000     0.000000   
25%   -1.769096e+06  824600.000000   471.000000     0.012077     0.500000   
50%   -1.766664e+06  826272.000000   479.000000     0.031401     0.625000   
75%   -1.765720e+06  827512.000000   673.750000     0.501812     0.875000   
max   -1.761848e+06  830600.000000   880.000000     1.000000     1.000000   

       FloodOrd        Slope    d8fllowAcc         HAND    proximity  \
count       0.0  6002.000

  diff_b_a = subtract(b, a)


Confirmed directory at: C:\Users\abfernan\CrossCanFloodMapping\FloodMappingProjData\HRDTMByAOI\A_DatasetsForMLP\RMA_IndividualBasins\RasterCombined\Class_5\BC_Salmo_FullBasin_Cilp_FullDataset_Clean_RelElev 
            x_coord        y_coord          Cilp       RelElev        GMorph  \
count  1.144600e+04   11446.000000  11446.000000  11446.000000  11446.000000   
mean  -1.589809e+06  298853.632011    887.376551      0.232720      0.629292   
std    3.479466e+03    5167.121192    321.556031      0.272736      0.251203   
min   -1.599080e+06  288776.000000    613.000000      0.000000      0.000000   
25%   -1.592344e+06  294232.000000    646.000000      0.027990      0.500000   
50%   -1.590120e+06  298840.000000    694.000000      0.068702      0.625000   
75%   -1.587448e+06  302936.000000   1146.000000      0.452078      0.875000   
max   -1.580984e+06  310312.000000   1792.000000      1.000000      1.000000   

       FloodOrd         Slope    d8fllowAcc          HAND     proximity 

  diff_b_a = subtract(b, a)


Confirmed directory at: C:\Users\abfernan\CrossCanFloodMapping\FloodMappingProjData\HRDTMByAOI\A_DatasetsForMLP\RMA_IndividualBasins\RasterCombined\Class_5\BC_SlocanRiver_FullBasin_Cilp_FullDataset_Clean_RelElev 
            x_coord        y_coord         Cilp      RelElev       GMorph  \
count  8.192000e+03    8192.000000  8192.000000  8192.000000  8192.000000   
mean  -1.591618e+06  353551.324219   865.928467     0.234026     0.644287   
std    8.637059e+03    8409.590716   479.653504     0.270532     0.244003   
min   -1.609512e+06  332120.000000   451.000000     0.000000     0.000000   
25%   -1.599320e+06  348408.000000   515.000000     0.036097     0.500000   
50%   -1.591232e+06  354144.000000   544.000000     0.052453     0.625000   
75%   -1.584856e+06  360168.000000  1232.000000     0.440496     0.875000   
max   -1.570664e+06  370328.000000  2224.000000     1.000000     1.000000   

          FloodOrd        Slope    d8fllowAcc         HAND    proximity  \
count  8192.000000

  diff_b_a = subtract(b, a)


Confirmed directory at: C:\Users\abfernan\CrossCanFloodMapping\FloodMappingProjData\HRDTMByAOI\A_DatasetsForMLP\RMA_IndividualBasins\RasterCombined\Class_5\NS_MargareeRiver_EffectiveBasin_Cilp_FullDataset_Clean_RelElev 
            x_coord        y_coord         Cilp      RelElev       GMorph  \
count  9.762000e+03    9762.000000  9762.000000  9762.000000  9762.000000   
mean   2.519929e+06  390787.069863   122.587789     0.247130     0.636153   
std    3.872776e+03    9483.250488   124.766611     0.253591     0.267637   
min    2.510552e+06  368680.000000     1.000000     0.000000     0.000000   
25%    2.516568e+06  384140.000000    25.000000     0.048780     0.500000   
50%    2.520488e+06  391128.000000    66.000000     0.132114     0.625000   
75%    2.522504e+06  396968.000000   208.000000     0.420732     0.875000   
max    2.529512e+06  409928.000000   493.000000     1.000000     1.000000   

       FloodOrd        Slope    d8fllowAcc         HAND    proximity  \
count       0.

  diff_b_a = subtract(b, a)


Confirmed directory at: C:\Users\abfernan\CrossCanFloodMapping\FloodMappingProjData\HRDTMByAOI\A_DatasetsForMLP\RMA_IndividualBasins\RasterCombined\Class_5\QC_Abitibi_LaSarre_Basin_Cilp_FullDataset_Clean_RelElev 
            x_coord        y_coord        Cilp     RelElev      GMorph  \
count  1.720000e+02     172.000000  172.000000  172.000000  172.000000   
mean   1.148970e+06  121975.627907  268.203488    0.550872    0.546512   
std    3.151740e+02     332.841534    1.144215    0.286054    0.257634   
min    1.148456e+06  121240.000000  266.000000    0.000000    0.000000   
25%    1.148692e+06  121688.000000  268.000000    0.500000    0.333333   
50%    1.148928e+06  122016.000000  268.000000    0.500000    0.500000   
75%    1.149212e+06  122264.000000  269.000000    0.750000    0.666667   
max    1.149624e+06  122552.000000  270.000000    1.000000    1.000000   

         FloodOrd       Slope  d8fllowAcc        HAND   proximity     Labels  \
count  172.000000  172.000000  172.00000

  diff_b_a = subtract(b, a)


Confirmed directory at: C:\Users\abfernan\CrossCanFloodMapping\FloodMappingProjData\HRDTMByAOI\A_DatasetsForMLP\RMA_IndividualBasins\RasterCombined\Class_5\QC_Levis_EffectiveBasin_Cilp_FullDataset_Clean_RelElev 
            x_coord       y_coord        Cilp     RelElev      GMorph  \
count  4.340000e+02    434.000000  434.000000  434.000000  434.000000   
mean   1.789722e+06  85368.331797   44.921659    0.393263    0.688652   
std    8.090559e+02    542.532357    8.027570    0.113064    0.258408   
min    1.787944e+06  83768.000000   17.000000    0.000000    0.000000   
25%    1.789100e+06  85004.000000   40.000000    0.323944    0.500000   
50%    1.789848e+06  85512.000000   42.000000    0.352113    0.625000   
75%    1.790344e+06  85800.000000   49.000000    0.450704    0.875000   
max    1.791320e+06  86568.000000   88.000000    1.000000    1.000000   

         FloodOrd       Slope  d8fllowAcc          HAND   proximity  \
count  434.000000  434.000000  434.000000  4.340000e+02  43

  diff_b_a = subtract(b, a)


Confirmed directory at: C:\Users\abfernan\CrossCanFloodMapping\FloodMappingProjData\HRDTMByAOI\A_DatasetsForMLP\RMA_IndividualBasins\RasterCombined\Class_5\QC_Quebec_FullBasin_Cilp_FullDataset_Clean_RelElev 
            x_coord        y_coord          Cilp       RelElev        GMorph  \
count  1.688400e+04   16884.000000  16884.000000  16884.000000  16884.000000   
mean   1.778131e+06   93600.517413     98.015695      0.394689      0.599006   
std    3.569473e+03    5038.027947     61.647736      0.276447      0.214867   
min    1.769544e+06   81912.000000     10.000000      0.000000      0.000000   
25%    1.775320e+06   89800.000000     30.000000      0.089686      0.500000   
50%    1.778776e+06   94408.000000    111.000000      0.452915      0.500000   
75%    1.781384e+06   97784.000000    152.000000      0.636771      0.875000   
max    1.784856e+06  102312.000000    233.000000      1.000000      1.000000   

           FloodOrd         Slope    d8fllowAcc          HAND     proxi

  diff_b_a = subtract(b, a)


True

## Combining dataSets to build AllVsOne_training and OneVsAll_test

In [None]:
# Concat datasets: for each basin, build a training file from all the other basins.
sourceFileForDatasets = 'datasets/'
allDataSetsNames = ['basin1Light_Clean.csv', 'basin2_CleanDataSet.csv', 'basin3_CleanDataSet_copy.csv','basin4_CleanDataSet.csv','basin5_CleanDataSet.csv']
# OneVsAllDataSetName = 'basin1Light_Clean.csv'

for datasetForTest in allDataSetsNames:
    filename, file_extension = os.path.splitext(datasetForTest)
    # Every dataset except the one held out for testing.
    newListOfNames = [s for s in allDataSetsNames if s != datasetForTest]
    allDataSetsFileName = 'allVs_'+ filename +'_Training'
    # Read each file once and concatenate in a single call, instead of growing
    # a DataFrame with one pd.concat per file.
    DFToConcatAll = pd.concat(
        [pd.read_csv((sourceFileForDatasets+datasets), index_col = None) for datasets in newListOfNames]
    )
    nameToSafe = sourceFileForDatasets+allDataSetsFileName+file_extension
    DFToConcatAll = DFToConcatAll.drop(['x_coord','y_coord'], axis =1)
    DFToConcatAll.to_csv(nameToSafe, index=None)
    



## Preparing datasets for MLP

In [None]:
# List the entries of the RF datasets folder.
destiationPath = 'datasets/RFdatasets'
listFile = os.listdir(destiationPath)
print(listFile)

In [None]:
# File names of the per-basin test and training sets.
testList = ['basin1Light_Clean_Test.csv','basin2_Test.csv','basin3_Test.csv','basin4_Test.csv','basin5_Test.csv']
traininList = ['basin1Light_Clean_Training.csv','basin2_Training.csv','basin3_Training.csv','basin4_Training.csv','basin5_Training.csv']

In [None]:
## Cleaning datasets: Removing not useful variables from All_VS_ONE 
readPath = 'datasets/RFdatasets/'
# NOTE(review): an earlier cell reads from 'datasets/datasets4MLP/' (extra 's') — confirm which folder name is correct.
destiationPath = 'datasets/dataset4MLP/'
datasetNamelist = ['basin1Light_Clean_Test.csv','basin2_Test.csv','basin3_Test.csv','basin4_Test.csv','basin5_Test.csv',
                  'basin1Light_Clean_Training.csv','basin2_Training.csv','basin3_Training.csv','basin4_Training.csv',
                   'basin5_Training.csv']
# Columns removed from every dataset.
featuresToDelete = ['TPI','TWI']
for i in datasetNamelist:
    path = readPath + i 
    basinDataSet = pd.read_csv(path, index_col = None)
    basinDataSet.drop(featuresToDelete, axis=1, inplace=True)
    # Saved under the same file name with an 'MLP_' prefix.
    savePath = destiationPath + 'MLP_'+ i
    basinDataSet.to_csv(savePath, index=None)


In [None]:
###. Build dataset subset for MLP test (Only first 150K samples)
readPath = 'datasets/dataset4MLP/'
datasetNamelist = ['MLP_allVs_basin1Light_Clean_Training.csv','MLP_basin1Light_Clean_VsAll_Test.csv']
for i in datasetNamelist:
    path = readPath + i 
    basinDataSet = pd.read_csv(path, index_col = None)
    Y = np.array(basinDataSet['percentage'])
    # The first return value is used as the upper row label below —
    # presumably the total number of samples; verify in models.
    count,_ = md.listClassCountPercent(Y)
    # Drop the rows labelled 150000..count (.loc slices are label-based and inclusive).
    basinDataSet.drop(basinDataSet.loc[150000:count].index,axis=0,inplace=True)
    # Saved next to the source file with a 'reduced_' prefix.
    savePath = readPath + 'reduced_'+i
    basinDataSet.to_csv(savePath, index=None)

In [None]:
### Exploring datasets
# NOTE(review): `dataset` is also the name of the embedding array built in the UMAP section; this rebinds it to a list.
dataset = ['reduced_MLP_allVs_basin1Light_Clean_Training.csv','reduced_MLP_basin1Light_Clean_VsAll_Test.csv']
for i in dataset:
    # readPath comes from the previous cell.
    path = readPath + i 
    print(path)
    basinDataSet = pd.read_csv(path, index_col = None)
    print(basinDataSet.head())
    Y = np.array(basinDataSet['percentage'])
    print(md.listClassCountPercent(Y))

In [None]:
####.  Training TEST
readPath = 'datasets/dataset4MLP/'
trainingPath = readPath + 'MLP_allVs_basin1Light_Clean_Training.csv'
# Settings handed to the project's MLP wrapper (presumably forwarded to the underlying classifier — verify in models).
params = {'random_state':50, 'hidden_layer_sizes': 2,
                'early_stopping':True,'max_iter':200,'verbose':False,
                'tol':0.00010,'validation_fraction':0.1,'warm_start':False}
mlpc = md.implementingMLPCalssifier(trainingPath,'percentage',params)
mlpc.fitMLPClassifier()
mlpc.plotLossBehaviour()

# Keep the classifier object for the validation code (commented out below).
mlpClassifier = mlpc.getMLPClassifier()

# #Validating un unseen datase
# validation = readPath + 'MLP_basin1Light_Clean_VsAll_Test.csv'
# x_val,y_val = ms.importDataSet(validation, 'percentage')
# prediction = ms.makePredictionToImportAsSHP(mlpClassifier, x_val, y_val, 'percentage')

# ## Compute metrics
# X = x_val.copy()
# X.drop(['x_coord','y_coord'], axis=1, inplace=True)
# metrics = md.computeClassificationMetrics(mlpClassifier,X,y_val)

In [None]:
# Print the wrapper's log dictionary before and after logging a test entry.
print(mlpc.get_logsDic())
mlpc.logMLPClassifier({'test':34})
print(mlpc.get_logsDic())

In [None]:
# NOTE(review): `prediction` is only assigned in commented-out code in the training cell, so this raises NameError on a fresh kernel.
prediction.to_csv(('outputs/'+ 'MLP_basin1Light_firstResult_HL280.csv'),index=None)

In [None]:
### Implement best hiddenLayerSize exploration
readPath = 'datasets/dataset4MLP/'
trainingPath = readPath + 'MLP_allVs_basin1Light_Clean_Training.csv'
validationSet = readPath + 'MLP_basin1Light_Clean_VsAll_Test.csv'
# dataset = pd.read_csv(trainingPath, index_col = None)
params = {'random_state':50, 'hidden_layer_sizes': 2,
                'early_stopping':False,'max_iter':2,'verbose':True,
                'tol':0.00010,'validation_fraction':0.1,'warm_start':False}
mlpc = md.implementingMLPCalssifier(trainingPath,'percentage',params)

x_val,Y_val = ms.importDataSet(validationSet, 'percentage')
# Work on a copy so x_val keeps its coordinate columns.
X = x_val.copy()
X.drop(['x_coord','y_coord'], axis=1, inplace=True)
# Candidate values 100, 200, ..., 1000.
firstInterval = np.arange(100,1009,100)
# NOTE(review): the meaning of the last two arguments ('5', 3) is not visible here — see models.explore4BestHLSize.
mlpc.explore4BestHLSize(X,Y_val,firstInterval,'5',3)

## Controlled sampling

In [None]:
# NOTE(review): the file name contains a space before '_Training' — confirm it matches the file on disk.
DS = pd.read_csv('datasets/basin2 _Training.csv', index_col = None)
print(DS.head())


In [None]:
# Column names of DS.
print(DS.columns)

In [None]:
# Box plot of the 'FAProx_01' column.
plt.boxplot(DS['FAProx_01']) # , , DS['elevation'], DS['disToRiv']]

In [None]:
## Resampling appliying class selection by rule:

# RULE1: Select point at a distance to river less than 300m. 

# # newDS = pseudoClassCreation(DS, "distanceToRiver", 300, 2)
def pseudoClassCreation(dataset, conditionVariable, threshold, pseudoClass, targetClassName):
    '''
    Relabel with <pseudoClass> every row whose <conditionVariable> is >= <threshold>.

    Rows that do not meet the condition keep their current <targetClassName> value.
    The input dataset is not modified. The class counts of the result are printed.

    Return:
      copy of the dataset with the new class grouping.
    '''
    currentLabels = np.array(dataset[targetClassName]).ravel()
    conditionValues = np.array(dataset[conditionVariable]).ravel()

    relabelled = []
    for conditionValue, currentLabel in zip(conditionValues, currentLabels):
        relabelled.append(pseudoClass if conditionValue >= threshold else currentLabel)

    result = dataset.copy()
    result[targetClassName] = relabelled
    print(Counter(result[targetClassName]))
    return result

def revertPseudoClassCreation(dataset, originalClass, pseudoClass, targetClassName):
    '''
    Put <originalClass> back in every row where <targetClassName> equals <pseudoClass>.

    Rows holding any other label are left as they are. The input dataset is
    not modified: the change is applied to a copy. The class counts obtained
    after the replacement are printed.
    Return:
      copy of dataset with the pseudo class replaced by <originalClass>.
    '''
    restored = dataset.copy()
    currentLabels = np.array(dataset[targetClassName]).ravel()
    restored[targetClassName] = [
        originalClass if label == pseudoClass else label
        for label in currentLabels
    ]
    print(Counter(restored[targetClassName]))
    return restored


# Class balance before relabelling.
# NOTE(review): X_train is not created in this cell; it has to come from an earlier cell.
print(Counter(X_train['percentage']))

# Rows with disToRiv >= 200 are relabelled as pseudo-class 2.
newDS = pseudoClassCreation(X_train, 'disToRiv', 200, 2, 'percentage')

# Detach the relabelled target from the feature table (pop removes the column in place).
y = newDS.pop('percentage')
x_res, y_res = ms.randomUndersampling(newDS, y)

# Re-attach the resampled labels next to the resampled features.
x_res['percentage'] = y_res
# newDatase = revertPseudoClassCreation(x_res, 0, 2, 'percentage')


In [None]:
# Write the resampled table to CSV in the current working directory.
# NOTE(review): index = None presumably behaves like index=False (no index column written) — confirm.
x_res.to_csv('basin1ControlClass0Sampling4Class_ToSHP.csv',index = None)

# Data description and visualization

In [None]:
#### Import the dataset to describe, without its coordinate columns
DS = (
    pd.read_csv('datasets/RFDatasets/basin5_CleanDataSet.csv', index_col=None)
    .drop(columns=['x_coord', 'y_coord'])
)
DS.head()

In [None]:
#### Correlation matrix
# DataFrame.corr() returns pairwise correlation coefficients (not covariances);
# they are rounded to two decimals so the per-cell annotations stay readable.
fig, ax = plt.subplots(figsize=(15, 10))
matrix = DS.corr().round(2)
# Draw explicitly on the axes created above. `linewidths` is heatmap's
# documented parameter for the gap between cells. Text scaling, if wanted,
# goes through sns.set(font_scale=...); Axes has no font_scale property, so
# the former `ax.font_scale = 9` assignment did nothing.
sns.heatmap(matrix, annot=True, linewidths=1, ax=ax)
ax.set_title('Feature correlation matrix')


In [None]:
# The coordinate columns are already removed in the cell that loads DS, so an
# unconditional second drop raises KeyError on a top-to-bottom run.
# errors='ignore' makes this cell safe whether or not the columns are present.
DS.drop(['x_coord','y_coord'], axis = 1, inplace=True, errors='ignore')
DS.head()

In [None]:
### Flow-accumulation features vs labels
targets = DS['percentage']
colList = ['FAProx_01','FAProx_025','FAcc']
fig, axs = plt.subplots(1, 3, figsize=(15, 5), sharey=True)
fig.text(-0.02, 0.5, 'Flood probability (%)', va='center', rotation='vertical')
fig.text(0.5, 1, 'Density Lines vs labels distribution', ha ='center')

# One panel per feature; the column name is used as the x-axis label.
for panel, column in zip(axs, colList):
    panel.scatter(DS[column], targets)
    panel.set(xlabel=column)

# NOTE(review): this font size is set after the figure's text was created, so
# it may only affect text created later (including later figures) — confirm
# the intended placement.
plt.rcParams['font.size'] = '20'
fig.tight_layout()


In [None]:

## Plot all features vs labels
targets = DS['percentage']
# targets = np.where(targets == 5,2,targets)

# (DS column, panel title) pairs, listed in the order the panels are filled:
# down the first grid column, then the second, then the third.
# NOTE(review): the soil columns are read from DS as 'LDSOL…' but titled
# 'DLSOL…' — confirm which spelling is intended.
featurePanels = [
    ('elevation', "Elevation"),
    ('slope', "Slope"),
    ('FAcc', "Flow accumulation"),
    ('TWI', "TWI"),
    ('TPI', 'TPI'),
    ('LDSOL4R150', "DLSOL4R150"),
    ('LDSOL5R150', "DLSOL5R150"),
    ('LDSOL5R200', "DLSOL5R200"),
    ('FAProx_01', 'FAProx_01'),
    ('FAProx_025', "FAProx_025"),
    ('visibility', "Visibility"),
]

fig, axs = plt.subplots(4,3, figsize=(13, 8), sharey=True)
fig.supylabel('Labels')
plt.rcParams['font.size'] = '15'
plt.yticks([0,1,5])

# order='F' walks the 4x3 grid column by column. zip stops after the eleven
# features, so the last panel (row 3, column 2) stays empty, as before.
for panel, (column, title) in zip(axs.flatten(order='F'), featurePanels):
    panel.scatter(DS[column], targets)
    panel.set_title(title)

fig.tight_layout()


In [None]:
print(DS.head())
# Boolean-mask selection: keep only the rows whose 'percentage' value is not 0.
dsArray = DS.loc[DS['percentage'] != 0]
print(dsArray.head())

In [None]:
### Pairplot ###
# Lower-triangle grid of pairwise scatter plots for every column of DS,
# coloured by the 'percentage' label, with KDE curves on the diagonal.

sns.set(font_scale=1.5)
sns.pairplot(
    DS,
    hue='percentage',
    diag_kind='kde',
    plot_kws={'alpha': 0.8, 's': 100},
    height=4,
    corner=True,
    palette="Set2",
)

In [None]:
### Pairplot ###
# NOTE(review): this cell issues the same pairplot call as the cell right
# before it — confirm whether the repetition is intentional.

sns.set(font_scale=1.5)
sns.pairplot(
    DS,
    hue='percentage',
    diag_kind='kde',
    plot_kws={'alpha': 0.8, 's': 100},
    height=4,
    corner=True,
    palette="Set2",
)

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier

# Training features / labels, with 'percentage' passed as the target name.
x_train, y_train = ms.importDataSet('basin1Train.csv', 'percentage')

# Entropy-criterion random forest with a fixed seed, wrapped in a
# one-vs-rest classifier and fitted on the training set.
estimator = RandomForestClassifier(criterion='entropy', random_state=50)
classifier = OneVsRestClassifier(estimator).fit(
    x_train,
    y_train,
)


In [None]:
# Load a previously saved classifier and the basin 1 binary test set.
# NOTE(review): ms.loadModel presumably unpickles the .pkl file — only load model files from a trusted source.
classifier = ms.loadModel('outputs/2022-11-01/10-51-40/2211011051.pkl')
x_test,y_test = ms.importDataSet('datasets/datasets4MLP_Binary/MLPBinary_basin1_Test.csv', 'percentage')

# Presumably strips the coordinate columns so only model features remain — confirm in myServices.
x_test = ms.removeCoordinatesFromDataSet(x_test)

# y_prob = classifier.predict_proba(x_test)
#print(np.unique(y_prob))

# Hand the model and the test data to the project's ROC / AUC plotting helper.
md.plot_ROC_AUC(classifier, x_test, y_test)

In [None]:
#### ROC_AUC binary for multiple results in the same figure

# Everything is drawn on `axs` explicitly. The former plt.figure(0).clf() call
# opened a second, empty figure and made it pyplot's current figure.
fig, axs = plt.subplots(1, figsize=(13,4))
plt.rcParams.update({'font.size': 14})
axs.set_ylabel('True Positive Rate', fontsize=16)
axs.set_xlabel('False Positive Rate', fontsize=16)
axs.set_title('MLP binary in Basin 5')

# Parallel lists: saved model, the test set it is scored on, and its legend name.
classifierList = ['outputs/2022-11-02/09-30-03/2211020930.pkl', 'outputs/2022-10-31/09-57-31/2210310957.pkl']
testSetList = ['datasets/datasets4MLP_Binary/MLPBinary_basin5_Test.csv', 'datasets/datasets4MLP_Binary/MLPBinary_basin5_VsAll_Test.csv']
nameList = ['Intra-Basins','one-vs-rest']
for modelPath, testSetPath, curveName in zip(classifierList, testSetList, nameList):
    classifier = ms.loadModel(modelPath)
    x_test, y_test = ms.importDataSet(testSetPath, 'percentage')
    x_test = ms.removeCoordinatesFromDataSet(x_test)
    # Column 1 of predict_proba is used as the positive-class score.
    y_prob = classifier.predict_proba(x_test)
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_prob[:,1], drop_intermediate=False)
    # The AUC in the legend must describe the curve that is plotted, so it is
    # computed from the same scores. Hard predict() labels would reduce the
    # ROC to a single operating point and report a different number.
    roc_auc = roc_auc_score(y_test, y_prob[:,1], average = "macro")
    axs.plot(fpr, tpr, label = str(curveName) + " AUC : " + format(roc_auc,".4f"))

# One legend call once every curve has been added.
axs.legend()

### Inference 

In [None]:
# Load a saved model and run inference on one regional dataset.
# NOTE(review): the absolute Windows paths below tie this cell to one machine — consider a configurable base directory.
modelsFolder = r'C:\Users\abfernan\CrossCanFloodMapping\FloodMappingProjData\HRDTMByAOI\MLP_Models'
modelName = '2401251538'
model = ms.loadModel(os.path.join(modelsFolder,modelName+'.pkl'))
datasetPath = r'C:\Users\abfernan\CrossCanFloodMapping\FloodMappingProjData\HRDTMByAOI\A_DatasetsForMLP\RegionalModelingApplication\AL_Lethbridge_RMA_ScaledWith_RastMode_Class1.csv'
# In this cell the frame is only used to print summary statistics; the inference helper is given the path.
inDataFrame = pd.read_csv(datasetPath, index_col=None)
print(inDataFrame.describe())
# Passed to the inference helper as colsToDrop (presumably excluded from the model inputs — confirm in models).
colsToDrop = ['x_coord','y_coord']
y_hat = md.inferenceMLP(datasetPath,model=model,colsToDrop=colsToDrop)
# print(metrics)


In [None]:
## Prepare output dataset for QGIS
# Re-read the source table so this cell does not rely on inDataFrame still
# being in memory.
inDataFrame = pd.read_csv(datasetPath, index_col=None)
# colsToDrop.append('Labels')

# Build a new frame holding the colsToDrop columns plus the prediction.
# assign() returns a fresh DataFrame, which avoids the SettingWithCopyWarning
# raised when a column is added to a slice of inDataFrame.
outDataFrame = inDataFrame[colsToDrop].assign(y_hat=y_hat)

# Output file name: <dataset name>_RMA_<model name><dataset extension>.
# The unused parent path is bound to a regular name: assigning to `_` in
# IPython shadows the automatic "last output" variable.
parentPath, name, ext = ms.get_parenPath_name_ext(datasetPath)

# NOTE(review): absolute, machine-specific output folder — consider making it configurable.
bestModelsInference = r'C:\Users\abfernan\CrossCanFloodMapping\FloodMappingProjData\HRDTMByAOI\A_DatasetsForMLP\RegionalModelingApplication\BestModelsApplication'
outPrediction = os.path.join(bestModelsInference,name+'_RMA_'+modelName+ext)

print(outDataFrame.describe())
outDataFrame.to_csv(outPrediction, index=False)

# Presumably converts the CSV just written into a point shapefile — confirm in myServices.
ms.buildShapefilePointFromCsvDataframe(outPrediction)

In [None]:
# NOTE(review): the preceding cell already ends with this same call on outPrediction — confirm whether this repeat is needed.
ms.buildShapefilePointFromCsvDataframe(outPrediction)