In [1]:
import os
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
import myServices as ms
import models as rfr
import numpy as np
import pandas as pd
import sklearn
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,roc_curve, auc, roc_auc_score, f1_score
import joblib

In [2]:
# to compute ececution time do:
# with timeit():
#     # your code, e.g., 
class timeit(): 
    from datetime import datetime
    def __enter__(self):
        self.tic = self.datetime.now()
    def __exit__(self, *args, **kwargs):
        print('runtime: {}'.format(self.datetime.now() - self.tic))

## Importing and manipulating datasets

In [None]:
sklearn.metrics.get_scorer_names()

In [29]:
DS = pd.read_csv('datasetBasin1.csv', index_col = None)
print(DS.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2493980 entries, 0 to 2493979
Data columns (total 9 columns):
 #   Column      Dtype  
---  ------      -----  
 0   percentage  int64  
 1   disToRiv    float64
 2   TWI         float64
 3   TPI         float64
 4   FAcc        float64
 5   slope       float64
 6   elevation   float64
 7   x_coord     int64  
 8   y_coord     int64  
dtypes: float64(6), int64(3)
memory usage: 171.2 MB
None


In [30]:
DS = DS.dropna()
print(DS.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2479538 entries, 5 to 2493974
Data columns (total 9 columns):
 #   Column      Dtype  
---  ------      -----  
 0   percentage  int64  
 1   disToRiv    float64
 2   TWI         float64
 3   TPI         float64
 4   FAcc        float64
 5   slope       float64
 6   elevation   float64
 7   x_coord     int64  
 8   y_coord     int64  
dtypes: float64(6), int64(3)
memory usage: 189.2 MB
None


In [32]:
DS.to_csv('datasetBasin1_NoDataFree.csv', index = None)

In [33]:
DS.describe()

Unnamed: 0,percentage,disToRiv,TWI,TPI,FAcc,slope,elevation,x_coord,y_coord
count,2479538.0,2479538.0,2479538.0,2479538.0,2479538.0,2479538.0,2479538.0,2479538.0,2479538.0
mean,0.09201795,208.4064,9.483515,-0.0003328848,43527.6,4.821367,145.3463,365082.8,5264239.0
std,0.6610626,171.385,4.081659,0.1949152,1031605.0,4.994698,45.69706,3169.308,1824.893
min,0.0,0.0,-9.25212,-7.46132,25.0,0.00047,-0.03685,358678.0,5260218.0
25%,0.0,73.82411,5.55319,-0.06226,88.92947,1.60076,115.7656,362313.0,5262723.0
50%,0.0,167.7051,10.25636,-0.00027,288.1975,3.192175,148.1632,365073.0,5264348.0
75%,0.0,301.0399,12.8933,0.06091,1265.309,6.192805,179.7259,367628.0,5265783.0
max,5.0,1349.713,24.86353,6.70697,62824500.0,70.02943,260.5691,372118.0,5267978.0


## balanced sampling

In [2]:
## Stratified Split
from sklearn.model_selection import StratifiedShuffleSplit

X,Y = ms.importDataSet('datasetBasin1_NoDataFree.csv', 'percentage')

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=50)
for train_index, test_index in sss.split(X, Y):
    print("TRAIN:", train_index.size, "TEST:", test_index)
    X_train = X.iloc[train_index]
    y_train = Y.iloc[train_index]
    X_test = X.iloc[test_index]
    y_test = Y.iloc[test_index]

TRAIN: 1983630 TEST: [1149772 1110309   98254 ... 2117964  401271  124321]


In [19]:
basin1TestBalanced = X_test
basin1TestBalanced['percentage'] = y_test
print(X_test.head())
basin1TestBalanced.to_csv('basin1TestForBanlanced.csv', index=None)

          disToRiv       TWI      TPI        FAcc     slope  elevation  \
1149772   65.76473  15.88676  0.10242  7934.72021   5.60282  108.43757   
1110309  579.82758   9.04799  0.01117  1758.07971   1.66141  165.68309   
98254     65.19202   1.79303  0.54303    64.37885  13.80903  142.59601   
242062   215.92822  13.59605 -0.12878   802.94952   1.87882  166.73584   
1093319  166.17009   3.08946  0.01888   275.23352   5.40569  170.24129   

         x_coord  y_coord  percentage  
1149772   364493  5264573           0  
1110309   369238  5264668           0  
98254     371563  5267053           0  
242062    362363  5266648           0  
1093319   360698  5264703           0  


In [3]:
print( y_train.value_counts())
print(y_test.value_counts())

0    1941324
5      35056
1       7250
Name: percentage, dtype: int64
0    485332
5      8764
1      1812
Name: percentage, dtype: int64


In [4]:
## This proportions are the reason why a sample_weight of 0.01 for the majority class give best results for regression

totalTrain = sum([1941324, 35056, 7250])
totalValidation = sum([485332,8764, 1812])
print("Balance summary of train dataset")
print(f"Class 1% : {1941324/totalTrain}, Class 1%: {7250/totalTrain}, Class 5%: {35056/totalTrain}")
print(f"Class 1% : {485332/totalValidation}, Class 1%: {1812/totalValidation}, Class 5%: {8764/totalValidation}")

Balance summary of train dataset
Class 1% : 0.978672433871236, Class 1%: 0.0036549154832302396, Class 5%: 0.017672650645533694
Class 1% : 0.9786734636263178, Class 1%: 0.003653903546625584, Class 5%: 0.017672632827056632


In [32]:
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler 

sm = RandomUnderSampler(random_state=50,sampling_strategy='auto')
X_res, y_res = sm.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_res))

Resampled dataset shape Counter({0: 7250, 1: 7250, 5: 7250})


In [33]:

X_train['percentage'] = y_train
print(X_res.head())
X_train.drop(['x_coord','y_coord'], axis=1, inplace = True)

X_train.to_csv('basin1TrainUnbalanced.csv', index=None)

    disToRiv       TWI      TPI         FAcc     slope  elevation  percentage
0  395.37958  16.60988 -0.15622  16352.31836   2.55812  128.22772           0
1  140.08926   4.47640  0.07680    195.21506   9.22754  108.65261           0
2  105.00000   4.83475  0.37084    146.69675  11.37751   65.99345           0
3   25.00000  10.97939 -0.00449  27846.66602   0.28085  186.80829           0
4   90.13878   5.02442 -0.19621    121.64829   9.13241  124.75165           0


KeyError: "['x_coord', 'y_coord'] not found in axis"

In [13]:
X_res['percentage'] = y_res
print(X_res.head())
X_res.drop(['x_coord','y_coord'], axis=1, inplace = True)

X_res.to_csv('basin1TrainingBanlanced.csv', index=None)

X,Y = ms.importDataSet('basin1TrainingBanlanced.csv', 'percentage')

    disToRiv       TWI      TPI         FAcc     slope  elevation  x_coord  \
0  395.37958  16.60988 -0.15622  16352.31836   2.55812  128.22772   367968   
1  140.08926   4.47640  0.07680    195.21506   9.22754  108.65261   367903   
2  105.00000   4.83475  0.37084    146.69675  11.37751   65.99345   367583   
3   25.00000  10.97939 -0.00449  27846.66602   0.28085  186.80829   360483   
4   90.13878   5.02442 -0.19621    121.64829   9.13241  124.75165   364448   

   y_coord  percentage  
0  5263198           0  
1  5263573           0  
2  5265203           0  
3  5263168           0  
4  5264078           0  


In [14]:
print(X_res.head())

    disToRiv       TWI      TPI         FAcc     slope  elevation  percentage
0  395.37958  16.60988 -0.15622  16352.31836   2.55812  128.22772           0
1  140.08926   4.47640  0.07680    195.21506   9.22754  108.65261           0
2  105.00000   4.83475  0.37084    146.69675  11.37751   65.99345           0
3   25.00000  10.97939 -0.00449  27846.66602   0.28085  186.80829           0
4   90.13878   5.02442 -0.19621    121.64829   9.13241  124.75165           0


In [23]:
X,Y = ms.importDataSet('basin1TestForBanlanced.csv', 'percentage')
print('Resampled dataset shape %s' % Counter(Y))

Resampled dataset shape Counter({0: 485332, 5: 8764, 1: 1812})


In [34]:
model = ms.loadModel('./outputs/2022-08-05/00-35-58/2208050035.pkl')
dataSetToSave = ms.makePredictionToImportAsSHP(csvName, model, X, Y, 'percentage')
print(dataSetToSave.head())

NameError: name 'csvName' is not defined

In [26]:
dataSetToSave.to_csv('basin1ClassifierPrediction1_balanced.csv', index = None)

In [27]:
print(model)

RandomForestClassifier(bootstrap=False, criterion='entropy', max_features=0.6,
                       n_estimators=87, random_state=50)


In [None]:
x_train['percentage'] = y_train
print(x_train.head())

In [None]:
x_train.to_csv('basin1Train.csv', index = None)

In [None]:
RFClassifier = rfr.implementRandomForestCalssifier()

In [None]:
model = rfr.loadModel('./bestModels/21-46-40 Classifier/2207232146.pkl')

In [None]:
print(model)

# Data description and visualization

In [None]:
### FAcc vs Labels
targets = DS['percentage']
FAcc = original['FAcc']
FAcc_norm = DS['FAcc_norm']
fig, axs = plt.subplots(1, 2, figsize=(13,4), sharey=True)
fig.text(-0.02, 0.5, 'labels', va='center', rotation='vertical')
fig.text(0.5, 1, 'Flow accumulation vs labels distribution', ha ='center')
axs[0].scatter(FAcc,targets)
# axs[0].set_title("Facc")
axs[0].set(xlabel='a) Flow Accumulation')
axs[1].scatter(FAcc_norm,targets)
# axs[1].set_title("FAcc_norm")
axs[1].set(xlabel='b) Flow Accumulation estandardized')
plt.rcParams['font.size'] = '20'
fig.tight_layout()


In [None]:
##3 Al features vs labels
# 'disToRiv', 'TWI', 'TPI', 'slope', 'elevation',
targets = DS['percentage']
# targets = np.where( targets == 5, 2,targets)

E = DS['elevation'] 
slope = DS['slope']
FAcc = DS['FAcc_norm']
S = DS['disToRiv']
TWI = DS['TWI']
TPI = DS['TPI']

fig, axs = plt.subplots(3, 2, figsize=(13, 8), sharey=True)
fig.supylabel('Labels')
plt.rcParams['font.size'] = '15'
plt.yticks([0,1,5])
axs[0, 0].scatter(E,targets)
axs[0, 0].set_title("Elevation")
axs[1, 0].scatter(slope,targets)
axs[1, 0].set_title("Slope")
axs[2, 0].scatter(FAcc,targets)
axs[2, 0].set_title("Flow accumulation")
axs[0, 1].scatter(S,targets)
axs[0, 1].set_title("Distance to river")
axs[1, 1].scatter(TWI,targets)
axs[1, 1].set_title("TWI")
axs[2, 1].scatter(TPI,targets)
axs[2, 1].set_title('TPI')
fig.tight_layout()


In [None]:
print(DS.head())
#  Return a dataset with the rows corresponding to the index where condition in DS.columName is valid. 
dsArray = DS[DS.percentage != 0] print(dsArray.head())
print(dsArray.head()) 

In [None]:
sns.set(font_scale=1.5)
sns.pairplot(DS, hue = 'percentage', diag_kind = 'kde', 
             plot_kws = {'alpha': 0.8, 's': 100},
             height = 4, corner=True, palette = "Set2")# vars = ['life_exp', 'log_pop', 'log_gdp_per_cap'],

# sns.pairplot(DS, hue="percentage")

In [None]:
####. Covariance Matrix
matrix = DS.corr().round(2)
sns.heatmap(matrix, annot=True)
plt.show()

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier

estimator = RandomForestClassifier(criterion='entropy', random_state = 50)
x_train,y_train = ms.importDataSet('basin1Train.csv', 'percentage')
classifier = OneVsRestClassifier(estimator).fit(x_train,y_train)


In [None]:
classifier = rfr.loadModel('./outputs/2022-08-05/00-35-58/2208050035.pkl')
x_test,y_test = ms.importDataSet('basin1Validation.csv', 'percentage')
y_prob = classifier.predict_proba(x_test)
#print(np.unique(y_prob))

In [None]:
rfr.plot_ROC_AUC_OneVsRest(classifier, x_test, y_test)