In [73]:
# Import Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [74]:
derived_data = pd.read_csv("Waterbase_v2018_1_WISE4_csv/Waterbase_v2019_1_S_WISE2_SpatialObject_DerivedData.csv")
data         = pd.read_csv("Cleaned_EQR.csv")

In [75]:
derived_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11953 entries, 0 to 11952
Data columns (total 22 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   countryCode                     11953 non-null  object 
 1   thematicIdIdentifier            11953 non-null  object 
 2   thematicIdIdentifierScheme      11953 non-null  object 
 3   monitoringSiteIdentifier        11492 non-null  object 
 4   monitoringSiteIdentifierScheme  11492 non-null  object 
 5   monitoringSiteName              11492 non-null  object 
 6   waterBodyIdentifier             11819 non-null  object 
 7   waterBodyIdentifierScheme       11819 non-null  object 
 8   waterBodyName                   11819 non-null  object 
 9   specialisedZoneType             11819 non-null  object 
 10  naturalAWBHMWB                  11425 non-null  object 
 11  reservoir                       11425 non-null  object 
 12  surfaceWaterBodyTypeCode        

In [76]:
derived_data.sample(5)

Unnamed: 0,countryCode,thematicIdIdentifier,thematicIdIdentifierScheme,monitoringSiteIdentifier,monitoringSiteIdentifierScheme,monitoringSiteName,waterBodyIdentifier,waterBodyIdentifierScheme,waterBodyName,specialisedZoneType,...,surfaceWaterBodyTypeCode,subUnitIdentifier,subUnitIdentifierScheme,subUnitName,rbdIdentifier,rbdIdentifierScheme,rbdName,confidentialityStatus,lat,lon
5771,LT,LTR1577,euMonitoringSiteCode,LTR1577,euMonitoringSiteCode,SESUPE UPSTREAM AUKSTOJI BUKTA,LT150100013,euSurfaceWaterBodyCode,SESUPE,riverWaterBody,...,RWT3,LT111500000,euSubUnitCode,SESUPES PABASEINIS,LT1100,euRBDCode,NEMUNAS,F,54.43278,23.35849
9630,UK,UK7742,eionetMonitoringSiteCode,UK7742,eionetMonitoringSiteCode,UNKNOWN,,,,,...,,,,,,,,unknown,,
67,AT,ATFW80207027,euMonitoringSiteCode,ATFW80207027,euMonitoringSiteCode,"BREGENZERACH, BREGENZ",ATOK100930000,euSurfaceWaterBodyCode,BREGENZERACH,riverWaterBody,...,"MZB_9_1,75",AT2100,euSubUnitCode,RHINE,AT2000,euRBDCode,RHINE,F,47.48693,9.72175
6990,PL,PL01S0701_1119,euMonitoringSiteCode,PL01S0701_1119,euMonitoringSiteCode,GŁOSKÓWKA - GŁOSKÓW-MOST NA DRODZE PIASECZNO-R...,PLRW200017258529,euSurfaceWaterBodyCode,GŁOSKÓWKA,riverWaterBody,...,17,PL2000,euSubUnitCode,VISTULA RIVER BASIN DISTRICT,PL2000,euRBDCode,VISTULA RIVER BASIN DISTRICT,F,52.03,20.96028
3673,IE,IEGBNIIENW_35_160,euMonitoringSiteCode,IEGBNIIENW_35_160,euMonitoringSiteCode,MELVIN,IENW_35_160,euSurfaceWaterBodyCode,MELVIN,lakeWaterBody,...,Lake Type 8,IEGBNIIENW,euSubUnitCode,NORTH WESTERN,IEGBNIIENW,euRBDCode,NORTH WESTERN,N,,


In [77]:
# From The data definition file there are some ineteresting column that we could look into

# 1. countryCode                       (Abbreviation of EEA Member or Collaborating Country)
# 2. monitoringSiteIdentifier          (Unique international identifier of the monitoring site) (Used to connect 2 datasets together)
# 3. specialisedZoneType               (Additional classification value which further specialises the type of management, regulation or restriction zone, represented by this spatial object)
# 4. naturalAWBHMWB                    (Specification of whether a water body is identified as natural, artificial (AWB) or heavily modified (HMWB))
# 5. reservoir                         (For heavily modified river or lake water bodies, the value indicates whether the water body is a reservoir that has been created by damming a river or an existing lake)


specialisedZoneType

In [78]:
# Check For Data that is in Specialised Zone Type
derived_data["specialisedZoneType"].value_counts()

riverWaterBody           9090
lakeWaterBody            1792
coastalWaterBody          501
transitionalWaterBody     436
Name: specialisedZoneType, dtype: int64

In [79]:
# The column contain the data about costalWaterBody and transitionalWaterBody which we didn't used in the previous data
# Therefore we will drop both costalWaterBody and transitionalWaterBody

derived_data = derived_data[(derived_data["specialisedZoneType"] == "riverWaterBody") | (derived_data["specialisedZoneType"] == "lakeWaterBody")]

In [80]:
# Rename riverWaterBody and lakeWaterBody for later
derived_data["specialisedZoneType"].replace("riverWaterBody","RW",inplace=True)
derived_data["specialisedZoneType"].replace("lakeWaterBody","LW",inplace=True)

In [81]:
# Rename the column to CountryCode inorder to make it easier to merge between each dataframe
derived_data.rename(columns={"specialisedZoneType": "parameterWaterBodyCategory"},inplace=True)

In [82]:
derived_data["parameterWaterBodyCategory"].value_counts()

RW    9090
LW    1792
Name: parameterWaterBodyCategory, dtype: int64

countryCode

In [83]:
# Rename the column to CountryCode inorder to make it easier to merge between each dataframe
derived_data.rename(columns={"countryCode": "CountryCode"},inplace=True)

In [84]:
# Check Each Unique Country
derived_data["CountryCode"].unique()

array(['AT', 'BE', 'BG', 'CH', 'CY', 'DE', 'DK', 'EE', 'ES', 'FI', 'FR',
       'HR', 'IE', 'IT', 'LT', 'LU', 'LV', 'NL', 'NO', 'PL', 'PT', 'RO',
       'SE', 'SI', 'SK', 'UK'], dtype=object)

In [85]:
# Check Unique Country in Previous Cleaned Dataset
data["CountryCode"].unique()

array(['AT', 'BE', 'BG', 'CY', 'DE', 'EE', 'ES', 'FI', 'FR', 'HR', 'IE',
       'IT', 'LT', 'LU', 'LV', 'NL', 'NO', 'PL', 'PT', 'RO', 'SE', 'SI',
       'SK', 'UK'], dtype=object)

In [86]:
# Check For Each Unique Country in each Dataset
len_uni_EQR_RW     = len(data[data["parameterWaterBodyCategory"] == "RW"]["CountryCode"].unique())
len_uni_EQR_LW     = len(data[data["parameterWaterBodyCategory"] == "LW"]["CountryCode"].unique())
len_uni_DER_RW     = len(derived_data[derived_data["parameterWaterBodyCategory"] == "RW"]["CountryCode"].unique())
len_uni_DER_LW     = len(derived_data[derived_data["parameterWaterBodyCategory"] == "LW"]["CountryCode"].unique())

# Current File
print(f"Number of Unique Country in DER (RW) Dataset : {len_uni_DER_RW}")
print(f"Number of Unique Country in DER (LW) Dataset : {len_uni_DER_LW}")

# Previous File
print(f"Number of Unique Country in EQR (RW) Dataset : {len_uni_EQR_RW}")
print(f"Number of Unique Country in EQR (LW) Dataset : {len_uni_EQR_LW}")

Number of Unique Country in DER (RW) Dataset : 26
Number of Unique Country in DER (LW) Dataset : 20
Number of Unique Country in EQR (RW) Dataset : 24
Number of Unique Country in EQR (LW) Dataset : 16


In [87]:
# First Check the value that are in EQR Dataset But not in DER dataset

RW_does_not_contain = []
LW_does_not_contain = []

for RW in data[data["parameterWaterBodyCategory"] == "RW"]["CountryCode"].unique():
    if RW not in derived_data[derived_data["parameterWaterBodyCategory"] == "RW"]["CountryCode"].unique():
        RW_does_not_contain.append(RW)

for LW in data[data["parameterWaterBodyCategory"] == "LW"]["CountryCode"].unique():
    if LW not in derived_data[derived_data["parameterWaterBodyCategory"] == "LW"]["CountryCode"].unique():
        LW_does_not_contain.append(LW)

print(f"Countries that are in the EQR (RW) Dataset but not in DER (RW) dataset: {RW_does_not_contain}")
print(f"Countries that are in the EQR (LW) Dataset but not in DER (LW) dataset: {LW_does_not_contain}")

Countries that are in the EQR (RW) Dataset but not in DER (RW) dataset: []
Countries that are in the EQR (LW) Dataset but not in DER (LW) dataset: ['CY']


In [88]:
# Check the shape of CY rows in the EQR dataset
data[(data["CountryCode"] == "CY") & (data["parameterWaterBodyCategory"] == "LW")]

Unnamed: 0,CountryCode,parameterICStatusOfDeterminandBiologyEQR,parameterBoundaryValueClasses12,parameterBoundaryValueClasses23,parameterBoundaryValueClasses34,parameterBoundaryValueClasses45,monitoringSiteIdentifier,parameterWaterBodyCategory,parameterNCSWaterBodyType,resultEcologicalStatusClassValue,result_convert_EQR,UID
24470,CY,0.555556,0.8,0.6,0.4,0.2,CYD2-2-6-85,LW,425.0,2,1,43423
24471,CY,0.555556,0.8,0.6,0.4,0.2,CYD2-2-6-85,LW,425.0,2,1,27530
24472,CY,0.555556,0.8,0.6,0.4,0.2,CYD2-2-6-85,LW,425.0,2,1,27538


In [89]:
# There are only 3 rows therefore we drop that rows
data = data[~((data["CountryCode"] == "CY") & (data["parameterWaterBodyCategory"] == "LW"))]
data.shape

(29007, 12)

In [90]:
# For this time check the country code that is existed in DER but not in Data

RW_does_not_contain = []
LW_does_not_contain = []

for RW in derived_data[derived_data["parameterWaterBodyCategory"] == "RW"]["CountryCode"].unique():
    if RW not in data[data["parameterWaterBodyCategory"] == "RW"]["CountryCode"].unique():
        RW_does_not_contain.append(RW)

for LW in derived_data[derived_data["parameterWaterBodyCategory"] == "LW"]["CountryCode"].unique():
    if LW not in data[data["parameterWaterBodyCategory"] == "LW"]["CountryCode"].unique():
        LW_does_not_contain.append(LW)

print(f"Countries that are in the EQR (RW) Dataset but not in DER (RW) dataset: {RW_does_not_contain}")
print(f"Countries that are in the EQR (LW) Dataset but not in DER (LW) dataset: {LW_does_not_contain}")

Countries that are in the EQR (RW) Dataset but not in DER (RW) dataset: ['CH', 'DK']
Countries that are in the EQR (LW) Dataset but not in DER (LW) dataset: ['DE', 'FR', 'IT', 'NO', 'PT']


In [91]:
# Categorize data into 2 datasets data_RW and data_LW
derived_data_RW = derived_data[derived_data["parameterWaterBodyCategory"] == "RW"]
derived_data_LW = derived_data[derived_data["parameterWaterBodyCategory"] == "LW"]

In [92]:
# Drop these values

# RW
derived_data_RW = derived_data_RW[derived_data_RW["CountryCode"] != "CH"]
derived_data_RW = derived_data_RW[derived_data_RW["CountryCode"] != "DK"]

# LW
derived_data_LW = derived_data_LW[derived_data_LW["CountryCode"] != "DK"]
derived_data_LW = derived_data_LW[derived_data_LW["CountryCode"] != "DE"]
derived_data_LW = derived_data_LW[derived_data_LW["CountryCode"] != "FR"]
derived_data_LW = derived_data_LW[derived_data_LW["CountryCode"] != "IT"]
derived_data_LW = derived_data_LW[derived_data_LW["CountryCode"] != "DK"]

In [93]:
# Concat the dataframe
derived_data = pd.concat([derived_data_RW,derived_data_LW])
derived_data.shape

(10699, 22)

naturalAWBHMWB

In [94]:
derived_data["naturalAWBHMWB"].value_counts()

Natural    7698
HMWB       2484
AWB         223
Name: naturalAWBHMWB, dtype: int64

In [95]:
# Create Dummies for naturalAWBHMWB column
derived_data = pd.get_dummies(derived_data,columns=["naturalAWBHMWB"])

reservoir

In [96]:
# Do the same for reserviour
derived_data["reservoir"].value_counts()

Missing                                          7211
Not a reservoir                                  2523
Unpopulated                                       468
Reservoir in former river                         184
Reservoir in former lake                           15
Reservoir in former chain of rivers and lakes       4
Name: reservoir, dtype: int64

In [97]:
# Create a dummmies for this feature too
derived_data = pd.get_dummies(derived_data,columns=["reservoir"])

In [98]:
# Cut down the unused columns
cols = ['CountryCode',
        'monitoringSiteIdentifier',
        'parameterWaterBodyCategory',
        'naturalAWBHMWB_AWB',
        'naturalAWBHMWB_HMWB', 
        'naturalAWBHMWB_Natural', 
        'reservoir_Missing',
        'reservoir_Not a reservoir',
        'reservoir_Reservoir in former chain of rivers and lakes',
        'reservoir_Reservoir in former lake',
        'reservoir_Reservoir in former river', 
        'reservoir_Unpopulated',
        ]

In [99]:
derived_data = derived_data[cols]

In [100]:
derived_data

Unnamed: 0,CountryCode,monitoringSiteIdentifier,parameterWaterBodyCategory,naturalAWBHMWB_AWB,naturalAWBHMWB_HMWB,naturalAWBHMWB_Natural,reservoir_Missing,reservoir_Not a reservoir,reservoir_Reservoir in former chain of rivers and lakes,reservoir_Reservoir in former lake,reservoir_Reservoir in former river,reservoir_Unpopulated
0,AT,ATFW10000027,RW,0,0,1,0,1,0,0,0,0
1,AT,ATFW10000077,RW,0,0,1,0,1,0,0,0,0
2,AT,ATFW10000087,RW,0,0,1,0,1,0,0,0,0
3,AT,ATFW10000177,RW,0,0,1,0,1,0,0,0,0
4,AT,ATFW21500097,RW,0,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
11933,UK,UKSC320717,LW,0,0,1,1,0,0,0,0,0
11944,UK,UKSC350177,LW,0,1,0,0,1,0,0,0,0
11945,UK,UKSC350187,LW,0,0,1,1,0,0,0,0,0
11948,UK,UKSC375884,LW,0,0,1,1,0,0,0,0,0


Merging Data

In [101]:
# So Now We Merge both The EQR dataset and The Derived dataset together
# Using Monitoringsite, countrycode and parameterwaterbodycategory for connecting them together

merged = pd.merge(derived_data,data,on=["monitoringSiteIdentifier","parameterWaterBodyCategory","CountryCode"])
merged.shape

In [102]:
# Drop rows that has duplicated id
merged.drop_duplicates(["UID"],inplace=True)

In [103]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28805 entries, 0 to 28811
Data columns (total 21 columns):
 #   Column                                                   Non-Null Count  Dtype  
---  ------                                                   --------------  -----  
 0   CountryCode                                              28805 non-null  object 
 1   monitoringSiteIdentifier                                 28805 non-null  object 
 2   parameterWaterBodyCategory                               28805 non-null  object 
 3   naturalAWBHMWB_AWB                                       28805 non-null  uint8  
 4   naturalAWBHMWB_HMWB                                      28805 non-null  uint8  
 5   naturalAWBHMWB_Natural                                   28805 non-null  uint8  
 6   reservoir_Missing                                        28805 non-null  uint8  
 7   reservoir_Not a reservoir                                28805 non-null  uint8  
 8   reservoir_Reservoir in for

In [104]:
merged.to_csv("merged.csv",index=False)