### Import Packages

In [1]:
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, cross_val_score

# Widen pandas' display limits so wide/long frames are not truncated when previewed
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 100)

### Read Dataset

In [2]:
species = 'Sardina pilchardus'
speciesName = 'Sardina Pilchardus'
# Folder that holds one CSV per species; change this single constant to run elsewhere
DATA_DIR = 'T:/ownCloud/ODYSSEA/Aquamaps/the16species_final/'

# panel data: one row per (obs_id, year_month)
data_panel = pd.read_csv(DATA_DIR + species + '.csv', index_col='Unnamed: 0')

# discard the 300_400 and 100_500 depth-band variables
cols_to_drop = [c for c in data_panel.columns if '300_400' in c or '100_500' in c]
data_panel = data_panel.drop(columns=cols_to_drop)

# fill missing values with (linear) interpolation.
# Only numeric columns are interpolated: the frame still contains object columns
# (e.g. Genus, substrateType, year_month as text) and recent pandas versions
# deprecate calling interpolate() on object dtype.
numeric_cols = data_panel.select_dtypes(include='number').columns
data_panel[numeric_cols] = data_panel[numeric_cols].interpolate()
# NOTE(review): interpolation runs down the whole frame in row order, so a gap at the
# start of one obs_id is filled using rows of the previous obs_id. Consider
# interpolating within data_panel.groupby('obs_id') instead - confirm intent first.

# create year and month columns
data_panel['year_month'] = pd.to_datetime(data_panel['year_month'])
data_panel['year'] = data_panel['year_month'].dt.year
data_panel['month'] = data_panel['year_month'].dt.month
print(data_panel.shape[0], 'rows')

71520 rows


In [3]:
# Preview the first rows of the panel after cleaning and the added year/month columns
data_panel.head()

Unnamed: 0,id,Genus,Species,Center Lat,Center Long,C-Square Code,Overall Probability,obs_id,temperatureSurface,year_month,temperature100_300,temperatureMaxDepth,salinitySurface,salinity100_300,salinityMaxDepth,dissolvedOxygenSurface,dissolvedOxygen100_300,dissolvedOxygenMaxDepth,meridionalCurrentSurface,meridionalCurrent100_300,meridionalCurrentMaxDepth,zonalCurrentSurface,zonalCurrent100_300,zonalCurrentMaxDepth,chlorophyll,euphoticDepth,secchiDiskDepth,wave_Height,nitrateSurface,nitrate100_300,nitrateMaxDepth,phosphateSurface,phosphate100_300,phosphateMaxDepth,distanceToCoast,majorRiverDistance,majorRiversScale,bathymetry,substrateType,substrateOrigHabitat,substrateBiozone,year,month
0,02008-01,Sardina,pilchardus,30.75,17.75,1301:207:4,0.12,0,17.487333,2008-01-01,17.483358,17.483358,38.58962,38.589687,38.589687,215.00977,215.00972,215.00972,-0.004633,-0.004417,-0.004417,0.00399,0.003799,0.003799,0.09112,-0.787063,16.542532,0.364236,0.006429,0.006428,0.006428,0.003623,0.003623,0.003623,18.893757,1105.000196,8,-16.010401,Sand,A5.46: Mediterranean biocoenosis of coastal de...,Circalittoral,2008,1
1,02008-02,Sardina,pilchardus,30.75,17.75,1301:207:4,0.12,0,16.356483,2008-02-01,16.32341,16.32341,38.3575,38.3565,38.3565,222.06847,222.06845,222.06845,0.0,-0.004417,0.0,0.0,0.003799,0.0,0.058834,-0.595206,14.983622,0.371151,0.005919,0.005919,0.005919,0.003703,0.003703,0.003703,18.893757,1105.000196,8,-16.010401,Sand,A5.46: Mediterranean biocoenosis of coastal de...,Circalittoral,2008,2
2,02008-03,Sardina,pilchardus,30.75,17.75,1301:207:4,0.12,0,17.412617,2008-03-01,17.323942,17.323942,38.330997,38.32942,38.32942,222.28883,222.28891,222.28891,-0.018918,-0.017599,-0.017599,0.014905,0.015852,0.015852,0.088339,-0.858202,17.27412,0.173578,0.006872,0.006872,0.006872,0.003937,0.003937,0.003937,18.893757,1105.000196,8,-16.010401,Sand,A5.46: Mediterranean biocoenosis of coastal de...,Circalittoral,2008,3
3,02008-04,Sardina,pilchardus,30.75,17.75,1301:207:4,0.12,0,18.664886,2008-04-01,18.580479,18.580479,38.356586,38.35468,38.35468,217.55254,217.55266,217.55266,-0.015815,-0.014742,-0.014742,0.014309,0.012531,0.012531,0.219484,-0.786224,20.733776,0.777014,0.006976,0.006975,0.006975,0.004803,0.004803,0.004803,18.893757,1105.000196,8,-16.010401,Sand,A5.46: Mediterranean biocoenosis of coastal de...,Circalittoral,2008,4
4,02008-05,Sardina,pilchardus,30.75,17.75,1301:207:4,0.12,0,20.683325,2008-05-01,20.466887,20.466887,38.41463,38.409603,38.409603,211.7808,212.11305,212.11305,-0.010033,-0.009926,-0.009926,0.009827,0.007829,0.007829,0.07954,-0.866522,21.826303,0.29616,0.007796,0.007209,0.007209,0.004959,0.004966,0.004966,18.893757,1105.000196,8,-16.010401,Sand,A5.46: Mediterranean biocoenosis of coastal de...,Circalittoral,2008,5


In [4]:
### Create some lists that will be useful
# Columns that describe the location itself (aggregated per observation by most frequent value)
static_cols = ['distanceToCoast','majorRiverDistance', 'majorRiversScale', 'bathymetry', 'substrateType','substrateOrigHabitat', 'substrateBiozone']
# Columns that vary month to month (aggregated per observation with means).
# Names that are absent from data_panel are simply never matched below.
temporal_cols = [
    'temperatureSurface', 'temperature100_300', 'temperature300_400', 'temperature100_500', 'temperatureMaxDepth',
    'salinitySurface', 'salinity100_300', 'salinity300_400', 'salinity100_500', 'salinityMaxDepth',
    'dissolvedOxygenSurface', 'dissolvedOxygen100_300', 'dissolvedOxygen300_400', 'dissolvedOxygen100_500', 'dissolvedOxygenMaxDepth',
    'meridionalCurrentSurface', 'meridionalCurrent100_300', 'meridionalCurrent300_400', 'meridionalCurrent100_500', 'meridionalCurrentMaxDepth',
    'zonalCurrentSurface', 'zonalCurrent100_300', 'zonalCurrent300_400', 'zonalCurrent100_500', 'zonalCurrentMaxDepth',
    'chlorophyll', 'euphoticDepth', 'secchiDiskDepth', 'wave_Height',
    'nitrateSurface', 'nitrate100_300', 'nitrate300_400', 'nitrate100_500', 'nitrateMaxDepth',
    'phosphateSurface', 'phosphate100_300', 'phosphate300_400', 'phosphate100_500', 'phosphateMaxDepth',
]
# Location / target columns, also reduced to their most frequent value per observation
location_cols = ['Center Lat', 'Center Long', 'Overall Probability']


def _most_frequent(values):
    """Return the most frequent non-null value of a Series (raises IndexError if all values are null)."""
    return values.value_counts().index[0]


### Create a new dataset with one row per obs_id, aggregating that observation's monthly rows.
# Everything is computed with groupby instead of a Python loop over every obs_id:
# the per-observation boolean masks and cell-by-cell `.loc` writes were the
# dominant cost and produced the same numbers.
by_obs = data_panel.groupby('obs_id')
january_by_obs = data_panel[data_panel['month'] == 1].groupby('obs_id')

features = {}  # insertion order defines the final column order
for c in data_panel.columns:
    if c in temporal_cols:
        # mean over all rows of the observation
        features[c + '_mean'] = by_obs[c].mean()
        # mean of the 10-row rolling mean, computed within each observation
        # (the first 9 rolling values of each observation are NaN and are skipped by mean)
        features[c + '_Moving_average_10_mean'] = (
            by_obs[c].rolling(window=10).mean().groupby(level=0).mean()
        )
        # mean over the observation's January rows only (NaN if it has none)
        features[c + '_January_mean'] = january_by_obs[c].mean()
    elif c in static_cols or c in location_cols:
        features[c] = by_obs[c].agg(_most_frequent)

# Series are aligned on obs_id; build the frame once instead of growing it column by column
data = pd.DataFrame(features).rename_axis('obs_id')

In [5]:
# Report the size of the per-observation feature table, then preview its first rows
print(data.shape[0], 'rows and ', data.shape[1], 'columns')
data.head()

596 rows and  85 columns


Unnamed: 0_level_0,Center Lat,Center Long,Overall Probability,temperatureSurface_mean,temperatureSurface_Moving_average_10_mean,temperatureSurface_January_mean,temperature100_300_mean,temperature100_300_Moving_average_10_mean,temperature100_300_January_mean,temperatureMaxDepth_mean,temperatureMaxDepth_Moving_average_10_mean,temperatureMaxDepth_January_mean,salinitySurface_mean,salinitySurface_Moving_average_10_mean,salinitySurface_January_mean,salinity100_300_mean,salinity100_300_Moving_average_10_mean,salinity100_300_January_mean,salinityMaxDepth_mean,salinityMaxDepth_Moving_average_10_mean,salinityMaxDepth_January_mean,dissolvedOxygenSurface_mean,dissolvedOxygenSurface_Moving_average_10_mean,dissolvedOxygenSurface_January_mean,dissolvedOxygen100_300_mean,dissolvedOxygen100_300_Moving_average_10_mean,dissolvedOxygen100_300_January_mean,dissolvedOxygenMaxDepth_mean,dissolvedOxygenMaxDepth_Moving_average_10_mean,dissolvedOxygenMaxDepth_January_mean,meridionalCurrentSurface_mean,meridionalCurrentSurface_Moving_average_10_mean,meridionalCurrentSurface_January_mean,meridionalCurrent100_300_mean,meridionalCurrent100_300_Moving_average_10_mean,meridionalCurrent100_300_January_mean,meridionalCurrentMaxDepth_mean,meridionalCurrentMaxDepth_Moving_average_10_mean,meridionalCurrentMaxDepth_January_mean,zonalCurrentSurface_mean,zonalCurrentSurface_Moving_average_10_mean,zonalCurrentSurface_January_mean,zonalCurrent100_300_mean,zonalCurrent100_300_Moving_average_10_mean,zonalCurrent100_300_January_mean,zonalCurrentMaxDepth_mean,zonalCurrentMaxDepth_Moving_average_10_mean,zonalCurrentMaxDepth_January_mean,chlorophyll_mean,chlorophyll_Moving_average_10_mean,chlorophyll_January_mean,euphoticDepth_mean,euphoticDepth_Moving_average_10_mean,euphoticDepth_January_mean,secchiDiskDepth_mean,secchiDiskDepth_Moving_average_10_mean,secchiDiskDepth_January_mean,wave_Height_mean,wave_Height_Moving_average_10_mean,wave_Height_January_mean,nitrateSurface_mean,nitrateSurface_Moving_average_10_me
an,nitrateSurface_January_mean,nitrate100_300_mean,nitrate100_300_Moving_average_10_mean,nitrate100_300_January_mean,nitrateMaxDepth_mean,nitrateMaxDepth_Moving_average_10_mean,nitrateMaxDepth_January_mean,phosphateSurface_mean,phosphateSurface_Moving_average_10_mean,phosphateSurface_January_mean,phosphate100_300_mean,phosphate100_300_Moving_average_10_mean,phosphate100_300_January_mean,phosphateMaxDepth_mean,phosphateMaxDepth_Moving_average_10_mean,phosphateMaxDepth_January_mean,distanceToCoast,majorRiverDistance,majorRiversScale,bathymetry,substrateType,substrateOrigHabitat,substrateBiozone
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1
0,30.75,17.75,0.12,22.37363,22.411266,17.740097,22.287897,22.325667,17.732998,22.287897,22.325667,17.732998,38.681165,38.684635,38.577528,38.677997,38.681472,38.577256,38.677997,38.681472,38.577256,203.783999,203.697217,214.176299,203.82899,203.741969,214.17624,203.82899,203.741969,214.17624,-0.007706,-0.007201,-0.023694,-0.004464,-0.004094,-0.023308,-0.00379,-0.003397,-0.023308,0.001265,0.000939,0.020613,0.006029,0.005681,0.01991,0.005456,0.00509,0.01991,0.076478,0.074686,0.085604,-0.846714,-0.849499,-0.710584,17.455139,17.36163,14.625666,0.75474,0.769129,1.164137,0.008315,0.008353,0.005846,0.008181,0.00822,0.005846,0.008181,0.00822,0.005846,0.004724,0.004726,0.003617,0.004721,0.004723,0.003617,0.004721,0.004723,0.003617,18.893757,1105.000196,8,-16.010401,Sand,A5.46: Mediterranean biocoenosis of coastal de...,Circalittoral
1,30.25,18.75,0.08,22.510767,22.540962,18.12705,22.427538,22.457088,18.121875,22.427538,22.457088,18.121875,38.713629,38.710556,38.637363,38.710803,38.707709,38.637149,38.710803,38.707709,38.637149,203.30919,203.269015,213.080083,203.323303,203.282241,213.080042,203.323303,203.282241,213.080042,-0.012922,-0.012341,-0.024109,-0.006547,-0.006125,-0.023876,-0.00582,-0.005407,-0.023876,0.004533,0.004157,0.021016,0.009059,0.00865,0.020533,0.008436,0.008034,0.020533,0.091057,0.091446,0.075589,-0.772313,-0.778362,-0.691783,26.143833,26.151211,21.510722,0.682463,0.693969,0.946254,0.007701,0.007608,0.005532,0.013837,0.00819,0.005531,0.013837,0.00819,0.005531,0.004749,0.004753,0.003677,0.005081,0.004788,0.003677,0.005081,0.004788,0.003677,12.91371,1140.79994,8,-5.39839,Sand,A5.46: Mediterranean biocoenosis of coastal de...,Circalittoral
2,30.75,18.75,0.09,22.382202,22.401291,18.956089,15.951561,15.929595,16.635167,15.951561,15.929595,16.635167,38.591174,38.582902,38.560479,38.666155,38.66086,38.636089,38.666155,38.66086,38.636089,201.188264,201.172393,207.241542,205.986379,205.85281,205.851439,205.986379,205.85281,205.851439,-0.007238,-0.005633,-0.003409,-0.000512,-0.00054,-0.001742,-0.00047,-0.000486,-0.001742,-0.040264,-0.043377,0.005133,-0.004074,-0.004116,0.012226,-0.004464,-0.00441,0.012226,0.058451,0.058105,0.106297,-0.962889,-0.963957,-0.679721,29.6389,29.668415,22.534747,1.006145,1.020739,1.630632,0.014393,0.014053,0.031426,1.55624,1.585028,1.32963,1.55624,1.585028,1.32963,0.006078,0.006095,0.007121,0.093676,0.095159,0.07988,0.093676,0.095159,0.07988,31.365613,1085.905121,8,-71.006104,Muddy sand,A5.23: Infralittoral fine sands,Infralittoral
3,30.25,19.25,0.14,22.485016,22.512389,18.127626,22.335924,22.363548,18.11831,22.335924,22.363548,18.11831,38.73844,38.733393,38.680338,38.731926,38.726869,38.679962,38.731926,38.726869,38.679962,200.799729,200.718686,211.226974,200.990596,200.905528,211.226838,200.990596,200.905528,211.226838,-0.004965,-0.005072,0.012511,0.00794,0.007671,0.01371,0.007571,0.007306,0.01371,0.003256,0.002971,0.042002,0.007635,0.007315,0.024753,0.006999,0.006684,0.024753,0.11253,0.113487,0.084206,-0.581591,-0.581197,-0.596028,28.995374,29.041552,23.002189,0.805212,0.818713,1.210851,0.006719,0.006771,0.004673,0.006272,0.006329,0.004672,0.006272,0.006329,0.004672,0.004522,0.004548,0.003414,0.004516,0.004542,0.003414,0.004516,0.004542,0.003414,3.049505,1103.186667,7,-7.01648,Sand,A5.46: Mediterranean biocoenosis of coastal de...,Circalittoral
4,30.25,19.75,0.19,22.514297,22.536902,18.546879,22.218005,22.238859,18.539902,22.218005,22.238859,18.539902,38.715601,38.709445,38.668491,38.699159,38.692726,38.66814,38.699159,38.692726,38.66814,200.714299,200.677682,210.568886,201.763006,201.728635,210.568779,201.763006,201.728635,210.568779,-0.009709,-0.009991,0.015351,0.006801,0.006353,0.016179,0.006443,0.006024,0.016179,-0.000942,-0.001264,0.014673,0.004769,0.004397,0.013478,0.004482,0.004135,0.013478,0.162442,0.164042,0.108016,-0.738898,-0.738726,-0.707546,18.330527,18.281981,17.433463,0.813227,0.827129,1.363113,0.008634,0.008768,0.003639,0.006595,0.006708,0.003638,0.006595,0.006708,0.003638,0.004746,0.004769,0.003623,0.004655,0.004678,0.003623,0.004655,0.004678,0.003623,22.506086,1055.413182,7,-6.41378,Muddy sand,A5.23: Infralittoral fine sands,Infralittoral


### Encode Categorical Features

In [6]:
# Replace every object-typed column with integer codes (one LabelEncoder per column)
object_cols = [col for col in data.columns if data[col].dtype == 'object']
for col in object_cols:
    lbl = LabelEncoder()
    raw_values = list(data[col].values)
    # fit and encode in a single pass over the column's values
    data[col] = lbl.fit_transform(raw_values)

## Train Models

In [7]:
SEED = 2019  # for reproducibility
NFOLDS = 5
ntrain = len(data)

# Define Cross Validation: shuffled K-fold splitter with a fixed seed
folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)


# Define evaluation function (Root Mean Square Error)
def cv_rmse(model, X, y):
    """Cross-validate `model` on (X, y) and return the RMSE of each fold.

    Uses the module-level `folds` splitter. scikit-learn reports the negated
    MSE for this scorer, so the sign is flipped before taking the square root.

    Returns an array with one RMSE value per fold.
    """
    neg_mse_per_fold = cross_val_score(
        model, X, y, scoring="neg_mean_squared_error", cv=folds
    )
    return np.sqrt(-neg_mse_per_fold)

In [8]:
# Columns that must not be used as predictors: the target and the observation id
cols_to_exclude = ['obs_id','Overall Probability']
df_train_columns = [c for c in data.columns if c not in cols_to_exclude]

# Target as a 1-D numpy array. `.values` replaces `Series.ravel()`, which recent
# pandas versions deprecate; a Series is already 1-D so the result is the same.
y_train = data['Overall Probability'].values
# Feature matrix as a 2-D numpy array
x_train = data[df_train_columns].values

### LightGBM

In [9]:
# Hyper-parameters for the LightGBM regressor, kept together so they are easy to tune
lgbm_params = dict(
    objective='regression',
    num_leaves=4,
    learning_rate=0.01,
    n_estimators=5000,
    max_bin=200,
    bagging_fraction=0.75,
    bagging_freq=5,
    bagging_seed=7,
    feature_fraction=0.2,
    feature_fraction_seed=7,
    verbose=-1,
)
lightgbm = LGBMRegressor(**lgbm_params)

# Cross-validated RMSE per fold, reported as mean (standard deviation)
score = cv_rmse(lightgbm, x_train, y_train)
rmse_mean = score.mean()
rmse_std = score.std()
print("lightgbm: {:.4f} ({:.4f})".format(rmse_mean, rmse_std) )

lightgbm: 0.1389 (0.0168)
