### Import Packages

In [1]:
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, cross_val_score

# Raise the pandas display limits: show up to 200 columns and 100 rows.
pd.options.display.max_columns = 200
pd.options.display.max_rows = 100

### Read Dataset

In [2]:
# `species` doubles as the stem of the CSV file name; `speciesName` is not used
# in the cells visible here.
species = 'Sardina pilchardus'
speciesName = 'Sardina Pilchardus'
# Directory that contains the species CSV -- edit this single line to point at a local copy.
DATA_DIR = 'T:/ownCloud/ODYSSEA/Aquamaps/the16species_final/'
# panel data
data_panel = pd.read_csv(DATA_DIR + species + '.csv', index_col='Unnamed: 0')
# Fill missing values by (default, linear) interpolation along the rows.
# Only numeric columns are interpolated: text columns cannot be interpolated, and
# passing object-dtype columns to DataFrame.interpolate is deprecated in recent pandas.
# NOTE(review): interpolation runs over the whole frame, so a gap at the boundary
# between two obs_id series is filled using the neighbouring obs_id's values --
# confirm this is intended.
numeric_cols = data_panel.select_dtypes(include='number').columns
data_panel[numeric_cols] = data_panel[numeric_cols].interpolate()
print(data_panel.shape[0], 'rows')

71520 rows


In [3]:
# Preview the first five rows of the panel.
data_panel.head(n=5)

Unnamed: 0,id,Genus,Species,Center Lat,Center Long,C-Square Code,Overall Probability,obs_id,temperatureSurface,year_month,temperature100_300,temperature300_400,temperature100_500,temperatureMaxDepth,salinitySurface,salinity100_300,salinity300_400,salinity100_500,salinityMaxDepth,dissolvedOxygenSurface,dissolvedOxygen100_300,dissolvedOxygen300_400,dissolvedOxygen100_500,dissolvedOxygenMaxDepth,meridionalCurrentSurface,meridionalCurrent100_300,meridionalCurrent300_400,meridionalCurrent100_500,meridionalCurrentMaxDepth,zonalCurrentSurface,zonalCurrent100_300,zonalCurrent300_400,zonalCurrent100_500,zonalCurrentMaxDepth,chlorophyll,euphoticDepth,secchiDiskDepth,wave_Height,nitrateSurface,nitrate100_300,nitrate300_400,nitrate100_500,nitrateMaxDepth,phosphateSurface,phosphate100_300,phosphate300_400,phosphate100_500,phosphateMaxDepth,distanceToCoast,majorRiverDistance,majorRiversScale,bathymetry,substrateType,substrateOrigHabitat,substrateBiozone
0,02008-01,Sardina,pilchardus,30.75,17.75,1301:207:4,0.12,0,17.487333,2008-01,17.483358,17.483358,17.483358,17.483358,38.58962,38.589687,38.589687,38.589687,38.589687,215.00977,215.00972,215.00972,215.00972,215.00972,-0.004633,-0.004417,-0.004417,-0.004417,-0.004417,0.00399,0.003799,0.003799,0.003799,0.003799,0.09112,-0.787063,16.542532,0.364236,0.006429,0.006428,0.006428,0.006428,0.006428,0.003623,0.003623,0.003623,0.003623,0.003623,18.893757,1105.000196,8,-16.010401,Sand,A5.46: Mediterranean biocoenosis of coastal de...,Circalittoral
1,02008-02,Sardina,pilchardus,30.75,17.75,1301:207:4,0.12,0,16.356483,2008-02,16.32341,16.32341,16.32341,16.32341,38.3575,38.3565,38.3565,38.3565,38.3565,222.06847,222.06845,222.06845,222.06845,222.06845,0.0,-0.004417,-0.004417,-0.004417,0.0,0.0,0.003799,0.003799,0.003799,0.0,0.058834,-0.595206,14.983622,0.371151,0.005919,0.005919,0.005919,0.005919,0.005919,0.003703,0.003703,0.003703,0.003703,0.003703,18.893757,1105.000196,8,-16.010401,Sand,A5.46: Mediterranean biocoenosis of coastal de...,Circalittoral
2,02008-03,Sardina,pilchardus,30.75,17.75,1301:207:4,0.12,0,17.412617,2008-03,17.323942,17.323942,17.323942,17.323942,38.330997,38.32942,38.32942,38.32942,38.32942,222.28883,222.28891,222.28891,222.28891,222.28891,-0.018918,-0.017599,-0.017599,-0.017599,-0.017599,0.014905,0.015852,0.015852,0.015852,0.015852,0.088339,-0.858202,17.27412,0.173578,0.006872,0.006872,0.006872,0.006872,0.006872,0.003937,0.003937,0.003937,0.003937,0.003937,18.893757,1105.000196,8,-16.010401,Sand,A5.46: Mediterranean biocoenosis of coastal de...,Circalittoral
3,02008-04,Sardina,pilchardus,30.75,17.75,1301:207:4,0.12,0,18.664886,2008-04,18.580479,18.580479,18.580479,18.580479,38.356586,38.35468,38.35468,38.35468,38.35468,217.55254,217.55266,217.55266,217.55266,217.55266,-0.015815,-0.014742,-0.014742,-0.014742,-0.014742,0.014309,0.012531,0.012531,0.012531,0.012531,0.219484,-0.786224,20.733776,0.777014,0.006976,0.006975,0.006975,0.006975,0.006975,0.004803,0.004803,0.004803,0.004803,0.004803,18.893757,1105.000196,8,-16.010401,Sand,A5.46: Mediterranean biocoenosis of coastal de...,Circalittoral
4,02008-05,Sardina,pilchardus,30.75,17.75,1301:207:4,0.12,0,20.683325,2008-05,20.466887,20.466887,20.466887,20.466887,38.41463,38.409603,38.409603,38.409603,38.409603,211.7808,212.11305,212.11305,212.11305,212.11305,-0.010033,-0.009926,-0.009926,-0.009926,-0.009926,0.009827,0.007829,0.007829,0.007829,0.007829,0.07954,-0.866522,21.826303,0.29616,0.007796,0.007209,0.007209,0.007209,0.007209,0.004959,0.004966,0.004966,0.004966,0.004966,18.893757,1105.000196,8,-16.010401,Sand,A5.46: Mediterranean biocoenosis of coastal de...,Circalittoral


In [4]:
### Create some lists that will be useful
# Columns summarised with their most frequent value per obs_id.
static_cols = [
    'distanceToCoast', 'majorRiverDistance', 'majorRiversScale', 'bathymetry',
    'substrateType', 'substrateOrigHabitat', 'substrateBiozone',
]
# Columns summarised with mean/std/min/max per obs_id.
temporal_cols = [
    'temperatureSurface', 'temperature100_300', 'temperature300_400', 'temperature100_500', 'temperatureMaxDepth',
    'salinitySurface', 'salinity100_300', 'salinity300_400', 'salinity100_500', 'salinityMaxDepth',
    'dissolvedOxygenSurface', 'dissolvedOxygen100_300', 'dissolvedOxygen300_400', 'dissolvedOxygen100_500', 'dissolvedOxygenMaxDepth',
    'meridionalCurrentSurface', 'meridionalCurrent100_300', 'meridionalCurrent300_400', 'meridionalCurrent100_500', 'meridionalCurrentMaxDepth',
    'zonalCurrentSurface', 'zonalCurrent100_300', 'zonalCurrent300_400', 'zonalCurrent100_500', 'zonalCurrentMaxDepth',
    'chlorophyll', 'euphoticDepth', 'secchiDiskDepth', 'wave_Height',
    'nitrateSurface', 'nitrate100_300', 'nitrate300_400', 'nitrate100_500', 'nitrateMaxDepth',
    'phosphateSurface', 'phosphate100_300', 'phosphate300_400', 'phosphate100_500', 'phosphateMaxDepth',
]


def _most_frequent(values):
    """Return the most frequent non-null value of a Series, or NaN if it has none."""
    counts = values.value_counts()
    # value_counts() drops nulls, so an all-NaN group yields an empty result.
    return counts.index[0] if len(counts) else np.nan


def _summarise_panel(panel, temporal, static,
                     keys=('Center Lat', 'Center Long', 'Overall Probability'),
                     group_col='obs_id'):
    """Collapse the panel to one row per `group_col` value.

    Parameters
    ----------
    panel : pandas.DataFrame
        Long-format table containing `group_col` and the columns to summarise.
    temporal : list of str
        Columns expanded into `<col>_mean`, `<col>_std`, `<col>_min`, `<col>_max`.
    static : list of str
        Columns reduced to their most frequent value within each group.
    keys : iterable of str
        Further columns reduced the same way as `static`.
    group_col : str
        Column whose values define the groups.

    Returns
    -------
    pandas.DataFrame
        Indexed by `group_col`; output columns follow the column order of `panel`.
        An empty DataFrame is returned when no column of `panel` is selected.
    """
    grouped = panel.groupby(group_col)  # build the groupby once and reuse it
    mode_cols = set(static) | set(keys)
    features = {}
    for c in panel.columns:
        if c in temporal:
            for stat in ('mean', 'std', 'min', 'max'):
                features[c + '_' + stat] = grouped[c].agg(stat)
        if c in mode_cols:
            features[c] = grouped[c].agg(_most_frequent)
    if not features:
        return pd.DataFrame()
    # A single concat avoids growing the frame one column at a time.
    return pd.concat(features, axis=1)


### Create a new dataset with the summary values of each observation (obs_id).
### Each observation spans 120 monthly rows in this dataset.
data = _summarise_panel(data_panel, temporal_cols, static_cols)

In [5]:
# Report the size of the aggregated table, then preview its first rows.
n_rows, n_cols = data.shape
print(n_rows, 'rows and ', n_cols, 'columns')
data.head()

596 rows and  166 columns


Unnamed: 0_level_0,Center Lat,Center Long,Overall Probability,temperatureSurface_mean,temperatureSurface_std,temperatureSurface_min,temperatureSurface_max,temperature100_300_mean,temperature100_300_std,temperature100_300_min,temperature100_300_max,temperature300_400_mean,temperature300_400_std,temperature300_400_min,temperature300_400_max,temperature100_500_mean,temperature100_500_std,temperature100_500_min,temperature100_500_max,temperatureMaxDepth_mean,temperatureMaxDepth_std,temperatureMaxDepth_min,temperatureMaxDepth_max,salinitySurface_mean,salinitySurface_std,salinitySurface_min,salinitySurface_max,salinity100_300_mean,salinity100_300_std,salinity100_300_min,salinity100_300_max,salinity300_400_mean,salinity300_400_std,salinity300_400_min,salinity300_400_max,salinity100_500_mean,salinity100_500_std,salinity100_500_min,salinity100_500_max,salinityMaxDepth_mean,salinityMaxDepth_std,salinityMaxDepth_min,salinityMaxDepth_max,dissolvedOxygenSurface_mean,dissolvedOxygenSurface_std,dissolvedOxygenSurface_min,dissolvedOxygenSurface_max,dissolvedOxygen100_300_mean,dissolvedOxygen100_300_std,dissolvedOxygen100_300_min,dissolvedOxygen100_300_max,dissolvedOxygen300_400_mean,dissolvedOxygen300_400_std,dissolvedOxygen300_400_min,dissolvedOxygen300_400_max,dissolvedOxygen100_500_mean,dissolvedOxygen100_500_std,dissolvedOxygen100_500_min,dissolvedOxygen100_500_max,dissolvedOxygenMaxDepth_mean,dissolvedOxygenMaxDepth_std,dissolvedOxygenMaxDepth_min,dissolvedOxygenMaxDepth_max,meridionalCurrentSurface_mean,meridionalCurrentSurface_std,meridionalCurrentSurface_min,meridionalCurrentSurface_max,meridionalCurrent100_300_mean,meridionalCurrent100_300_std,meridionalCurrent100_300_min,meridionalCurrent100_300_max,meridionalCurrent300_400_mean,meridionalCurrent300_400_std,meridionalCurrent300_400_min,meridionalCurrent300_400_max,meridionalCurrent100_500_mean,meridionalCurrent100_500_std,meridionalCurrent100_500_min,meridionalCurrent100_500_max,meridionalCurrentMaxDepth_mean,meridionalCu
rrentMaxDepth_std,meridionalCurrentMaxDepth_min,meridionalCurrentMaxDepth_max,zonalCurrentSurface_mean,zonalCurrentSurface_std,zonalCurrentSurface_min,zonalCurrentSurface_max,zonalCurrent100_300_mean,zonalCurrent100_300_std,zonalCurrent100_300_min,zonalCurrent100_300_max,zonalCurrent300_400_mean,zonalCurrent300_400_std,zonalCurrent300_400_min,zonalCurrent300_400_max,zonalCurrent100_500_mean,zonalCurrent100_500_std,zonalCurrent100_500_min,zonalCurrent100_500_max,zonalCurrentMaxDepth_mean,zonalCurrentMaxDepth_std,zonalCurrentMaxDepth_min,zonalCurrentMaxDepth_max,chlorophyll_mean,chlorophyll_std,chlorophyll_min,chlorophyll_max,euphoticDepth_mean,euphoticDepth_std,euphoticDepth_min,euphoticDepth_max,secchiDiskDepth_mean,secchiDiskDepth_std,secchiDiskDepth_min,secchiDiskDepth_max,wave_Height_mean,wave_Height_std,wave_Height_min,wave_Height_max,nitrateSurface_mean,nitrateSurface_std,nitrateSurface_min,nitrateSurface_max,nitrate100_300_mean,nitrate100_300_std,nitrate100_300_min,nitrate100_300_max,nitrate300_400_mean,nitrate300_400_std,nitrate300_400_min,nitrate300_400_max,nitrate100_500_mean,nitrate100_500_std,nitrate100_500_min,nitrate100_500_max,nitrateMaxDepth_mean,nitrateMaxDepth_std,nitrateMaxDepth_min,nitrateMaxDepth_max,phosphateSurface_mean,phosphateSurface_std,phosphateSurface_min,phosphateSurface_max,phosphate100_300_mean,phosphate100_300_std,phosphate100_300_min,phosphate100_300_max,phosphate300_400_mean,phosphate300_400_std,phosphate300_400_min,phosphate300_400_max,phosphate100_500_mean,phosphate100_500_std,phosphate100_500_min,phosphate100_500_max,phosphateMaxDepth_mean,phosphateMaxDepth_std,phosphateMaxDepth_min,phosphateMaxDepth_max,distanceToCoast,majorRiverDistance,majorRiversScale,bathymetry,substrateType,substrateOrigHabitat,substrateBiozone
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1
0,30.75,17.75,0.12,22.37363,4.222837,16.102207,29.556072,22.287897,4.202942,16.087679,29.44944,22.287897,4.202942,16.087679,29.44944,22.287897,4.202942,16.087679,29.44944,22.287897,4.202942,16.087679,29.44944,38.681165,0.193579,38.211845,39.05769,38.677997,0.193295,38.20566,39.05649,38.677997,0.193295,38.20566,39.05649,38.677997,0.193295,38.20566,39.05649,38.677997,0.193295,38.20566,39.05649,203.783999,13.415763,180.52182,224.3134,203.82899,13.414985,180.52187,224.31345,203.82899,13.414985,180.52187,224.31345,203.82899,13.414985,180.52187,224.31345,203.82899,13.414985,180.52187,224.31345,-0.007706,0.012578,-0.044677,0.023481,-0.004464,0.012915,-0.044415,0.026496,-0.004464,0.012915,-0.044415,0.026496,-0.004464,0.012915,-0.044415,0.026496,-0.00379,0.012536,-0.044415,0.026496,0.001265,0.013089,-0.037671,0.038564,0.006029,0.009985,-0.017658,0.037975,0.006029,0.009985,-0.017658,0.037975,0.006029,0.009985,-0.017658,0.037975,0.005456,0.00976,-0.017658,0.037975,0.076478,0.05448,0.01169,0.464525,-0.846714,0.177943,-1.320434,-0.271922,17.455139,4.634243,10.067203,36.67319,0.75474,0.467376,0.173578,3.310315,0.008315,0.002041,0.005082,0.014681,0.008181,0.001966,0.005043,0.01468,0.008181,0.001966,0.005043,0.01468,0.008181,0.001966,0.005043,0.01468,0.008181,0.001966,0.005043,0.01468,0.004724,0.000988,0.002674,0.007043,0.004721,0.000988,0.002674,0.007116,0.004721,0.000988,0.002674,0.007116,0.004721,0.000988,0.002674,0.007116,0.004721,0.000988,0.002674,0.007116,18.893757,1105.000196,8,-16.010401,Sand,A5.46: Mediterranean biocoenosis of coastal de...,Circalittoral
1,30.25,18.75,0.08,22.510767,4.073965,16.388252,29.284786,22.427538,4.06259,16.36903,29.169163,22.427538,4.06259,16.36903,29.169163,22.427538,4.06259,16.36903,29.169163,22.427538,4.06259,16.36903,29.169163,38.713629,0.175768,38.290092,39.071777,38.710803,0.175711,38.289948,39.069096,38.710803,0.175711,38.289948,39.069096,38.710803,0.175711,38.289948,39.069096,38.710803,0.175711,38.289948,39.069096,203.30919,13.489414,180.72363,225.01564,203.323303,13.484285,180.72366,225.01585,203.323303,13.484285,180.72366,225.01585,203.323303,13.484285,180.72366,225.01585,203.323303,13.484285,180.72366,225.01585,-0.012922,0.014121,-0.066863,0.019405,-0.006547,0.012257,-0.046272,0.016147,-0.006547,0.012257,-0.046272,0.016147,-0.006547,0.012257,-0.046272,0.016147,-0.00582,0.011904,-0.046272,0.016147,0.004533,0.011637,-0.021759,0.040527,0.009059,0.009949,-0.013653,0.039754,0.009059,0.009949,-0.013653,0.039754,0.009059,0.009949,-0.013653,0.039754,0.008436,0.009847,-0.013653,0.039754,0.091057,0.077141,0.013951,0.692312,-0.772313,0.165006,-1.105393,-0.295748,26.143833,3.560518,17.737255,33.299313,0.682463,0.390534,0.268236,2.882378,0.007701,0.002221,0.004914,0.023072,0.013837,0.069619,0.004913,0.7699,0.013837,0.069619,0.004913,0.7699,0.013837,0.069619,0.004913,0.7699,0.013837,0.069619,0.004913,0.7699,0.004749,0.000837,0.00296,0.006656,0.005081,0.003819,0.00296,0.045562,0.005081,0.003819,0.00296,0.045562,0.005081,0.003819,0.00296,0.045562,0.005081,0.003819,0.00296,0.045562,12.91371,1140.79994,8,-5.39839,Sand,A5.46: Mediterranean biocoenosis of coastal de...,Circalittoral
2,30.75,18.75,0.09,22.382202,3.693336,16.880266,28.372719,15.951561,0.604374,14.873776,18.379402,15.951561,0.604374,14.873776,18.379402,15.951561,0.604374,14.873776,18.379402,15.951561,0.604374,14.873776,18.379402,38.591174,0.138594,38.254765,38.919537,38.666155,0.072253,38.47463,38.84058,38.666155,0.072253,38.47463,38.84058,38.666155,0.072253,38.47463,38.84058,38.666155,0.072253,38.47463,38.84058,201.188264,11.200119,183.4743,219.88947,205.986379,2.339237,201.1537,212.77328,205.986379,2.339237,201.1537,212.77328,205.986379,2.339237,201.1537,212.77328,205.986379,2.339237,201.1537,212.77328,-0.007238,0.041032,-0.119412,0.09181,-0.000512,0.00306,-0.010765,0.007801,-0.000512,0.00306,-0.010765,0.007801,-0.000512,0.00306,-0.010765,0.007801,-0.00047,0.002908,-0.010765,0.007801,-0.040264,0.057624,-0.179169,0.130611,-0.004074,0.010641,-0.020803,0.036925,-0.004074,0.010641,-0.020803,0.036925,-0.004074,0.010641,-0.020803,0.036925,-0.004464,0.010036,-0.020803,0.036925,0.058451,0.027239,0.031163,0.124088,-0.962889,0.189569,-1.251459,-0.551882,29.6389,5.199037,20.750984,38.36594,1.006145,0.64356,0.251121,4.276607,0.014393,0.012767,0.003338,0.074606,1.55624,0.539029,0.319594,2.559051,1.55624,0.539029,0.319594,2.559051,1.55624,0.539029,0.319594,2.559051,1.55624,0.539029,0.319594,2.559051,0.006078,0.001003,0.004259,0.008713,0.093676,0.027353,0.024012,0.131216,0.093676,0.027353,0.024012,0.131216,0.093676,0.027353,0.024012,0.131216,0.093676,0.027353,0.024012,0.131216,31.365613,1085.905121,8,-71.006104,Muddy sand,A5.23: Infralittoral fine sands,Infralittoral
3,30.25,19.25,0.14,22.485016,4.05029,16.414967,29.233171,22.335924,4.02866,16.391079,29.110983,22.335924,4.02866,16.391079,29.110983,22.335924,4.02866,16.391079,29.110983,22.335924,4.02866,16.391079,29.110983,38.73844,0.17409,38.314053,39.092037,38.731926,0.175088,38.31379,39.090054,38.731926,0.175088,38.31379,39.090054,38.731926,0.175088,38.31379,39.090054,38.731926,0.175088,38.31379,39.090054,200.799729,13.39906,180.32677,224.0787,200.990596,13.440554,180.32687,224.07881,200.990596,13.440554,180.32687,224.07881,200.990596,13.440554,180.32687,224.07881,200.990596,13.440554,180.32687,224.07881,-0.004965,0.011653,-0.035522,0.025042,0.00794,0.008241,-0.012016,0.038771,0.00794,0.008241,-0.012016,0.038771,0.00794,0.008241,-0.012016,0.038771,0.007571,0.008278,-0.012016,0.038771,0.003256,0.023219,-0.042576,0.080796,0.007635,0.011977,-0.01574,0.047922,0.007635,0.011977,-0.01574,0.047922,0.007635,0.011977,-0.01574,0.047922,0.006999,0.011681,-0.01574,0.047922,0.11253,0.061684,0.023472,0.346663,-0.581591,0.14856,-0.876313,-0.109884,28.995374,4.572782,21.569519,37.56267,0.805212,0.526814,0.22753,3.27776,0.006719,0.001758,0.004136,0.011262,0.006272,0.001526,0.002682,0.011261,0.006272,0.001526,0.002682,0.011261,0.006272,0.001526,0.002682,0.011261,0.006272,0.001526,0.002682,0.011261,0.004522,0.000833,0.002802,0.00648,0.004516,0.000834,0.002802,0.00648,0.004516,0.000834,0.002802,0.00648,0.004516,0.000834,0.002802,0.00648,0.004516,0.000834,0.002802,0.00648,3.049505,1103.186667,7,-7.01648,Sand,A5.46: Mediterranean biocoenosis of coastal de...,Circalittoral
4,30.25,19.75,0.19,22.514297,3.894756,16.771269,28.98427,22.218005,3.824653,16.655327,28.7581,22.218005,3.824653,16.655327,28.7581,22.218005,3.824653,16.655327,28.7581,22.218005,3.824653,16.655327,28.7581,38.715601,0.162548,38.331936,39.04503,38.699159,0.16332,38.330425,39.03519,38.699159,0.16332,38.330425,39.03519,38.699159,0.16332,38.330425,39.03519,38.699159,0.16332,38.330425,39.03519,200.714299,13.145771,180.49303,222.29796,201.763006,13.095796,180.49304,222.29684,201.763006,13.095796,180.49304,222.29684,201.763006,13.095796,180.49304,222.29684,201.763006,13.095796,180.49304,222.29684,-0.009709,0.016862,-0.060319,0.036464,0.006801,0.009626,-0.013287,0.037051,0.006801,0.009626,-0.013287,0.037051,0.006801,0.009626,-0.013287,0.037051,0.006443,0.009483,-0.013287,0.037051,-0.000942,0.011394,-0.04032,0.031246,0.004769,0.007648,-0.012088,0.030772,0.004769,0.007648,-0.012088,0.030772,0.004769,0.007648,-0.012088,0.030772,0.004482,0.007474,-0.012088,0.030772,0.162442,0.151359,0.024053,1.339796,-0.738898,0.119151,-0.988317,-0.25283,18.330527,2.179457,12.465387,23.132456,0.813227,0.610601,0.185879,4.013224,0.008634,0.006637,0.00215,0.045462,0.006595,0.006482,0.001454,0.045459,0.006595,0.006482,0.001454,0.045459,0.006595,0.006482,0.001454,0.045459,0.006595,0.006482,0.001454,0.045459,0.004746,0.000834,0.002812,0.006418,0.004655,0.000799,0.002812,0.00684,0.004655,0.000799,0.002812,0.00684,0.004655,0.000799,0.002812,0.00684,0.004655,0.000799,0.002812,0.00684,22.506086,1055.413182,7,-6.41378,Muddy sand,A5.23: Infralittoral fine sands,Infralittoral


### Encode Categorical Features

In [6]:
# Replace each object-dtype column with the integer codes produced by a
# LabelEncoder fitted on that same column.
object_cols = [name for name in data.columns if data[name].dtype == 'object']
for name in object_cols:
    lbl = LabelEncoder()
    data[name] = lbl.fit_transform(list(data[name].values))

## Train Models

In [7]:
# Cross-validation configuration
SEED = 2019  # for reproducibility
NFOLDS = 5   # number of folds
ntrain = len(data)  # number of rows in the aggregated table
# Shuffled K-fold splitter shared by the evaluation function below
folds = KFold(n_splits=NFOLDS, random_state=SEED, shuffle=True)

# Evaluation function: cross-validated Root Mean Square Error
def cv_rmse(model, X, y):
    """Return the RMSE of `model` on each cross-validation fold.

    The model is scored with scikit-learn's ``cross_val_score`` using the
    notebook-level ``folds`` splitter and the "neg_mean_squared_error" scorer;
    the scores are negated and square-rooted to give one RMSE per fold.

    Parameters
    ----------
    model : estimator passed to ``cross_val_score``
    X : feature matrix
    y : target vector

    Returns
    -------
    numpy.ndarray
        One RMSE value per fold.
    """
    neg_mse = cross_val_score(model, X, y, cv=folds, scoring="neg_mean_squared_error")
    return np.sqrt(-neg_mse)

In [8]:
# Columns that must not be used as predictors.
# NOTE: in the cells shown here 'obs_id' is the index of `data`, not a column,
# so in practice only the target is removed by this filter.
cols_to_exclude = ['obs_id','Overall Probability']
df_train_columns = [c for c in data.columns if c not in cols_to_exclude]

# Target as a 1-D numpy array. `.values` is used instead of Series.ravel(),
# which newer pandas releases deprecate; the resulting array is the same.
y_train = data['Overall Probability'].values
# Predictors as a 2-D numpy array (converts the dataframe to a numpy array).
x_train = data[df_train_columns].values

### LightGBM

In [9]:
# Hyper-parameters of the LightGBM regressor: 4-leaf trees, learning rate 0.01,
# 5000 boosting rounds, with seeded row bagging and feature sub-sampling.
lgbm_params = dict(
    objective='regression',
    num_leaves=4,
    learning_rate=0.01,
    n_estimators=5000,
    max_bin=200,
    bagging_fraction=0.75,
    bagging_freq=5,
    bagging_seed=7,
    feature_fraction=0.2,
    feature_fraction_seed=7,
    verbose=-1,
)
lightgbm = LGBMRegressor(**lgbm_params)

# Cross-validated RMSE: prints the mean over the folds, with the standard deviation in parentheses.
score = cv_rmse(lightgbm, x_train, y_train)
print("lightgbm: {:.4f} ({:.4f})".format(score.mean(), score.std()) )

lightgbm: 0.1377 (0.0206)
