In [1]:
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

In [2]:
# Load the data
data = pd.read_csv('train.csv')

In [3]:
data.columns[164:170]

Index(['X4_mean', 'X11_mean', 'X18_mean', 'X26_mean', 'X50_mean',
       'X3112_mean'],
      dtype='object')

In [4]:
y = data[data.columns[164:170]]
X = data.drop(data.columns[164:170], axis=1)

In [5]:
# Features
worldclim = data.columns[0:7]
soil = data.columns[7:68]
modis = data.columns[68:128]
vod = data.columns[128:164]

Adatelőkészítést végzek, logn-t alkalmazok a targeten és a hatványeloszlású featuren, standardizálom a többi featuret.

In [6]:
# Initialize StandardScaler
scaler = StandardScaler()

In [7]:
# Columns to standardize
columns_to_standardize = pd.concat([X[worldclim], X[soil], X[modis], X[vod]], axis=1).columns

In [8]:
# Standardize the selected columns
X[columns_to_standardize] = scaler.fit_transform(X[columns_to_standardize])

In [12]:
# Min value of X[modis]
epsilon = -X[modis].min().min() + 1e-6

In [13]:
# Apply natural logarithm to the selected columns
X[modis] = X[modis].apply(lambda x: np.log(x + epsilon))

In [15]:
y = scaler.fit_transform(y)

In [16]:
# Split your data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Előbb megnézem, a teljes featurekészlettel milyen eredményeket érek el.

In [17]:
# GBM model for each feature set
n_estimators = 1000
max_depth = 6
learning_rate = 0.1
r_state = 42

gbm = MultiOutputRegressor(estimator=XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate, random_state=r_state))
worldclim_gbm = MultiOutputRegressor(estimator=XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate, random_state=r_state))
soil_gbm = MultiOutputRegressor(estimator=XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate, random_state=r_state))
modis_gbm = MultiOutputRegressor(estimator=XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate, random_state=r_state))
vod_gbm = MultiOutputRegressor(estimator=XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate, random_state=r_state))

In [18]:
gbm.fit(X_train, y_train)

In [19]:
pred_np = gbm.predict(X_test)

In [20]:
r2_score(y_test, pred_np)

-0.046782875504263154

In [21]:
pred_df = pd.DataFrame(pred_np, columns=data.columns[164:170])
y_df = pd.DataFrame(y_test, columns=data.columns[164:170])

In [22]:
for column in data.columns[164:170]:
    print(f"{column}: r2:{r2_score(y_df[column], pred_df[column])} mae:{mean_absolute_error(y_df[column], pred_df[column])}")

X4_mean: r2:0.3604752728221211 mae:0.5035948838822056
X11_mean: r2:-0.015139801548811338 mae:0.014728073132662983
X18_mean: r2:0.16559743028992846 mae:0.02503340806938166
X26_mean: r2:0.042630240250361995 mae:0.021760748248877203
X50_mean: r2:-0.8341548590188652 mae:0.02504944026363116
X3112_mean: r2:-0.00010553582033101527 mae:0.021371089729391418


Mivel az r2_score elég gyenge, ezért azt a következtetést vonom le, hogy a tabuláris adatok önmagukban nem segítik a modellt a döntésben. Nem szedem szét külön modellekre.

A gyorsabb tanulás érdekében a featurekészleten szűrést alkalmazok. Az adatelőkészítés során megfigyelhettünk korreláló csoportokat, ezek alapján keresem azt a legkissebb csoportot, aminek a legkevésbé csökken a variánciája.

In [23]:
from sklearn.feature_selection import SelectKBest, f_regression

In [39]:
y = pd.DataFrame(y, columns=data.columns[164:170])

In [57]:
worldclim_new = SelectKBest(f_regression, k=7).fit_transform(X[worldclim], y['X4_mean'])
soil_new = SelectKBest(f_regression, k=61).fit_transform(X[soil], y['X4_mean'])
modis_new = SelectKBest(f_regression, k=60).fit_transform(X[modis], y['X4_mean'])
vod_new = SelectKBest(f_regression, k=36).fit_transform(X[vod], y['X4_mean'])

In [58]:
X_new = pd.concat([pd.DataFrame(worldclim_new,columns=range(7)),
                   pd.DataFrame(soil_new,columns=range(7,68)),
                   pd.DataFrame(modis_new,columns=range(68,128)), 
                   pd.DataFrame(vod_new,columns=range(128,164))],
                  axis=1)

In [64]:
X_new.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,154,155,156,157,158,159,160,161,162,163
0,0.503259,-0.308725,-0.889802,-0.554512,0.490021,0.805016,0.879735,0.460272,0.579818,0.487341,...,0.638743,0.927605,1.040217,0.817117,0.347126,0.213268,0.384477,0.644727,0.544614,0.236286
1,0.589235,0.355134,-1.259414,-0.936992,-0.583946,1.075379,1.61538,0.403478,0.248144,0.612104,...,-0.768723,-0.881663,-0.942067,-1.069347,-1.0222,-1.020704,-0.940234,-0.787698,-0.648572,-0.492379
2,0.616069,-0.042541,-0.203696,-0.648946,-1.225982,-0.41291,-0.264452,-0.562008,-0.481537,-0.57314,...,0.488973,0.413616,0.405373,0.406568,0.344944,0.294744,0.386658,0.516051,0.623689,0.651859
3,0.593799,0.541083,0.539961,0.182802,-0.369732,-0.434778,-0.523488,0.176305,-0.614206,-0.198852,...,-0.321266,-0.438934,-0.559986,-0.635205,-0.534976,-0.460638,-0.527141,-0.260902,-0.219068,-0.193552
4,0.582124,-1.833272,-0.687428,-0.64004,-0.583637,2.541933,2.258165,-1.470702,0.380814,-0.510759,...,-0.128926,0.260018,0.553307,1.199778,1.516921,1.560907,1.258588,0.767458,0.616531,0.594171


In [65]:
X.head()

Unnamed: 0,id,WORLDCLIM_BIO1_annual_mean_temperature,WORLDCLIM_BIO12_annual_precipitation,WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month,WORLDCLIM_BIO15_precipitation_seasonality,WORLDCLIM_BIO4_temperature_seasonality,WORLDCLIM_BIO7_temperature_annual_range,SOIL_bdod_0.5cm_mean_0.01_deg,SOIL_bdod_100.200cm_mean_0.01_deg,SOIL_bdod_15.30cm_mean_0.01_deg,...,VOD_X_1997_2018_multiyear_mean_m09,VOD_X_1997_2018_multiyear_mean_m10,VOD_X_1997_2018_multiyear_mean_m11,VOD_X_1997_2018_multiyear_mean_m12,X4_sd,X11_sd,X18_sd,X26_sd,X50_sd,X3112_sd
0,0.503259,-0.308725,-0.889802,-0.554512,0.490021,0.805016,0.879735,0.460272,0.579818,0.487341,...,0.384477,0.644727,0.544614,0.236286,0.008921,1.601473,0.025441,0.153608,0.27961,15.045054
1,0.589235,0.355134,-1.259414,-0.936992,-0.583946,1.075379,1.61538,0.403478,0.248144,0.612104,...,-0.940234,-0.787698,-0.648572,-0.492379,0.003102,0.258078,0.000866,0.03463,0.010165,11.004477
2,0.616069,-0.042541,-0.203696,-0.648946,-1.225982,-0.41291,-0.264452,-0.562008,-0.481537,-0.57314,...,0.386658,0.516051,0.623689,0.651859,,,,,,
3,0.593799,0.541083,0.539961,0.182802,-0.369732,-0.434778,-0.523488,0.176305,-0.614206,-0.198852,...,-0.527141,-0.260902,-0.219068,-0.193552,0.011692,2.818356,0.110673,0.011334,0.229224,141.857187
4,0.582124,-1.833272,-0.687428,-0.64004,-0.583637,2.541933,2.258165,-1.470702,0.380814,-0.510759,...,1.258588,0.767458,0.616531,0.594171,0.006157,1.128,0.026996,0.553815,0.107092,87.146899


In [59]:
# Split your data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42)

In [60]:
gbm.fit(X_train, y_train)

In [61]:
pred_np = gbm.predict(X_test)

In [62]:
pred_df = pd.DataFrame(pred_np, columns=data.columns[164:170])
y_df = pd.DataFrame(y_test, columns=data.columns[164:170])

In [63]:
for column in data.columns[164:170]:
    print(f"{column}: r2:{r2_score(y_df[column], pred_df[column])} mae:{mean_absolute_error(y_df[column], pred_df[column])}")

X4_mean: r2:0.13488430885642344 mae:0.6226767094669617
X11_mean: r2:-0.011394051942718253 mae:0.014277973284062623
X18_mean: r2:-0.00018018278313824965 mae:0.021320166292782237
X26_mean: r2:0.04074184888080401 mae:0.028588588043582403
X50_mean: r2:-0.8187153988407851 mae:0.02488598013741797
X3112_mean: r2:-0.00010571818082438789 mae:0.021377252608233657
