In [1]:
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

In [2]:
# Load the data
data = pd.read_csv('train.csv')

In [3]:
data.columns[164:170]

Index(['X4_mean', 'X11_mean', 'X18_mean', 'X26_mean', 'X50_mean',
       'X3112_mean'],
      dtype='object')

In [4]:
y = data[data.columns[164:170]]
X = data.drop(data.columns[164:170], axis=1)

In [5]:
# Features
worldclim = data.columns[1:7]
soil = data.columns[7:68]
modis = data.columns[68:128]
vod = data.columns[128:164]

Adatelőkészítést végzek, logn-t alkalmazok a targeten és a hatványeloszlású featuren, standardizálom a többi featuret.

In [6]:
# Initialize StandardScaler
scaler = StandardScaler()

In [7]:
# Columns to standardize
columns_to_standardize = pd.concat([X[worldclim], X[soil], X[modis], X[vod]], axis=1).columns

In [8]:
# Standardize the selected columns
X[columns_to_standardize] = scaler.fit_transform(X[columns_to_standardize])

In [12]:
# Min value of X[modis]
epsilon = -X[modis].min().min() + 1e-6

In [13]:
# Apply natural logarithm to the selected columns
X[modis] = X[modis].apply(lambda x: np.log(x + epsilon))

In [15]:
y = scaler.fit_transform(y)

In [16]:
# Split your data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Előbb megnézem, a teljes featurekészlettel milyen eredményeket érek el.

In [17]:
# GBM model for each feature set
n_estimators = 1000
max_depth = 6
learning_rate = 0.1
r_state = 42

gbm = MultiOutputRegressor(estimator=XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate, random_state=r_state))
worldclim_gbm = MultiOutputRegressor(estimator=XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate, random_state=r_state))
soil_gbm = MultiOutputRegressor(estimator=XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate, random_state=r_state))
modis_gbm = MultiOutputRegressor(estimator=XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate, random_state=r_state))
vod_gbm = MultiOutputRegressor(estimator=XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate, random_state=r_state))

In [18]:
gbm.fit(X_train, y_train)

In [19]:
pred_np = gbm.predict(X_test)

In [20]:
r2_score(y_test, pred_np)

-0.046782875504263154

In [21]:
pred_df = pd.DataFrame(pred_np, columns=data.columns[164:170])
y_df = pd.DataFrame(y_test, columns=data.columns[164:170])

In [22]:
for column in data.columns[164:170]:
    print(f"{column}: r2:{r2_score(y_df[column], pred_df[column])} mae:{mean_absolute_error(y_df[column], pred_df[column])}")

X4_mean: r2:0.3604752728221211 mae:0.5035948838822056
X11_mean: r2:-0.015139801548811338 mae:0.014728073132662983
X18_mean: r2:0.16559743028992846 mae:0.02503340806938166
X26_mean: r2:0.042630240250361995 mae:0.021760748248877203
X50_mean: r2:-0.8341548590188652 mae:0.02504944026363116
X3112_mean: r2:-0.00010553582033101527 mae:0.021371089729391418


Mivel az r2_score elég gyenge, ezért azt a következtetést vonom le, hogy a tabuláris adatok önmagukban nem segítik a modellt a döntésben. Nem szedem szét külön modellekre.

A gyorsabb tanulás érdekében a featurekészleten szűrést alkalmazok. Az adatelőkészítés során megfigyelhettünk korreláló csoportokat, ezek alapján keresem azt a legkissebb csoportot, aminek a legkevésbé csökken a variánciája.

In [23]:
from sklearn.feature_selection import SelectKBest, SelectPercentile, f_regression

In [39]:
y = pd.DataFrame(y, columns=data.columns[164:170])

In [79]:
worldclim_new = SelectKBest(f_regression, k=4).fit_transform(X[worldclim], y['X4_mean'])
soil_new = SelectKBest(f_regression, k=30).fit_transform(X[soil], y['X4_mean'])
modis_new = SelectKBest(f_regression, k=30).fit_transform(X[modis], y['X4_mean'])
vod_new = SelectKBest(f_regression, k=18).fit_transform(X[vod], y['X4_mean'])

In [86]:
X_new = pd.concat([pd.DataFrame(worldclim_new,columns=range(4)),
                   pd.DataFrame(soil_new,columns=range(4,34)),
                   pd.DataFrame(modis_new,columns=range(34,64)), 
                   pd.DataFrame(vod_new,columns=range(64,82)),],
                  axis=1)

In [87]:
# Split your data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42)

In [None]:
gbm.fit(X_train, y_train)

In [None]:
pred_np = gbm.predict(X_test)

In [None]:
pred_df = pd.DataFrame(pred_np, columns=data.columns[164:170])
y_df = pd.DataFrame(y_test, columns=data.columns[164:170])

In [None]:
for column in data.columns[164:170]:
    print(f"{column}: r2:{r2_score(y_df[column], pred_df[column])} mae:{mean_absolute_error(y_df[column], pred_df[column])}")