In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
train = pd.read_csv("/kaggle/input/playground-series-s4e5/train.csv")
test = pd.read_csv("/kaggle/input/playground-series-s4e5/test.csv")
df_original = pd.read_csv("/kaggle/input/flood-prediction-factors/flood.csv")

In [3]:
train = train.drop("id", axis = 1)
test = test.drop("id", axis = 1)

In [4]:
df_original.shape, train.shape

((50000, 21), (1117957, 21))

In [5]:
target = pd.concat([train, df_original])
target.shape

(1167957, 21)

In [6]:
X = target.drop("FloodProbability", axis = 1)
y = target.FloodProbability

In [7]:
def feature_engineering(df):
    df["sum"] = df.sum(axis = 1)
    df["mean"] = df.mean(axis = 1)
    df["std"] = df.std(axis = 1)
    df["median"] = df.median(axis = 1)

feature_engineering(X)
feature_engineering(test)

In [8]:
c_after_init_fe = list(X.columns)
c_after_init_fe

['MonsoonIntensity',
 'TopographyDrainage',
 'RiverManagement',
 'Deforestation',
 'Urbanization',
 'ClimateChange',
 'DamsQuality',
 'Siltation',
 'AgriculturalPractices',
 'Encroachments',
 'IneffectiveDisasterPreparedness',
 'DrainageSystems',
 'CoastalVulnerability',
 'Landslides',
 'Watersheds',
 'DeterioratingInfrastructure',
 'PopulationScore',
 'WetlandLoss',
 'InadequatePlanning',
 'PoliticalFactors',
 'sum',
 'mean',
 'std',
 'median']

## create mean of each feature and calculate variance

In [9]:
all_means_train = X.mean()
all_means_test = test.mean()

In [10]:
def fe_mean_var(df, columns, all_means):
    for column in columns:
        df[f"mean_var_{column}"] = df[column] - all_means[column]

In [11]:
fe_mean_var(X, c_after_init_fe, all_means_train)
fe_mean_var(test, c_after_init_fe, all_means_test)

In [12]:
X.head()

Unnamed: 0,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,Encroachments,...,mean_var_Watersheds,mean_var_DeterioratingInfrastructure,mean_var_PopulationScore,mean_var_WetlandLoss,mean_var_InadequatePlanning,mean_var_PoliticalFactors,mean_var_sum,mean_var_mean,mean_var_std,mean_var_median
0,5,8,5,8,6,4,4,3,3,4,...,0.068794,-0.928574,2.070021,0.046818,2.057111,-1.941209,-4.837194,-0.460685,-1.004922,-0.164075
1,6,7,4,4,8,8,3,5,4,6,...,-1.931206,0.071426,-1.929979,-1.953182,-0.942889,-1.941209,-4.837194,-0.460685,-0.95261,-0.164075
2,6,5,6,7,3,7,1,5,4,5,...,0.068794,1.071426,3.070021,-2.953182,-1.942889,-1.941209,0.162806,0.015505,0.018071,0.835925
3,3,4,6,5,4,8,4,7,6,8,...,-0.931206,-0.928574,1.070021,0.046818,2.057111,0.058791,5.162806,0.491696,1.003319,-0.164075
4,5,3,2,6,4,4,3,3,3,3,...,1.068794,-0.928574,-3.929979,-2.953182,-1.942889,0.058791,-26.837194,-2.555923,-5.458698,-1.164075


In [13]:
test.head()

Unnamed: 0,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,Encroachments,...,mean_var_Watersheds,mean_var_DeterioratingInfrastructure,mean_var_PopulationScore,mean_var_WetlandLoss,mean_var_InadequatePlanning,mean_var_PoliticalFactors,mean_var_sum,mean_var_mean,mean_var_std,mean_var_median
0,4,6,3,5,6,7,8,7,8,4,...,1.06928,-1.926062,1.073043,-0.948424,-0.940204,0.056082,12.201441,1.162042,2.421805,0.839577
1,4,4,2,9,5,5,4,7,5,4,...,0.06928,-3.926062,2.073043,-0.948424,-0.940204,-1.943918,-8.798559,-0.837958,-1.788559,-1.160423
2,1,3,6,5,7,2,4,6,4,2,...,0.06928,-2.926062,-1.926957,1.051576,3.059796,-1.943918,-8.798559,-0.837958,-1.752162,-0.160423
3,2,4,4,6,4,5,4,3,4,4,...,2.06928,1.073938,-0.926957,-2.948424,-0.940204,-0.943918,-6.798559,-0.647482,-1.411621,-1.160423
4,6,3,2,4,6,4,5,5,3,7,...,-0.93072,1.073938,3.073043,-0.948424,0.059796,0.056082,-6.798559,-0.647482,-1.411621,-0.160423


## ----------End of FE

In [14]:
m_sc = MinMaxScaler()
X_scaled = m_sc.fit_transform(X)
test_scaled = m_sc.transform(test)

## Training

In [15]:
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

## XGB

In [16]:
from xgboost import XGBRegressor

In [17]:
# model_XGB = XGBRegressor(n_estimators = 1000, learning_rate = 0.01, max_depth = 20, device = "gpu")
# model_XGB.fit(X_train, y_train, verbose = True)

In [18]:
# y_pred = model_XGB.predict(X_val)
# r2 = r2_score(y_val, y_pred)
# print(f"R2 : {r2}")

## XGB gpu

In [20]:
model_XGB_GPU = xgb.XGBRegressor(tree_method='gpu_hist', predictor='gpu_predictor', n_estimators=1000, learning_rate = 0.01, max_depth = 20)
model_XGB_GPU.fit(X_train, y_train)
xgb_pred = model_XGB_GPU.predict(X_val)
r2_score(y_val, xgb_pred)

KeyboardInterrupt: 

In [None]:
model_XGB_GPU.feature_importances_

In [21]:
dmat = xgb.DMatrix(X_train, y_train)
model_XGB_GPU  = xgb.train({"tree_method": "gpu_hist"}, dmat, 500)

In [22]:
model_XGB_GPU.set_param({"predictor": "gpu_predictor"})

In [23]:
test_dmat = xgb.DMatrix(X_val)
y_pred = model_XGB_GPU.predict(test_dmat)
r2 = r2_score(y_val, y_pred)
print(f"R2 : {r2}")

R2 : 0.8685730693763309


In [27]:
model_XGB_GPU.get_score(importance_type='cover')

{'f0': 65801.5390625,
 'f1': 82027.2734375,
 'f2': 87637.0,
 'f3': 78499.625,
 'f4': 93780.28125,
 'f5': 89772.3046875,
 'f6': 85755.984375,
 'f7': 98274.1875,
 'f8': 85661.2109375,
 'f9': 90844.7890625,
 'f10': 100262.3203125,
 'f11': 92810.171875,
 'f12': 93436.8984375,
 'f13': 100974.1328125,
 'f14': 92625.375,
 'f15': 96657.359375,
 'f16': 84249.0546875,
 'f17': 91965.28125,
 'f18': 98812.6953125,
 'f19': 91058.859375,
 'f20': 149659.140625,
 'f22': 154186.046875,
 'f23': 184117.015625}

## LGBM

In [174]:
model_LGBM = lgb.LGBMRegressor(n_estimators = 1000, learning_rate = 0.01, max_depth = 20, device= "gpu")
model_LGBM.fit(X_train, y_train)

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1512
[LightGBM] [Info] Number of data points in the train set: 934365, number of used features: 48
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 48 dense feature groups (42.77 MB) transferred to GPU in 0.028308 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.504274


In [176]:
y_pred = model_LGBM.predict(X_val)
r2 = r2_score(y_val, y_pred)
print(f"R2 : {r2}")

R2 : 0.8700231976400025


# ANN

In [178]:
# import torch
# from torch import nn
# from torch.nn import functional as F

# class RegressionNet(nn.Module):
#     def __init__(self, n_features):
#         super(RegressionNet, self).__init__()
#         self.hidden0 = nn.Linear(n_features, 4096)
#         self.hidden1 = nn.Linear(4096, 2048)
#         self.hidden2 = nn.Linear(2048, 1024)
#         self.hidden3 = nn.Linear(1024, 512)
#         self.hidden4 = nn.Linear(512, 264)
#         self.hidden5 = nn.Linear(264, 128)
#         self.hidden6 = nn.Linear(128, 64)
#         self.hidden7 = nn.Linear(64, 32)
#         self.output = nn.Linear(32, 1)
    
#     def forward(self, x):
        
#         x = F.leaky_relu(self.hidden0(x))
#         x = F.dropout(x, p=0.2)
        
#         x = F.leaky_relu(self.hidden1(x))
#         x = F.dropout(x, p=0.2)

#         x = F.leaky_relu(self.hidden2(x))
#         x = F.dropout(x, p=0.2)

#         x = F.leaky_relu(self.hidden3(x))
#         x = F.dropout(x, p=0.2)

#         x = F.leaky_relu(self.hidden4(x))
#         x = F.dropout(x, p=0.2)

#         x = F.leaky_relu(self.hidden5(x))
#         x = F.dropout(x, p=0.2)

#         x = F.leaky_relu(self.hidden6(x))
#         x = F.dropout(x, p=0.2)

#         x = F.leaky_relu(self.hidden7(x))
#         x = F.dropout(x, p=0.2)

#         return self.output(x)

In [179]:
# from torch.utils.data import Dataset, DataLoader

# class CustomDataset(Dataset):
#     def __init__(self, features, labels):
#         self.features = features
#         self.labels = labels
    
#     def __len__(self):
#         return len(self.features)
    
#     def __getitem__(self, idx):
#         x = self.features[idx]
#         y = self.labels[idx]
#         return x, y

In [180]:
# n_features = X.shape[1]
# n_features

48

In [182]:
# X_train_tensor = torch.from_numpy(X_train.astype(np.float32))
# y_train_tensor = torch.tensor(y_train.values.astype(np.float32))
# X_val_tensor = torch.from_numpy(X_val.astype(np.float32))

In [183]:
# dataset = CustomDataset(X_train_tensor, y_train_tensor)
# batch_size = 4096
# data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=2)

In [188]:
# device = "cuda" if torch.cuda.is_available() else "cpu"

In [189]:
# from tqdm.auto import tqdm

# model = RegressionNet(n_features)
# model.to(device)

# loss_fn = nn.MSELoss()

# optimizer = torch.optim.Adam(model.parameters())

# loss_vals = []

# # pb = tqdm(total = 50, desc = "Training")

# model.train()

# for epoch in tqdm(range(30)):
#     epoch_loss = 0
#     for batch_idx, (data, target) in enumerate(data_loader):
#         data, target = data.to(device), target.to(device)

#         # Forward pass
#         y_pred = model(data)
#         loss = loss_fn(y_pred.squeeze(), target)

#         # Backward pass and optimize
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

#         epoch_loss += loss.item()
    
#     avg_loss = epoch_loss / len(data_loader)
#     loss_vals.append(avg_loss)
#     print(f"Epoch: {epoch+1}, Loss: {avg_loss:.4f}")

  0%|          | 0/30 [00:00<?, ?it/s]

Epoch: 1, Loss: 0.0229
Epoch: 2, Loss: 0.0046
Epoch: 3, Loss: 0.0033
Epoch: 4, Loss: 0.0025
Epoch: 5, Loss: 0.0019
Epoch: 6, Loss: 0.0015
Epoch: 7, Loss: 0.0012
Epoch: 8, Loss: 0.0009
Epoch: 9, Loss: 0.0007
Epoch: 10, Loss: 0.0006
Epoch: 11, Loss: 0.0006
Epoch: 12, Loss: 0.0005
Epoch: 13, Loss: 0.0005
Epoch: 14, Loss: 0.0005
Epoch: 15, Loss: 0.0005
Epoch: 16, Loss: 0.0004
Epoch: 17, Loss: 0.0004
Epoch: 18, Loss: 0.0004
Epoch: 19, Loss: 0.0004
Epoch: 20, Loss: 0.0004
Epoch: 21, Loss: 0.0004
Epoch: 22, Loss: 0.0004
Epoch: 23, Loss: 0.0004
Epoch: 24, Loss: 0.0004
Epoch: 25, Loss: 0.0004
Epoch: 26, Loss: 0.0004
Epoch: 27, Loss: 0.0004
Epoch: 28, Loss: 0.0004
Epoch: 29, Loss: 0.0004
Epoch: 30, Loss: 0.0004


In [190]:
# model.eval()
# class CustomDataset(Dataset):
#     def __init__(self, features):
#         self.features = features
    
#     def __len__(self):
#         return len(self.features)
    
#     def __getitem__(self, idx):
#         x = self.features[idx]
#         return x


# test_dataset = CustomDataset(X_val_tensor)
# test_loader = DataLoader(test_dataset, batch_size=4096, shuffle=False, num_workers=2)

# all_predictions = []

# with torch.no_grad():
#     for data in test_loader:
#         data = data.to(device)
#         predictions = model(data)
#         all_predictions.append(predictions.cpu().numpy())  # to CPU and convert to numpy array

In [198]:
# from itertools import chain

# def flatten_list(data):
#   """
#   Flattens a list using itertools.chain.from_iterable.

#   Args:
#       data: The list to flatten.

#   Returns:
#       A new list containing the flattened elements.
#   """
#   return list(chain.from_iterable(data))

In [201]:
# temp = flatten_list(all_predictions)
# y_pred = list(chain.from_iterable(temp))

In [202]:
# r2 = r2_score(y_val, y_pred)
# print(f"R2 : {r2}")

R2 : 0.8490342906904439


In [None]:
# r2 = r2_score(y_val, all_predictions)
# print(f"R2 : {r2}")

## ENSEMBLE

In [28]:
params = {
    'tree_method': 'gpu_hist',  
    'predictor': 'gpu_predictor'
}

In [30]:
dtrain_xgb = xgb.DMatrix(X_train, label=y_train)
dtest_xgb = xgb.DMatrix(X_val, label=y_val)

# base models
xgb_model = xgb.train(params, dtrain_xgb, num_boost_round=1000)
lgb_model = lgb.LGBMRegressor(n_estimators=1000, learning_rate=0.01, max_depth=20, device='gpu')
lgb_model.fit(X_train, y_train)

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1512
[LightGBM] [Info] Number of data points in the train set: 934365, number of used features: 48
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 48 dense feature groups (42.77 MB) transferred to GPU in 0.029459 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.504274


In [31]:
# predictions for base models
xgb_pred_train = xgb_model.predict(dtrain_xgb)
xgb_pred_test = xgb_model.predict(dtest_xgb)
lgb_pred_train = lgb_model.predict(X_train)
lgb_pred_test = lgb_model.predict(X_val)



In [33]:
from sklearn.linear_model import Ridge
X_train_meta = np.column_stack((xgb_pred_train, lgb_pred_train))
X_test_meta = np.column_stack((xgb_pred_test, lgb_pred_test))

# meta-model
meta_model = Ridge()
meta_model.fit(X_train_meta, y_train)

In [34]:
stacked_pred = meta_model.predict(X_test_meta)
r2_score(y_val, stacked_pred)

0.8589512815140579

In [217]:
import lightgbm as lgb

In [222]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge

estimators = [
    ("xgb", xgb.XGBRegressor(tree_method='gpu_hist', predictor = "gpu_predictor", n_estimators=1000, learning_rate = 0.01, max_depth = 20)),
    ("lgb", lgb.LGBMRegressor(n_estimators = 1000, learning_rate = 0.01, max_depth = 20, device= "gpu"))
]

In [223]:
stacking_regressor = StackingRegressor(
    estimators=estimators,
    final_estimator=Ridge()
)

In [None]:
stacking_regressor.fit(X_train, y_train)
stacking_pred = stacking_regressor.predict(X_val)
r2_score(y_val, stacking_pred)

In [None]:
import joblib

joblib.dump(stacking_regressor, 'stacking_regressor.pkl')

## Sunmission file

In [None]:
stacking_regressor = joblib.load('stacking_regressor.pkl')

In [184]:
# y_pred = model_LGBM.predict(test_scaled)
# y_pred



array([0.70538213, 0.49700729, 0.49705553, ..., 0.70282576, 0.69647877,
       0.55870028])

In [203]:
y_pred = model_LGBM.predict(test_scaled)
y_pred



array([0.57687863, 0.45294887, 0.44947166, ..., 0.61764198, 0.54867445,
       0.52575875])

In [204]:
len(y_pred)

745305

In [186]:
test_data_id = pd.read_csv(r"/kaggle/input/playground-series-s4e5/test.csv")["id"]
submission = pd.DataFrame(list(zip(test_data_id, y_pred)), columns = ["id", "FloodProbability"])
submission.to_csv("submission.csv", index = False)