 #### <p style="background-color: #EDE7F6;color:#6600ff;display: inline-block;padding:.6rem;border-radius:.5rem">Import libraries</p>

In [30]:
# import cupy as cp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import optuna
import gc

from time import time
from joblib import load, dump
from matplotlib.ticker import MaxNLocator,MultipleLocator
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import Ridge
from sklearn.ensemble import VotingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.base import clone
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from tqdm import tqdm
# import sklearn.metrics as skm
from sklearn.metrics import mean_absolute_error

import warnings

 #### <p style="background-color: #EDE7F6;color:#6600ff;display: inline-block;padding:.6rem;border-radius:.5rem">Settings</p>

In [None]:
# Raise pandas' display limits so long / wide frames are not truncated in cell output.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 1000)
# pd.options.display.precision = 5

# Suppress every warning for the rest of the session.
warnings.filterwarnings(action='ignore')

 #### <p style="background-color: #EDE7F6;color:#6600ff;display: inline-block;padding:.6rem;border-radius:.5rem">Import data files</p>

In [31]:
# Read both competition splits from the working directory (train first, then test).
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Original feature names: every `train` column except the first and the last one.
orig_features = list(train.columns[1:-1])

train.shape, test.shape

((1117957, 22), (745305, 21))

In [32]:
# ANSI escape sequences for styling printed text.
end = "\033[0m"
bold = "\033[1m"
LightRed = "\033[91m"
LightBlue = "\033[94m"

# Colour + bold combinations.
BlueBold = f"{LightBlue}{bold}"
RedBold = f"{LightRed}{bold}"

In [33]:
# Feature matrix: every column of `train` except the first and the last one.
X = train.iloc[:,1:-1]
# Regression target.
y = train['FloodProbability']
# Seeded, shuffled 5-fold splitter. `kf` is not used by any cell visible in this notebook.
# NOTE(review): StratifiedKFold stratifies on class labels while `y` looks like a
# continuous probability — confirm how `kf.split` is called before relying on it.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

#### <p style="background-color: #EDE7F6; color: #6600ff;margin:0; display:inline-block;padding:.6rem;border-radius:.25rem;">A. Model Training</p>

In [34]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(X,y, test_size = 0.2, random_state=42)

#### <p style="background-color: #EDE7F6; color: #6600ff;margin:0; display:inline-block;padding:.6rem;border-radius:.25rem;">A1. XGBRegressor</p>

In [35]:
# Baseline model: XGBRegressor with library defaults, only the seed is fixed.
# The commented-out dict is an alternative parameter set that is currently not used.
# xgb_params = {'grow_policy': 'depthwise', 'tree_method': 'hist', 'enable_categorical': True, 'gamma': 0, 'n_estimators': 768, 'learning_rate': 0.026111403303690425, 'max_depth': 8, 'reg_lambda': 26.648168065161098, 'min_child_weight': 1.0626186255116183, 'subsample': 0.8580490989206254, 'colsample_bytree': 0.5125814118774029}
model_XGB = XGBRegressor(random_state=42)

In [36]:
# Fit the baseline on the training part of the original (un-engineered) features.
model_XGB.fit(X_train, y_train)

In [37]:
# Baseline predictions for the held-out validation rows.
predictionsXGB = model_XGB.predict(X_valid)

In [38]:
# Baseline metrics on the hold-out split.
# scikit-learn metrics take (y_true, y_pred) in that order; MAE happens to be symmetric,
# but keeping the documented order avoids surprises if the metric is swapped later.
print(f'MAE: {mean_absolute_error(y_valid, predictionsXGB)}')
# `score` is the regressor's R^2, reported for both splits to expose over-fitting.
print(f'Train model score: {model_XGB.score(X_train, y_train)}')
print(f'Valid model score: {model_XGB.score(X_valid, y_valid)}')

MAE: 0.017587966576462263
Train model score: 0.8182502525072329
Valid model score: 0.8090669608359393


#### <p style="background-color: #EDE7F6; color: #6600ff;margin:0; display:inline-block;padding:.6rem;border-radius:.25rem;">Feature Engineering</p>

In [40]:
# Collect every distinct value that occurs in any feature column of `train`
# (all columns except the first and the last one).
unique_values = set()
for col in train.columns[1:-1]:
    unique_values.update(train[col].unique())

# Sort explicitly: `make_features` creates one `count_<value>` column per entry, in this
# order, and set iteration order is not guaranteed — sorting keeps the column layout
# identical between runs.
unique = sorted(unique_values)

In [41]:
def make_features(data, train_data=True, scaler=None):
    """Build the engineered, standardised feature frame for one data split.

    Depends on two notebook-level names: ``orig_features`` (the original feature
    column names) and ``unique`` (the distinct values found in those columns).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains at least the ``orig_features`` columns. It is copied,
        never modified.
    train_data : bool, default True
        True  -> fit a new ``StandardScaler`` on the engineered frame and apply it.
        False -> apply the ``scaler`` passed in, without re-fitting.
    scaler : StandardScaler or None, default None
        Already-fitted scaler; required when ``train_data`` is False and replaced
        by a fresh one when ``train_data`` is True.

    Returns
    -------
    tuple (pandas.DataFrame, StandardScaler)
        The scaled frame (input columns followed by the engineered ones, in the
        order they are created below) and the scaler that was applied.

    Raises
    ------
    ValueError
        If ``train_data`` is False and no ``scaler`` is supplied.
    """
    if not train_data and scaler is None:
        raise ValueError("A fitted scaler must be passed when train_data=False.")

    df = data.copy()
    # One snapshot of the original features; the columns added below never alter them,
    # so every row statistic can be computed from this frame.
    features = df[orig_features]

    with tqdm(total=12 ,desc='processing...') as pbar:
        sort_cols = [f'col{i+1}' for i in range(len(orig_features))]

        # Row sum over the original features only (not over whatever else `data` carries).
        df['sum'] = features.sum(axis=1)
        pbar.update(1)

        # 1 when the row sum is 72, 73, 74 or 75, else 0.
        df['linear_ft'] = df['sum'].isin(np.arange(72, 76)).astype(int)
        pbar.update(1)

        df['median'] = features.median(axis=1)
        pbar.update(1)

        df['std'] = features.std(axis=1)
        pbar.update(1)

        # Each row's feature values sorted ascending: col1 = smallest ... colN = largest.
        df[sort_cols] = np.sort(features.values, axis=1)
        pbar.update(1)

        # Per row, how many features equal each known value.
        for u in unique:
            df[f'count_{u}'] = (features == u).sum(axis=1)
        pbar.update(1)

        df['mean'] = features.mean(axis=1)
        pbar.update(1)

        df['max'] = features.max(axis=1)
        pbar.update(1)

        df['min'] = features.min(axis=1)
        pbar.update(1)

        df['skew'] = features.skew(axis=1)
        pbar.update(1)

        df['kurt'] = features.kurt(axis=1)
        pbar.update(1)

        if train_data:
            scaler = StandardScaler().fit(df)
        # Transform the DataFrame itself (not `.values`) so fit and transform see the same
        # kind of input and no feature-name mismatch warning is raised.
        df[df.columns.to_list()] = scaler.transform(df)
        pbar.update(1)

    return df, scaler

In [42]:
# Engineer + scale the training features; keep the fitted scaler for the test split.
train_df, scaler = make_features(X)
# Persist the engineered frame to disk with joblib.
dump(train_df,'train_df.joblib')
train_df.head()

processing...: 100%|███████████████████████████████████████████████████████████████████| 12/12 [00:06<00:00,  1.95it/s]


Unnamed: 0,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,Encroachments,...,count_15,count_16,count_17,count_18,count_19,mean,max,min,skew,kurt
0,0.038198,1.467769,0.021561,1.490363,0.507578,-0.453941,-0.458881,-0.933107,-0.939124,-0.455632,...,-0.044877,-0.025772,-0.008025,-0.002502,-0.000946,-0.576389,-0.967722,0.473994,0.351678,-0.55252
1,0.524488,0.990186,-0.461022,-0.459251,1.467552,1.489938,-0.938943,0.034951,-0.455692,0.504372,...,-0.044877,-0.025772,-0.008025,-0.002502,-0.000946,-0.576389,-0.201344,-1.92012,-0.518854,-0.416219
2,0.524488,0.035021,0.504143,1.002959,-0.932383,1.003968,-1.899068,0.034951,-0.455692,0.02437,...,-0.044877,-0.025772,-0.008025,-0.002502,-0.000946,0.025493,-0.967722,-0.723063,-1.648163,-0.736429
3,-0.934382,-0.442562,0.504143,0.028152,-0.452396,1.489938,-0.458881,1.00301,0.511171,1.464377,...,-0.044877,-0.025772,-0.008025,-0.002502,-0.000946,0.627374,-0.967722,0.473994,-0.575221,-0.609961
4,0.038198,-0.920145,-1.426186,0.515556,-0.452396,-0.453941,-0.938943,-0.933107,-0.939124,-0.935635,...,-0.044877,-0.025772,-0.008025,-0.002502,-0.000946,-3.224667,-2.500477,-0.723063,-0.305973,-0.886395


In [43]:
# Same feature pipeline for the test split (first column dropped), re-using the scaler
# fitted on the training data instead of fitting a new one.
test_df, _ = make_features(test.iloc[:,1:], train_data=False, scaler=scaler)
# Persist the engineered frame to disk with joblib.
dump(test_df,'test_df.joblib')
test_df.head()

processing...: 100%|███████████████████████████████████████████████████████████████████| 12/12 [00:03<00:00,  3.28it/s]


Unnamed: 0,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,Encroachments,...,count_15,count_16,count_17,count_18,count_19,mean,max,min,skew,kurt
0,-0.448092,0.512604,-0.943604,0.028152,0.507578,1.003968,1.461369,1.00301,1.478035,-0.455632,...,-0.044877,-0.025772,-0.008025,-0.002502,-0.000946,1.470008,-0.967722,1.67105,-0.679136,-0.985432
1,-0.448092,-0.442562,-1.426186,1.977766,0.027591,0.032029,-0.458881,1.00301,0.02774,-0.455632,...,-0.044877,-0.025772,-0.008025,-0.002502,-0.000946,-1.057894,-0.201344,-0.723063,0.209093,0.408414
2,-1.906963,-0.920145,0.504143,0.028152,0.987565,-1.425881,-0.458881,0.518981,-0.455692,-1.415637,...,-0.044877,-0.025772,-0.008025,-0.002502,-0.000946,-1.057894,-0.201344,-0.723063,-0.202562,-0.814604
3,-1.420672,-0.442562,-0.461022,0.515556,-0.452396,0.032029,-0.458881,-0.933107,-0.455692,-0.455632,...,-0.044877,-0.025772,-0.008025,-0.002502,-0.000946,-0.817142,-0.967722,0.473994,0.155842,-0.230696
4,0.524488,-0.920145,-1.426186,-0.459251,0.507578,-0.453941,0.021182,0.034951,-0.939124,0.984374,...,-0.044877,-0.025772,-0.008025,-0.002502,-0.000946,-0.817142,-0.967722,0.473994,-0.363181,-0.384101


#### <p style="background-color: #EDE7F6; color: #6600ff;margin:0; display:inline-block;padding:.6rem;border-radius:.25rem;">B. Model Training</p>

In [None]:
# Let's check the model performance again after adding the new features.

In [66]:
# Show up to 100 columns; this is the same limit the Settings cell assigns to
# pd.options.display.max_columns.
pd.set_option("display.max_columns", 100)

In [68]:
# Inspect the last three engineered test rows with all columns visible.
test_df.tail(3)

Unnamed: 0,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,Encroachments,IneffectiveDisasterPreparedness,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors,sum,linear_ft,median,std,col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14,col15,col16,col17,col18,col19,col20,count_0,count_1,count_2,count_3,count_4,count_5,count_6,count_7,count_8,count_9,count_10,count_11,count_12,count_13,count_14,count_15,count_16,count_17,count_18,count_19,mean,max,min,skew,kurt
745302,0.038198,0.990186,1.95189,0.028152,0.027591,0.517999,0.981307,0.034951,0.02774,-0.935635,0.507551,2.920915,-0.935421,2.920014,-0.446137,0.035884,1.963421,0.023754,0.028548,-0.449209,2.553395,-0.092641,0.394861,0.776859,1.67105,1.003264,1.8842,1.452382,2.583041,2.180019,1.814241,1.425875,1.000899,0.576117,0.15525,-0.264347,0.909327,0.426602,1.393773,0.832034,2.800666,1.941122,3.014061,1.331411,-0.305101,-0.69286,-1.273444,-0.570042,-1.042446,2.257939,-0.602373,0.055926,-1.255373,1.785797,-0.52331,5.81619,-0.196533,-0.113623,-0.075162,-0.044877,-0.025772,-0.008025,-0.002502,-0.000946,2.553395,1.331411,1.67105,1.29322,0.180421
745303,-0.448092,0.990186,0.504143,-0.946654,0.027591,-1.425881,-0.938943,1.487039,0.511171,0.984374,0.507551,0.508175,1.458186,0.514185,-1.40657,-0.932727,1.481302,0.990547,0.028548,0.02918,0.988503,-0.092641,2.166655,-0.26696,0.473994,-0.3433,0.44014,-0.070144,-0.551742,0.581143,1.814241,1.425875,1.000899,2.211724,1.815964,1.389766,0.909327,0.426602,1.393773,0.832034,0.240347,0.778443,-0.05788,-0.967722,-0.305101,-0.69286,0.377739,0.071814,-1.603117,-0.485763,1.296003,0.813103,1.71964,-0.837315,-0.52331,-0.326605,-0.196533,-0.113623,-0.075162,-0.044877,-0.025772,-0.008025,-0.002502,-0.000946,0.988503,-0.967722,0.473994,-1.552198,-0.929215
745304,-0.448092,-1.397727,0.021561,-0.946654,1.467552,-0.453941,0.021182,-0.933107,0.02774,0.504372,-0.454848,-1.422018,0.500743,2.438849,-0.446137,-0.932727,1.963421,1.473944,0.509058,-0.927598,0.145869,-0.092641,-0.491036,0.653843,0.473994,-0.3433,0.44014,-0.070144,-0.551742,-1.017734,0.159595,-0.253589,-0.651513,-1.05949,0.15525,-0.264347,-0.665863,0.426602,-0.057234,-0.546571,1.520507,0.778443,0.9661,0.565033,-0.305101,-0.69286,0.377739,0.71367,0.078896,-0.485763,0.030419,-1.45843,0.727969,0.474241,1.434197,-0.326605,-0.196533,-0.113623,-0.075162,-0.044877,-0.025772,-0.008025,-0.002502,-0.000946,0.145869,0.565033,0.473994,0.713524,-0.293814


In [44]:
# From here on `X` is the engineered + scaled frame (it replaces the raw-feature `X`
# defined for the baseline); the target is unchanged.
X = train_df
y = train['FloodProbability']

In [45]:
# Same 80/20 split and seed as the baseline, now applied to the engineered features.
X_train, X_valid, y_train, y_valid = train_test_split(X,y, test_size = 0.2, random_state=42)

In [53]:
# Tuned XGBoost hyperparameters; these are the values reported as "Best hyperparameters"
# by the 30-trial Optuna study further down in this notebook.
param = {
    'n_estimators': 309,
    'learning_rate': 0.019965529664439288,
    'max_depth': 10,
    'subsample': 0.5927215723384559,
    'colsample_bytree': 0.7745532703076283,
    'min_child_weight': 4,
}

In [54]:
# Rebuild `model_XGB` with the tuned parameters (this replaces the baseline model object).
model_XGB = XGBRegressor(**param, random_state=42)

In [55]:
# Fit the tuned model on the engineered training split.
model_XGB.fit(X_train, y_train)

In [56]:
# Validation predictions of the tuned model (overwrites the baseline predictions).
predictionsXGB = model_XGB.predict(X_valid)

In [57]:
# Metrics of the tuned model on the engineered features.
# scikit-learn metrics take (y_true, y_pred) in that order; MAE happens to be symmetric,
# but keeping the documented order avoids surprises if the metric is swapped later.
print(f'MAE: {mean_absolute_error(y_valid, predictionsXGB)}')
# `score` is the regressor's R^2, reported for both splits to expose over-fitting.
print(f'Train model score: {model_XGB.score(X_train, y_train)}')
print(f'Valid model score: {model_XGB.score(X_valid, y_valid)}')

MAE: 0.014317103673191573
Train model score: 0.8756855817587479
Valid model score: 0.8690340201096587


In [None]:
# Result after applying the scaler
# MAE: 0.014357870706917238
# Train model score: 0.8717652993754489
# Valid model score: 0.8686313577548649

#### <p style="background-color: #EDE7F6; color: #6600ff;margin:0; display:inline-block;padding:.6rem;border-radius:.25rem;">Model Training and Hyperparameter Tuning</p>

In [None]:
# Non-tuned XGBoost settings that are merged with the tuned ones: run on the CUDA device.
fixed_params_xgr = {'device': 'cuda'}

In [50]:
def objective(trial):
    """Optuna objective: validation R^2 of an XGBRegressor built from the trial's parameters.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_valid`` and ``y_valid``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample n_estimators, learning_rate, max_depth, subsample,
        colsample_bytree and min_child_weight.

    Returns
    -------
    float
        ``model.score(X_valid, y_valid)``, i.e. the R^2 on the validation split.
        Higher is better, so the study must be created with ``direction='maximize'``.
    """
    params = {
        'tree_method' : 'hist',
        "objective": "reg:squarederror",
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "verbosity": 0,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
        "max_depth": trial.suggest_int("max_depth", 1, 12),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
        'n_jobs'      : -1
    }

    # Release memory left over from the previous trial before fitting a new model.
    gc.collect()
    model_xgr = XGBRegressor(**params)
    model_xgr.fit(X_train, y_train, verbose=False)

    # `score` predicts on X_valid itself, so no separate `predict` call is needed.
    # To tune for MAE instead, return mean_absolute_error(y_valid, model_xgr.predict(X_valid))
    # and create the study with direction='minimize'.
    r2_valid = model_xgr.score(X_valid, y_valid)
    return r2_valid

In [58]:

# Multivariate, grouped TPE sampler. The variable re-uses the class name `TPESampler`
# but holds a sampler *instance*. No seed is passed, so suggestions differ between runs.
TPESampler = optuna.samplers.TPESampler(multivariate=True, group=True)
# In-memory study that maximises the objective's return value.
optimize_r2_xgr = optuna.create_study(direction='maximize', sampler=TPESampler, study_name='Optimizing R2 for XGBRegressor')

[I 2024-07-01 15:46:52,231] A new study created in memory with name: Optimizing R2 for XGBRegressor


In [59]:
# Search for at most three hours.
# The study is written to disk in `finally` so that the trials completed so far are
# still saved when the run is interrupted (e.g. KeyboardInterrupt) or fails part-way.
try:
    optimize_r2_xgr.optimize(objective, timeout=3*60*60, show_progress_bar=True)
finally:
    dump(optimize_r2_xgr,'Optimizing R2 for XGBRegressor.pkl')

   0%|          | 00:00/3:00:00

[I 2024-07-01 15:47:11,393] Trial 0 finished with value: 0.8484999880175219 and parameters: {'n_estimators': 218, 'learning_rate': 0.008810901786961568, 'max_depth': 8, 'subsample': 0.7543427791908845, 'colsample_bytree': 0.5846111960957807, 'min_child_weight': 16}. Best is trial 0 with value: 0.8484999880175219.
[I 2024-07-01 15:47:15,208] Trial 1 finished with value: 0.4909879798390754 and parameters: {'n_estimators': 147, 'learning_rate': 0.014168242675228485, 'max_depth': 1, 'subsample': 0.663827043997119, 'colsample_bytree': 0.05385151450752884, 'min_child_weight': 18}. Best is trial 0 with value: 0.8484999880175219.
[I 2024-07-01 15:47:32,376] Trial 2 finished with value: 0.8662195923862297 and parameters: {'n_estimators': 437, 'learning_rate': 0.010123739402336109, 'max_depth': 5, 'subsample': 0.6200006278126002, 'colsample_bytree': 0.44199197318317607, 'min_child_weight': 16}. Best is trial 2 with value: 0.8662195923862297.
[I 2024-07-01 15:47:41,498] Trial 3 finished with valu

[I 2024-07-01 15:52:02,258] Trial 26 finished with value: 0.8689108254716421 and parameters: {'n_estimators': 410, 'learning_rate': 0.062026708713004336, 'max_depth': 7, 'subsample': 0.7441089606600911, 'colsample_bytree': 0.6667529198394252, 'min_child_weight': 1}. Best is trial 25 with value: 0.8689837306214232.
[I 2024-07-01 15:52:16,314] Trial 27 finished with value: 0.8687452005350037 and parameters: {'n_estimators': 263, 'learning_rate': 0.09750412057003673, 'max_depth': 8, 'subsample': 0.914111644908902, 'colsample_bytree': 0.5524568191855426, 'min_child_weight': 4}. Best is trial 25 with value: 0.8689837306214232.
[I 2024-07-01 15:52:31,522] Trial 28 finished with value: 0.8689531801566263 and parameters: {'n_estimators': 261, 'learning_rate': 0.017490428117630717, 'max_depth': 8, 'subsample': 0.6012467459873743, 'colsample_bytree': 0.997171088409746, 'min_child_weight': 1}. Best is trial 25 with value: 0.8689837306214232.
[I 2024-07-01 15:52:36,836] Trial 29 finished with valu

KeyboardInterrupt: 

In [51]:
# Separate in-memory study with Optuna's default sampler, limited to 30 trials.
# Its best parameters (printed in the next cell) are the values hard-coded in `param`.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

[I 2024-07-01 15:35:33,760] A new study created in memory with name: no-name-bb466c53-e431-410e-a29c-664c0df4fa9f
[I 2024-07-01 15:36:08,438] Trial 0 finished with value: 0.7415181937878526 and parameters: {'n_estimators': 481, 'learning_rate': 0.0020351535981315136, 'max_depth': 10, 'subsample': 0.2955148652065257, 'colsample_bytree': 0.5755069774385734, 'min_child_weight': 1}. Best is trial 0 with value: 0.7415181937878526.
[I 2024-07-01 15:36:12,715] Trial 1 finished with value: 0.29015542383488824 and parameters: {'n_estimators': 174, 'learning_rate': 0.0023191922989200904, 'max_depth': 1, 'subsample': 0.21475201180857723, 'colsample_bytree': 0.39212694051280955, 'min_child_weight': 16}. Best is trial 0 with value: 0.7415181937878526.
[I 2024-07-01 15:36:27,005] Trial 2 finished with value: 0.8323569127433665 and parameters: {'n_estimators': 416, 'learning_rate': 0.005322665480144706, 'max_depth': 4, 'subsample': 0.5438529422316266, 'colsample_bytree': 0.24868032787179817, 'min_chi

[I 2024-07-01 15:43:41,534] Trial 26 finished with value: 0.8690914809351639 and parameters: {'n_estimators': 289, 'learning_rate': 0.022862892361005034, 'max_depth': 8, 'subsample': 0.634196643692571, 'colsample_bytree': 0.8787938169255539, 'min_child_weight': 5}. Best is trial 22 with value: 0.8691021305807042.
[I 2024-07-01 15:43:51,616] Trial 27 finished with value: 0.8685273205135076 and parameters: {'n_estimators': 273, 'learning_rate': 0.04498516975938397, 'max_depth': 5, 'subsample': 0.7575170389111546, 'colsample_bytree': 0.8876239966307936, 'min_child_weight': 6}. Best is trial 22 with value: 0.8691021305807042.
[I 2024-07-01 15:44:04,325] Trial 28 finished with value: 0.868940298190295 and parameters: {'n_estimators': 231, 'learning_rate': 0.023176095648422313, 'max_depth': 7, 'subsample': 0.49924473125218494, 'colsample_bytree': 0.9870113511901213, 'min_child_weight': 3}. Best is trial 22 with value: 0.8691021305807042.
[I 2024-07-01 15:44:15,046] Trial 29 finished with val

In [52]:
# The study maximises the objective's return value, which is the validation R^2
# (`model.score`), so the best value is labelled as R^2 rather than RMSE.
print('Best hyperparameters:', study.best_params)
print('Best R2:', study.best_value)

Best hyperparameters: {'n_estimators': 309, 'learning_rate': 0.019965529664439288, 'max_depth': 10, 'subsample': 0.5927215723384559, 'colsample_bytree': 0.7745532703076283, 'min_child_weight': 4}
Best RMSE: 0.8691021305807042


#### <p style="background-color: #EDE7F6; color: #6600ff;margin:0; display:inline-block;padding:.6rem;border-radius:.25rem;">Ensemble Prediction</p>

In [None]:
# Getting the best parameters obtained from Hyperparameter Tuning
xgr_params = optimize_r2_xgr.best_trial.params
cbr_params = optimize_r2_cbr.best_trial.params
lbr_params = optimize_r2_lbr.best_trial.params
xgrf_params = optimize_r2_xgrf.best_trial.params
ridge_params = optimize_r2_ridge.best_trial.params

In [None]:
# One (train-side, test-side) prediction pair per base model, each model built from its
# fixed settings plus its tuned parameters.
# NOTE(review): `cross_validate_predict`, `fixed_params_ctr`, `fixed_params_lbr` and
# `fixed_params_xgrf` are not defined in any cell visible here — confirm where they
# come from and what exactly the helper returns.
xgr_tr_pred1, xgr_ts_pred1 = cross_validate_predict(XGBRegressor(**fixed_params_xgr, **xgr_params),
                                                    X, y, test_df)

cbr_tr_pred2, cbr_ts_pred2 = cross_validate_predict(CatBoostRegressor(**fixed_params_ctr, **cbr_params),
                                                    X, y, test_df)

lgr_tr_pred3, lgr_ts_pred3 = cross_validate_predict(LGBMRegressor(**fixed_params_lbr, **lbr_params),
                                                    X, y, test_df)

xgrf_tr_pred4, xgrf_ts_pred4 = cross_validate_predict(XGBRegressor(**fixed_params_xgrf, **xgrf_params),
                                                      X, y, test_df)

ridge_tr_pred5, ridge_ts_pred5 = cross_validate_predict(Ridge(**ridge_params), X, y, test_df)

In [None]:
# Creating the predictions dataset for final prediction
ensemble_train = np.c_[xgr_tr_pred1, cbr_tr_pred2, lgr_tr_pred3, xgrf_tr_pred4, ridge_tr_pred5]
ensemble_test = np.c_[xgr_ts_pred1, cbr_ts_pred2, lgr_ts_pred3, xgrf_ts_pred4, ridge_ts_pred5]

np.save('ensemble_train.npy', ensemble_train)
np.save('ensemble_test.npy', ensemble_test)

In [None]:
# Preview the first five rows of the stacked train-side predictions.
ensemble_train[:5,:]

In [None]:
# Preview the first five rows of the stacked test-side predictions.
ensemble_test[:5,:]

#### <p style="background-color: #EDE7F6; color: #6600ff;margin:0; display:inline-block;padding:.6rem;border-radius:.25rem;">Optimizing The Weights For Final Predictions</p>

In [None]:
def objective(trial):
    """Optuna objective for the blend weights: R^2 of the weighted ensemble against ``y``.

    Reads the notebook-level ``ensemble_train`` (one column per base model) and ``y``.
    NOTE: this definition re-uses the name ``objective`` and therefore replaces the
    XGBoost tuning objective defined earlier in the notebook.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample one weight in [0, 1] per ensemble column
        (parameters ``w1`` ... ``wN``).

    Returns
    -------
    float
        R^2 of ``ensemble_train @ normalised_weights`` versus ``y`` (to be maximised).

    Raises
    ------
    optuna.TrialPruned
        If every sampled weight is zero, because such a vector cannot be normalised.
    """
    # Local import: the notebook's `import sklearn.metrics as skm` line is commented out,
    # so the `skm` alias is not available.
    from sklearn.metrics import r2_score

    # One weight per column instead of a hard-coded five.
    n_models = ensemble_train.shape[1]
    raw_weights = np.array([trial.suggest_float(f'w{i}', 0, 1) for i in range(1, n_models + 1)])

    total = raw_weights.sum()
    if total == 0:
        # Dividing by zero would give NaN weights; skip the trial explicitly instead.
        raise optuna.TrialPruned()
    weights = raw_weights / total                                        # normalizing

    ensemble_pred = ensemble_train.dot(weights)
    return r2_score(y, ensemble_pred)

# Fresh multivariate, grouped TPE sampler instance (rebinds the `TPESampler` variable).
TPESampler = optuna.samplers.TPESampler(multivariate=True, group=True)
# In-memory study that maximises the ensemble R^2 returned by `objective`.
optimize_weights = optuna.create_study(direction='maximize', sampler=TPESampler, study_name='Ensemble Weights')

In [None]:
# Evaluate 1000 weight combinations, then persist the finished study with joblib.
optimize_weights.optimize(objective, n_trials=1000, show_progress_bar=True)
dump(optimize_weights,'Ensemble Weights.pkl')

In [None]:
# Raw (not yet normalised) weights of the best trial, as a {parameter name: value} dict.
weights = optimize_weights.best_trial.params
weights

In [None]:
# Normalizing the weights
weights = np.array(list(weights.values()))
weights /= np.sum(weights)
weights

#### <p style="background-color: #EDE7F6; color: #6600ff;margin:0; display:inline-block;padding:.6rem;border-radius:.25rem;">Making The Final Test Predictions</p>

In [72]:
# Test-set predictions from the single `model_XGB` regressor; the ensemble weights
# computed above are not applied here.
predictionsXGB_test = model_XGB.predict(test_df)

In [73]:
# Quick look at the predicted values.
predictionsXGB_test

array([0.57759094, 0.4582304 , 0.44923756, ..., 0.6219937 , 0.5490062 ,
       0.5299143 ], dtype=float32)

In [70]:
# Row count of the raw test frame, for comparison with the number of predictions.
test.shape

(745305, 21)

In [74]:
# Build the submission in one step: the test ids next to their predicted probability.
submission = pd.DataFrame({
    'id': test['id'],
    'FloodProbability': predictionsXGB_test,
})
# Write it without the index column.
submission.to_csv('submission.csv', index=False)

In [75]:
# Display the frame that was written to submission.csv.
submission

Unnamed: 0,id,FloodProbability
0,1117957,0.577591
1,1117958,0.458230
2,1117959,0.449238
3,1117960,0.468001
4,1117961,0.467441
...,...,...
745300,1863257,0.476886
745301,1863258,0.440631
745302,1863259,0.621994
745303,1863260,0.549006
