In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('../data/train.csv')

In [3]:
# drop column id
data.drop('id', axis=1, inplace=True)

In [4]:
X = data.drop('FloodProbability', axis=1)
y = data['FloodProbability']

In [41]:
# split into train and test
from sklearn.model_selection import train_test_split
state = 104
# divide the data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=state)
# split test set into validation and test sets (10% validation, 10% test)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=state)

In [44]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=150, learning_rate=0.1, max_depth=10)

# Fit the model
xgb_reg.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=10, verbose=False)

# Predict probabilities
y_pred = xgb_reg.predict(X_test)

# Evaluate the model

print(f"Train RMSE: {mean_squared_error(y_train, xgb_reg.predict(X_train))}, R^2: {xgb_reg.score(X_train, y_train)}")
print(f"Validation RMSE: {mean_squared_error(y_val, xgb_reg.predict(X_val))}, R^2: {xgb_reg.score(X_val, y_val)}")
print(f"Test RMSE: {mean_squared_error(y_test, y_pred)}, R^2: {xgb_reg.score(X_test, y_test)}")




Train RMSE: 0.0003496629040175699, R^2: 0.8658058133332887
Validation RMSE: 0.0004758559031297264, R^2: 0.8168884355068833
Test RMSE: 0.0004698922000499628, R^2: 0.8192206499336518


In [45]:
test_data_orig = pd.read_csv('../data/test.csv')
test_data_orig.head()

Unnamed: 0,id,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,...,IneffectiveDisasterPreparedness,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors
0,1117957,4,6,3,5,6,7,8,7,8,...,8,5,7,5,6,3,6,4,4,5
1,1117958,4,4,2,9,5,5,4,7,5,...,2,4,7,4,5,1,7,4,4,3
2,1117959,1,3,6,5,7,2,4,6,4,...,7,9,2,5,5,2,3,6,8,3
3,1117960,2,4,4,6,4,5,4,3,4,...,7,8,4,6,7,6,4,2,4,4
4,1117961,6,3,2,4,6,4,5,5,3,...,4,3,2,6,4,6,8,4,5,5


In [46]:
# ensure the columns are in the same order as the training data
test_data =  test_data_orig[X.columns]
test_data.head()

Unnamed: 0,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,Encroachments,IneffectiveDisasterPreparedness,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors
0,4,6,3,5,6,7,8,7,8,4,8,5,7,5,6,3,6,4,4,5
1,4,4,2,9,5,5,4,7,5,4,2,4,7,4,5,1,7,4,4,3
2,1,3,6,5,7,2,4,6,4,2,7,9,2,5,5,2,3,6,8,3
3,2,4,4,6,4,5,4,3,4,4,7,8,4,6,7,6,4,2,4,4
4,6,3,2,4,6,4,5,5,3,7,4,3,2,6,4,6,8,4,5,5


In [47]:
# predict probabilities
test_pred = xgb_reg.predict(test_data)

In [48]:
# append the predictions as price to the test data and save the id and price columns
test_data_orig['FloodProbability'] = test_pred

In [49]:
# save the test data with the predictions as a csv file. add date to the filename
from datetime import datetime
file_name = f'first_xgboost_predictions_{datetime.now().strftime("%Y%m%d%H%M%S")}.csv'
test_data_orig[['id', 'FloodProbability']].to_csv(f'../data/{file_name}', index=False)

In [50]:
!ls -lrt .

total 338128
-rw-r--r--  1 gbemidebe  staff  36098277 Apr  2 15:54 test.csv
-rw-r--r--  1 gbemidebe  staff   8943680 Apr  2 15:54 sample_submission.csv
-rw-r--r--  1 gbemidebe  staff  59127674 Apr  2 15:54 train.csv
-rw-r--r--  1 gbemidebe  staff  13786613 May 31 17:11 first_xgboost_predictions_20240531171158.csv
-rw-r--r--  1 gbemidebe  staff  13789561 May 31 17:13 first_xgboost_predictions_20240531171325.csv
-rw-r--r--  1 gbemidebe  staff  13779873 May 31 17:18 first_xgboost_predictions_20240531171818.csv
-rw-r--r--  1 gbemidebe  staff  13789561 May 31 17:21 first_xgboost_predictions_20240531172113.csv
-rw-r--r--  1 gbemidebe  staff  13789205 May 31 18:10 first_xgboost_predictions_20240531181032.csv
