In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the training CSV into a DataFrame; the path is relative to the
# notebook's working directory.
train_csv_path = '../data/train.csv'
data = pd.read_csv(train_csv_path)

In [None]:
# Drop the `id` column so it is not carried into the feature matrix built
# from `data` below.
# The frame is rebound instead of mutated in place, and errors='ignore'
# makes the cell safe to re-run: a second execution no longer raises
# KeyError because `id` is already gone.
data = data.drop(columns='id', errors='ignore')

In [None]:
# Separate the regression target from the remaining feature columns.
target_col = 'FloodProbability'
y = data[target_col]
X = data.drop(columns=[target_col])

In [None]:
from sklearn.model_selection import train_test_split

# Stage 1: hold out 20% of the rows; the remaining 80% is the training set.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Stage 2: halve the hold-out into a test part and a validation part
# (10% of all rows each). The fixed random_state keeps both splits repeatable.
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42
)

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Gradient-boosted regressor with a squared-error objective.
# early_stopping_rounds is configured on the estimator itself: passing it to
# fit() is deprecated and rejected by newer XGBoost releases.
xgb_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    early_stopping_rounds=10,
)

# Fit the model; the validation split is the evaluation set that early
# stopping monitors.
xgb_reg.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

# Predictions on the held-out test split
y_pred = xgb_reg.predict(X_test)

# Evaluate the model on every split.
# mean_squared_error returns the MSE, so take the square root to report the
# RMSE that the label promises. score() is the R^2 of the fitted regressor.
for split_name, X_split, y_split in (
    ("Train", X_train, y_train),
    ("Validation", X_val, y_val),
    ("Test", X_test, y_test),
):
    split_rmse = np.sqrt(mean_squared_error(y_split, xgb_reg.predict(X_split)))
    print(f"{split_name} RMSE: {split_rmse}, R^2: {xgb_reg.score(X_split, y_split)}")


In [None]:
# Load the test CSV with all of its original columns, then preview the
# first rows as the cell output.
test_csv_path = '../data/test.csv'
test_data_orig = pd.read_csv(test_csv_path)
test_data_orig.head()

In [None]:
# Pick out the model's feature columns, in the same order as the training
# matrix X, so the test frame lines up with what the model was fitted on.
feature_columns = list(X.columns)
test_data = test_data_orig[feature_columns]
test_data.head()

In [None]:
# Predict FloodProbability for every row of the column-aligned test frame
# using the fitted XGBoost regressor.
test_pred = xgb_reg.predict(test_data)

In [None]:
# Attach the predictions to the original test frame as a FloodProbability
# column; that frame still carries the id column needed for the submission.
test_data_orig['FloodProbability'] = test_pred

In [None]:
from datetime import datetime

# A YYYYmmddHHMMSS timestamp in the name keeps successive runs from
# writing to the same file.
run_stamp = datetime.now().strftime("%Y%m%d%H%M%S")
file_name = f'first_xgboost_predictions_{run_stamp}.csv'

# Write only the id and prediction columns, without the row index.
submission = test_data_orig[['id', 'FloodProbability']]
submission.to_csv(f'../data/{file_name}', index=False)

In [None]:
# submit the file to kaggle
os.chdir('../data')
!kaggle competitions submit -c playground-series-s4e5 -f file_name -m "XGBoost with draft submission"

In [None]:
# List the current working directory in long format, sorted by modification
# time with the newest entries last.
!ls -lrt .