In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv('train.csv', index_col='id')

In [3]:
def cleaning(dataset):
    features = dataset.columns.tolist()
    features.remove('FloodProbability')
    dataset['mean'] = dataset[features].mean(axis=1)
    dataset['std'] = dataset[features].std(axis=1)
    dataset['max'] = dataset[features].max(axis=1)
    dataset['min'] = dataset[features].min(axis=1)
    dataset['median'] = dataset[features].median(axis=1)    
    dataset['range'] = dataset['max'] - dataset['min']
    dataset['variance'] = dataset[features].var(axis=1)
    dataset['skewness'] = dataset[features].skew(axis=1)
    dataset['kurtosis'] = dataset[features].kurtosis(axis=1)
    dataset['sum'] = dataset[features].sum(axis=1)  
    dataset = dataset.drop(features, axis=1)
    return dataset

In [16]:
def add_features(df):
    BASE_FEATURES = df.columns
    BASE_FEATURES.remove('FloodProbability')
    df['total'] = df[BASE_FEATURES].sum(axis=1)
    df['mean'] = df[BASE_FEATURES].mean(axis=1)
    df['std'] = df[BASE_FEATURES].std(axis=1)
    df['max'] = df[BASE_FEATURES].max(axis=1)
    df['min'] = df[BASE_FEATURES].min(axis=1)
    df['median'] = df[BASE_FEATURES].median(axis=1)
    df['ptp'] = df[BASE_FEATURES].values.ptp(axis=1)
    df['q25'] = df[BASE_FEATURES].quantile(0.25, axis=1)
    df['q75'] = df[BASE_FEATURES].quantile(0.75, axis=1)
    
    df['ClimateImpact'] = df['MonsoonIntensity'] + df['ClimateChange']
    df['AnthropogenicPressure'] = df['Deforestation'] + df['Urbanization'] + df['AgriculturalPractices'] + df['Encroachments']
    df['InfrastructureQuality'] = df['DamsQuality'] + df['DrainageSystems'] + df['DeterioratingInfrastructure']
    df['CoastalVulnerabilityTotal'] = df['CoastalVulnerability'] + df['Landslides']
    df['PreventiveMeasuresEfficiency'] = df['RiverManagement'] + df['IneffectiveDisasterPreparedness'] + df['InadequatePlanning']
    df['EcosystemImpact'] = df['WetlandLoss'] + df['Watersheds']
    df['SocioPoliticalContext'] = df['PopulationScore'] * df['PoliticalFactors']
    return df

In [7]:
# df = cleaning(train)
df = add_features(train)

In [8]:
# X = df.drop(columns=['FloodProbability'])
# y = df['FloodProbability']
X = df
y = train['FloodProbability']

### Train Test Split

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
from sklearn.metrics import r2_score

In [None]:
from sklearn.ensemble import RandomForestRegressor
rf_reg = RandomForestRegressor(n_estimators=63, random_state=42)
rf_reg.fit(X_train_scaled, y_train)

In [None]:
y_pred = rf_reg.predict(X_test_scaled)
r2_score(y_test, y_pred)

### XGBoost Regressor

In [12]:
xgb_params ={'n_estimators':1200,
 'max_depth': 6,
 'learning_rate': 0.039987569,
 }

In [13]:
from xgboost import XGBRegressor
xgb_model_test = XGBRegressor(**xgb_params)
xgb_model_test.fit(X_train_scaled, y_train)

In [15]:
y_pred = xgb_model_test.predict(X_test_scaled)
r2_score(y_test, y_pred)

0.9999999967186478

### CatBoost Regressor

In [None]:
catboost_params ={'depth': 3,                   
                  'random_state':42,
                  'task_type': 'CPU', 
                  'eval_metric': 'RMSE', 
                  'min_data_in_leaf': 4, 
                  'loss_function': 'RMSE',
                  'grow_policy': 'Lossguide', 
                  'bootstrap_type': 'Bernoulli',
                  'subsample': 0.83862137638162, 
                  'l2_leaf_reg': 8.365422739510098, 
                  'random_strength': 3.296124856352495, 
                  'learning_rate': 0.0983, 
}

In [None]:
from catboost import CatBoostRegressor
cat_model_test = CatBoostRegressor(**catboost_params)
cat_model_test.fit(X_train_scaled, y_train, verbose=100)

In [None]:
y_pred = cat_model.predict(X_test_scaled)
r2_score(y_test, y_pred)

### Light GBM Regressor

In [None]:
lgbm_params = {
    'num_leaves': 183, 
    'learning_rate': 0.01183688880802108, 
    'n_estimators': 577, 
    'subsample_for_bin': 165697, 
    'min_child_samples': 114, 
    'reg_alpha': 2.075080888948164e-06, 
    'reg_lambda': 3.838938366471552e-07, 
    'colsample_bytree': 0.9634044234652241, 
    'subsample': 0.9592138618622019, 
    'max_depth': 9,
    'random_state':42,
    'verbose': -1
}

In [None]:
from lightgbm import LGBMRegressor
lgbm_model_test = LGBMRegressor(**lgbm_params)
lgbm_model_test.fit(X_train_scaled, y_train)

In [None]:
y_pred = lgbm_reg.predict(X_test_scaled)
r2_score(y_test, y_pred)

### Train Model With ALL Data

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
xgb_model = XGBRegressor(**xgb_params)
xgb_model.fit(X_scaled, y)

In [None]:
cat_model = CatBoostRegressor(**catboost_params)
cat_model.fit(X_scaled, y, verbose=100)

In [None]:
lgbm_model = LGBMRegressor(**lgbm_params)
lgbm_model.fit(X_scaled, y)

In [None]:
test = pd.read_csv('test.csv', index_col='id')

In [None]:
# test = cleaning(test)
test = add_features(test)

In [None]:
test_scaled = scaler.transform(test)

In [None]:
xgb_pred = xgb_model.predict(test_scaled)
cat_pred = cat_model.predict(test_scaled)
lgbm_pred = lgbm_model.predict(test_scaled)

In [None]:
output = pd.read_csv('sample_submission.csv')

In [None]:
output['FloodProbability'] = lgbm_pred *0.77 + cat_pred*0.155 + xgb_pred*0.075 #{'xgb_val': 0.075, 'cat_val': 0.155, 'lgbm_val': 0.77}

In [None]:
output.head()

In [None]:
output.to_csv('output.csv', index=False)

### Optuna

In [None]:
import optuna

In [None]:
xgb_test_pred = xgb_model_test.predict(X_test_scaled)
cat_test_pred = cat_model_test.predict(X_test_scaled)
lgbm_test_pred = lgbm_model_test.predict(X_test_scaled)

In [None]:
def objective(trail):
    xgb_val = trail.suggest_float("xgb_val",low=0,high=1,step=0.001)
    cat_val = trail.suggest_float("cat_val",low=0,high=1,step=0.001)
    lgbm_val = trail.suggest_float("lgbm_val",low=0,high=1,step=0.001)
    avg_pred = lgbm_test_pred *lgbm_val + cat_test_pred*cat_val + xgb_test_pred*xgb_val
    return r2_score(y_test, avg_pred);

In [None]:
study = optuna.create_study(direction='maximize')

In [None]:
study.optimize(objective, n_trials=1000)

In [None]:
study.best_value

In [None]:
study.best_params