In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, log_loss, r2_score
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.svm import SVC, SVR
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import BaseEnsemble, VotingClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import warnings
warnings.filterwarnings('ignore')

In [26]:
train = pd.read_csv("train.csv", index_col=0)
print(train.isnull().sum().sum())
test = pd.read_csv("test.csv")
print(test.isnull().sum().sum())

0
0


In [27]:
X_train = train.drop('FloodProbability', axis=1)
y_train = train['FloodProbability']
X_test = test.drop('id', axis=1)

In [28]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1117957 entries, 0 to 1117956
Data columns (total 20 columns):
 #   Column                           Non-Null Count    Dtype
---  ------                           --------------    -----
 0   MonsoonIntensity                 1117957 non-null  int64
 1   TopographyDrainage               1117957 non-null  int64
 2   RiverManagement                  1117957 non-null  int64
 3   Deforestation                    1117957 non-null  int64
 4   Urbanization                     1117957 non-null  int64
 5   ClimateChange                    1117957 non-null  int64
 6   DamsQuality                      1117957 non-null  int64
 7   Siltation                        1117957 non-null  int64
 8   AgriculturalPractices            1117957 non-null  int64
 9   Encroachments                    1117957 non-null  int64
 10  IneffectiveDisasterPreparedness  1117957 non-null  int64
 11  DrainageSystems                  1117957 non-null  int64
 12  CoastalVulnerabilit

In [29]:
lightxbr = LGBMRegressor(random_state = 24)
xgbr = XGBRegressor(random_state = 24)
catbr = CatBoostRegressor(random_state=24)

In [30]:
lightxbr.fit(X_train, y_train)

y_pred = lightxbr.predict(X_test)
y_pred

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.070574 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 347
[LightGBM] [Info] Number of data points in the train set: 1117957, number of used features: 20
[LightGBM] [Info] Start training from score 0.504480


array([0.56419413, 0.4713082 , 0.47197475, ..., 0.59341963, 0.52839986,
       0.50642669])

In [31]:
xgbr.fit(X_train, y_train)

y_pred = xgbr.predict(X_test)
y_pred

array([0.5805085 , 0.46748444, 0.4461577 , ..., 0.6255179 , 0.55592906,
       0.5103532 ], dtype=float32)

In [32]:
print(y_pred)

[0.5805085  0.46748444 0.4461577  ... 0.6255179  0.55592906 0.5103532 ]


In [33]:
catbr.fit(X_train, y_train)

y_pred = catbr.predict(X_test)
y_pred

Learning rate set to 0.124117
0:	learn: 0.0501323	total: 91.6ms	remaining: 1m 31s
1:	learn: 0.0492413	total: 175ms	remaining: 1m 27s
2:	learn: 0.0483603	total: 252ms	remaining: 1m 23s
3:	learn: 0.0475582	total: 326ms	remaining: 1m 21s
4:	learn: 0.0467916	total: 401ms	remaining: 1m 19s
5:	learn: 0.0460372	total: 478ms	remaining: 1m 19s
6:	learn: 0.0453183	total: 553ms	remaining: 1m 18s
7:	learn: 0.0446522	total: 624ms	remaining: 1m 17s
8:	learn: 0.0439859	total: 703ms	remaining: 1m 17s
9:	learn: 0.0433289	total: 779ms	remaining: 1m 17s
10:	learn: 0.0427493	total: 854ms	remaining: 1m 16s
11:	learn: 0.0421723	total: 928ms	remaining: 1m 16s
12:	learn: 0.0416020	total: 997ms	remaining: 1m 15s
13:	learn: 0.0410540	total: 1.06s	remaining: 1m 14s
14:	learn: 0.0405212	total: 1.14s	remaining: 1m 14s
15:	learn: 0.0399941	total: 1.21s	remaining: 1m 14s
16:	learn: 0.0394779	total: 1.28s	remaining: 1m 13s
17:	learn: 0.0390007	total: 1.34s	remaining: 1m 13s
18:	learn: 0.0385215	total: 1.41s	remaining

array([0.57497565, 0.45427625, 0.45521798, ..., 0.62746408, 0.55000513,
       0.51143733])

In [34]:
kfold = KFold(n_splits=5, shuffle=True, random_state=24)
print(xgbr.get_params())
print(catbr.get_params())
print(lightxbr.get_params())

{'objective': 'reg:squarederror', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': None, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': None, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': None, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': 24, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': None}
{'loss_function': 'RMSE', 'random_state': 24}
{'boosting_type': 'gbdt', 'class_weight': None, 'colsam

In [35]:
params = {'learning_rate': np.linspace(0.001, 0.9,5), 
          'max_depth': [None,3,2,4], 
          'n_estimators':[25, 50 ,100],  
          }
xgbr_gcv = GridSearchCV(xgbr, param_grid=params, cv=kfold, 
                   scoring='r2', n_jobs=-1)
xgbr_gcv.fit(X_train, y_train)

print(xgbr_gcv.best_score_)
print(xgbr_gcv.best_params_)

0.8151720544063549
{'learning_rate': 0.22575, 'max_depth': None, 'n_estimators': 100}


In [36]:
y_pred = xgbr_gcv.best_estimator_.predict(X_test)
submit = pd.DataFrame({'id':test['id'],
                       'FloodProbability':y_pred})
submit.to_csv("xgboost.csv", index=False)

In [37]:
params = {'learning_rate': np.linspace(0.001, 0.9,5), 
          'max_depth': [None,3,2,4], 
          'n_estimators':[25, 50 ,100],  
          }
catbr_gcv = GridSearchCV(catbr, param_grid=params, cv=kfold, 
                   scoring='r2', n_jobs=-1)
catbr_gcv.fit(X_train, y_train)

print(catbr_gcv.best_score_)
print(catbr_gcv.best_params_)

0:	learn: 0.0471081	total: 91.8ms	remaining: 9.08s
1:	learn: 0.0428403	total: 176ms	remaining: 8.64s
2:	learn: 0.0379738	total: 260ms	remaining: 8.42s
3:	learn: 0.0350598	total: 398ms	remaining: 9.56s
4:	learn: 0.0333748	total: 472ms	remaining: 8.96s
5:	learn: 0.0315125	total: 549ms	remaining: 8.6s
6:	learn: 0.0298803	total: 619ms	remaining: 8.23s
7:	learn: 0.0290786	total: 685ms	remaining: 7.88s
8:	learn: 0.0282796	total: 751ms	remaining: 7.59s
9:	learn: 0.0275619	total: 817ms	remaining: 7.35s
10:	learn: 0.0269274	total: 881ms	remaining: 7.13s
11:	learn: 0.0262740	total: 945ms	remaining: 6.93s
12:	learn: 0.0257003	total: 1.01s	remaining: 6.75s
13:	learn: 0.0253393	total: 1.09s	remaining: 6.68s
14:	learn: 0.0249888	total: 1.17s	remaining: 6.6s
15:	learn: 0.0246614	total: 1.25s	remaining: 6.54s
16:	learn: 0.0243232	total: 1.33s	remaining: 6.49s
17:	learn: 0.0240090	total: 1.41s	remaining: 6.44s
18:	learn: 0.0236519	total: 1.49s	remaining: 6.37s
19:	learn: 0.0233652	total: 1.57s	remainin

In [38]:
y_pred = catbr_gcv.best_estimator_.predict(X_test)
submit = pd.DataFrame({'id':test['id'],
                       'FloodProbability':y_pred})
submit.to_csv("cat.csv", index=False)

In [39]:
params = {'learning_rate': np.linspace(0.001, 0.9,5), 
          'max_depth': [None,3,2,4], 
          'n_estimators':[25, 50 ,100],  
          }
lightxbr_gcv = GridSearchCV(lightxbr, param_grid=params, cv=kfold, 
                   scoring='r2', n_jobs=-1)
lightxbr_gcv.fit(X_train, y_train)

print(lightxbr_gcv.best_score_)
print(lightxbr_gcv.best_params_)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.084756 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 347
[LightGBM] [Info] Number of data points in the train set: 1117957, number of used features: 20
[LightGBM] [Info] Start training from score 0.504480
0.8190716778682757
{'learning_rate': 0.22575, 'max_depth': None, 'n_estimators': 100}


In [40]:
y_pred = lightxbr_gcv.best_estimator_.predict(X_test)
submit = pd.DataFrame({'id':test['id'],
                       'FloodProbability':y_pred})
submit.to_csv("light.csv", index=False)