In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, log_loss, r2_score
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.svm import SVC, SVR
from sklearn.linear_model import ElasticNet, LinearRegression
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import BaseEnsemble, VotingClassifier, GradientBoostingClassifier, RandomForestClassifier, StackingClassifier, StackingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import warnings
warnings.filterwarnings('ignore')

In [20]:
train = pd.read_csv("train.csv", index_col=0)
print(train.isnull().sum().sum())
test = pd.read_csv("test.csv")
print(test.isnull().sum().sum())

0
0


In [21]:
X_train = train.drop('FloodProbability', axis=1)
y_train = train['FloodProbability']
X_test = test.drop('id', axis=1)


In [25]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1117957 entries, 0 to 1117956
Data columns (total 21 columns):
 #   Column                           Non-Null Count    Dtype  
---  ------                           --------------    -----  
 0   MonsoonIntensity                 1117957 non-null  int64  
 1   TopographyDrainage               1117957 non-null  int64  
 2   RiverManagement                  1117957 non-null  int64  
 3   Deforestation                    1117957 non-null  int64  
 4   Urbanization                     1117957 non-null  int64  
 5   ClimateChange                    1117957 non-null  int64  
 6   DamsQuality                      1117957 non-null  int64  
 7   Siltation                        1117957 non-null  int64  
 8   AgriculturalPractices            1117957 non-null  int64  
 9   Encroachments                    1117957 non-null  int64  
 10  IneffectiveDisasterPreparedness  1117957 non-null  int64  
 11  DrainageSystems                  1117957 non-null  int6

In [30]:
lr = LinearRegression()
xgboost = XGBRegressor(random_state=24)
ela = ElasticNet(random_state=24)
cat = CatBoostRegressor(random_state=24)
light = LGBMRegressor(random_state=24)

stack = StackingRegressor([('LR', lr), ('LIG', light), ('CAT', cat), ('ELA', ela)], final_estimator=light, passthrough=True)



In [31]:
stack.fit(X_train, y_train)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.084916 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 347
[LightGBM] [Info] Number of data points in the train set: 1117957, number of used features: 20
[LightGBM] [Info] Start training from score 0.504480
Learning rate set to 0.124117
0:	learn: 0.0501323	total: 231ms	remaining: 3m 50s
1:	learn: 0.0492413	total: 311ms	remaining: 2m 35s
2:	learn: 0.0483603	total: 390ms	remaining: 2m 9s
3:	learn: 0.0475582	total: 467ms	remaining: 1m 56s
4:	learn: 0.0467916	total: 545ms	remaining: 1m 48s
5:	learn: 0.0460372	total: 621ms	remaining: 1m 42s
6:	learn: 0.0453183	total: 697ms	remaining: 1m 38s
7:	learn: 0.0446522	total: 773ms	remaining: 1m 35s
8:	learn: 0.0439859	total: 854ms	remaining: 1m 34s
9:	learn: 0.0433289	total: 929ms	remaining: 1m 32s
10:	learn: 0.0427493	total: 1s	remaining: 1m 30s
11:	learn: 0.0421723	total: 1.07s	remaining: 1m 28s
12:	learn: 0.0416

In [32]:
y_pred = stack.predict(X_test)




In [33]:
submit = pd.DataFrame({'id':test['id'],
                       'FloodProbability':y_pred})
submit.to_csv("stacking2.csv", index=False)

In [29]:
stack.get_params()

{'cv': None,
 'estimators': [('LR', LinearRegression()),
  ('XGB',
   XGBRegressor(base_score=None, booster=None, callbacks=None,
                colsample_bylevel=None, colsample_bynode=None,
                colsample_bytree=None, device=None, early_stopping_rounds=None,
                enable_categorical=False, eval_metric=None, feature_types=None,
                gamma=None, grow_policy=None, importance_type=None,
                interaction_constraints=None, learning_rate=None, max_bin=None,
                max_cat_threshold=None, max_cat_to_onehot=None,
                max_delta_step=None, max_depth=None, max_leaves=None,
                min_child_weight=None, missing=nan, monotone_constraints=None,
                multi_strategy=None, n_estimators=None, n_jobs=None,
                num_parallel_tree=None, random_state=24, ...)),
  ('ELA', ElasticNet(random_state=24))],
 'final_estimator__boosting_type': 'gbdt',
 'final_estimator__class_weight': None,
 'final_estimator__colsample_

In [None]:
params = {'final_estimator__learning_rate': np.linspace(0.001, 0.9,10), 
          'max_depth': [None,3,2,4], 
          'n_estimators':[25, 50 ,100],  
          }