In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, KFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, log_loss, r2_score
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.svm import SVC, SVR
from sklearn.pipeline import Pipeline
from sklearn.ensemble import BaseEnsemble, VotingClassifier, VotingRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv(r'train.csv')
train.head()

Unnamed: 0,id,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,...,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors,FloodProbability
0,0,5,8,5,8,6,4,4,3,3,...,5,3,3,5,4,7,5,7,3,0.445
1,1,6,7,4,4,8,8,3,5,4,...,7,2,0,3,5,3,3,4,3,0.45
2,2,6,5,6,7,3,7,1,5,4,...,7,3,7,5,6,8,2,3,3,0.53
3,3,3,4,6,5,4,8,4,7,6,...,2,4,7,4,4,6,5,7,5,0.535
4,4,5,3,2,6,4,4,3,3,3,...,2,2,6,6,4,1,2,3,5,0.415


In [3]:
test = pd.read_csv(r'test.csv')

In [4]:
y = train['FloodProbability']
X = train.drop('FloodProbability', axis=1)

In [5]:
train.isna().sum()

id                                 0
MonsoonIntensity                   0
TopographyDrainage                 0
RiverManagement                    0
Deforestation                      0
Urbanization                       0
ClimateChange                      0
DamsQuality                        0
Siltation                          0
AgriculturalPractices              0
Encroachments                      0
IneffectiveDisasterPreparedness    0
DrainageSystems                    0
CoastalVulnerability               0
Landslides                         0
Watersheds                         0
DeterioratingInfrastructure        0
PopulationScore                    0
WetlandLoss                        0
InadequatePlanning                 0
PoliticalFactors                   0
FloodProbability                   0
dtype: int64

In [6]:
test.isna().sum()

id                                 0
MonsoonIntensity                   0
TopographyDrainage                 0
RiverManagement                    0
Deforestation                      0
Urbanization                       0
ClimateChange                      0
DamsQuality                        0
Siltation                          0
AgriculturalPractices              0
Encroachments                      0
IneffectiveDisasterPreparedness    0
DrainageSystems                    0
CoastalVulnerability               0
Landslides                         0
Watersheds                         0
DeterioratingInfrastructure        0
PopulationScore                    0
WetlandLoss                        0
InadequatePlanning                 0
PoliticalFactors                   0
dtype: int64

In [7]:
lr = LinearRegression()
rid = Ridge()
las = Lasso()
dtr = DecisionTreeRegressor(random_state=24)

In [8]:
voting = VotingRegressor([('LR', lr), ('RID', rid), ('LAS', las),
                           ('TREE', dtr)])

In [9]:
kfold = KFold(n_splits=5, shuffle=True, random_state=24)
print(voting.get_params())

{'estimators': [('LR', LinearRegression()), ('RID', Ridge()), ('LAS', Lasso()), ('TREE', DecisionTreeRegressor(random_state=24))], 'n_jobs': None, 'verbose': False, 'weights': None, 'LR': LinearRegression(), 'RID': Ridge(), 'LAS': Lasso(), 'TREE': DecisionTreeRegressor(random_state=24), 'LR__copy_X': True, 'LR__fit_intercept': True, 'LR__n_jobs': None, 'LR__positive': False, 'RID__alpha': 1.0, 'RID__copy_X': True, 'RID__fit_intercept': True, 'RID__max_iter': None, 'RID__positive': False, 'RID__random_state': None, 'RID__solver': 'auto', 'RID__tol': 0.0001, 'LAS__alpha': 1.0, 'LAS__copy_X': True, 'LAS__fit_intercept': True, 'LAS__max_iter': 1000, 'LAS__positive': False, 'LAS__precompute': False, 'LAS__random_state': None, 'LAS__selection': 'cyclic', 'LAS__tol': 0.0001, 'LAS__warm_start': False, 'TREE__ccp_alpha': 0.0, 'TREE__criterion': 'squared_error', 'TREE__max_depth': None, 'TREE__max_features': None, 'TREE__max_leaf_nodes': None, 'TREE__min_impurity_decrease': 0.0, 'TREE__min_sampl

In [10]:
params = {'RID__alpha': np.linspace(0.001, 3, 5),
'LAS__alpha':np.linspace(0.001, 3, 5),
'TREE__max_depth': [None,3,4,5], 
'TREE__min_samples_split': [2,4,5],
'TREE__min_samples_leaf': [1,4,5]}

In [11]:
rgcv = RandomizedSearchCV(voting, param_distributions=params, cv=kfold, random_state=24, scoring='r2', n_jobs=-1, n_iter=10)
rgcv.fit(X, y)

In [12]:
print(rgcv.best_score_)
print(rgcv.best_params_)

0.7951221117146317
{'TREE__min_samples_split': 2, 'TREE__min_samples_leaf': 4, 'TREE__max_depth': 5, 'RID__alpha': 3.0, 'LAS__alpha': 0.001}


In [13]:
gcv = GridSearchCV(voting, param_grid=params, cv=kfold, 
                   scoring='r2', n_jobs=-1)
gcv.fit(X, y)

In [None]:
print(gcv.best_score_)
print(gcv.best_params_)