In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, log_loss, r2_score
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.svm import SVC, SVR
from sklearn.linear_model import ElasticNet, LinearRegression
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import BaseEnsemble, VotingClassifier, GradientBoostingClassifier, RandomForestClassifier, StackingClassifier, StackingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import warnings
warnings.filterwarnings('ignore')

In [24]:
train = pd.read_csv("train.csv", index_col=0)
print(train.isnull().sum().sum())
test = pd.read_csv("test.csv")
print(test.isnull().sum().sum())

0
0


In [13]:

clf = IsolationForest(contamination=0.05, random_state=24)
clf.fit(train)
predictions = clf.predict(train)

print("%age of outliers=" + str((predictions < 0).mean()*100) + "%")
abn_ind = np.where(predictions < 0)
print("Outliers:")
print(train.index[abn_ind])

###### Visualization of Outliers ################
scaler = StandardScaler()
scaled_df = scaler.fit_transform(train)
prcomp = PCA()
scores = prcomp.fit_transform(scaled_df)

print(np.cumsum(prcomp.explained_variance_ratio_))

obs = np.where(predictions == -1, "Outlier", "Inlier")
PCs = pd.DataFrame({'PC1': scores[:, 0], 'PC2': scores[:, 1],
                    'Class': obs})

# sns.scatterplot(data=PCs, x='PC1',
#                 y='PC2', hue='Class')
# for i in np.arange(0, train.shape[0]):
#     plt.text(scores[i, 0], scores[i, 1],
#              list(train.index)[i], fontsize=6)
# plt.legend(loc='best')
# plt.title("PCA")
# plt.show()


%age of outliers=5.000013417331794%
Outliers:
Index([     28,      44,      71,      75,      83,     112,     130,     144,
           162,     170,
       ...
       1117452, 1117489, 1117536, 1117542, 1117651, 1117729, 1117754, 1117788,
       1117848, 1117859],
      dtype='int64', name='id', length=55898)
[0.08240734 0.13096451 0.17939833 0.22778371 0.27614492 0.32442495
 0.37266841 0.42088768 0.46909146 0.51726506 0.56535572 0.61340907
 0.66141748 0.70940676 0.75734804 0.80527667 0.85317254 0.90103792
 0.94886845 0.99657869 1.        ]


In [27]:
train.drop(train.index[abn_ind], axis=0, inplace=True)

In [28]:
train.shape

(1062059, 21)

In [29]:
X_train = train.drop('FloodProbability', axis=1)
y_train = train['FloodProbability']
X_test = test.drop('id', axis=1)

In [20]:

X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1117957 entries, 0 to 1117956
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   PC1     1117957 non-null  float64
 1   PC2     1117957 non-null  float64
dtypes: float64(2)
memory usage: 17.1 MB


In [30]:
lr = LinearRegression()
xgboost = XGBRegressor(random_state=24)
ela = ElasticNet(random_state=24)
cat = CatBoostRegressor(random_state=24)
light = LGBMRegressor(random_state=24)

stack = StackingRegressor([('LR', lr), ('LIG', light), ('CAT', cat), ('ELA', ela)], final_estimator=light, passthrough=True)



In [31]:
stack.fit(X_train, y_train)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.088902 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 349
[LightGBM] [Info] Number of data points in the train set: 1062059, number of used features: 20
[LightGBM] [Info] Start training from score 0.503247
Learning rate set to 0.123115
0:	learn: 0.0486195	total: 89.4ms	remaining: 1m 29s
1:	learn: 0.0477685	total: 166ms	remaining: 1m 22s
2:	learn: 0.0469188	total: 243ms	remaining: 1m 20s
3:	learn: 0.0461669	total: 320ms	remaining: 1m 19s
4:	learn: 0.0454546	total: 394ms	remaining: 1m 18s
5:	learn: 0.0447472	total: 467ms	remaining: 1m 17s
6:	learn: 0.0440810	total: 541ms	remaining: 1m 16s
7:	learn: 0.0434518	total: 611ms	remaining: 1m 15s
8:	learn: 0.0428222	total: 681ms	remaining: 1m 14s
9:	learn: 0.0422032	total: 755ms	remaining: 1m 14s
10:	learn: 0.0416202	total: 820ms	remaining: 1m 13s
11:	learn: 0.0410568	total: 883ms	remaining: 1m 12s
12:	learn: 0

In [32]:
y_pred = stack.predict(X_test)


In [34]:
submit = pd.DataFrame({'id':test['id'],
                       'FloodProbability':y_pred})
submit.to_csv("Flood_wo_Outliers.csv", index=False)