<a href="https://www.kaggle.com/code/klyushnik/keras-model?scriptVersionId=180882279" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory, in os.walk order.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s4e5/sample_submission.csv
/kaggle/input/playground-series-s4e5/train.csv
/kaggle/input/playground-series-s4e5/test.csv


# Previous notebook: https://www.kaggle.com/code/klyushnik/blend-ensemble-machine-learning

In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Image
%matplotlib inline

# Load the data and print basic information

In [3]:
# Read the competition's train and test tables and report their sizes.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s4e5/train.csv',
        '/kaggle/input/playground-series-s4e5/test.csv',
    )
)

print('Shape train is', train.shape, 'shape test is', test.shape)

Shape train is (1117957, 22) shape test is (745305, 21)


In [4]:
# The row identifier carries no signal, so remove it from both tables.
train, test = (frame.drop(columns=['id']) for frame in (train, test))

print('Shape train is', train.shape, 'shape test is', test.shape)

Shape train is (1117957, 21) shape test is (745305, 20)


In [5]:
# Separate the regression target from the predictors; the test table has predictors only.
y = train['FloodProbability']
X = train.drop(columns=['FloodProbability'], axis=1)
X_total = test

print(f"X shape :{X.shape} , y shape :{y.shape}, X_total shape :{X_total.shape}")

# Cast the predictors of both tables to int32.
X, X_total = X.astype('int32'), X_total.astype('int32')

# Keep untouched copies of the original predictors; later cells add a column to them.
X_df, X_df_test = X.copy(), X_total.copy()

X shape :(1117957, 20) , y shape :(1117957,), X_total shape :(745305, 20)


# Add new features (group-wise aggregates)

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin

class AggFeatureExtractor(BaseEstimator, TransformerMixin):
    """Group-wise aggregate features looked up by the values of ``group_col``.

    ``fit`` builds a table with one row per distinct combination of the
    ``group_col`` values and one column per (aggregated column, function)
    pair. ``transform`` left-joins that table onto the input rows and returns
    only the aggregated columns, named
    ``{concatenated group columns}_{function}_{aggregated column}``.

    Missing aggregate values (a group absent from the fitted table, or an
    aggregate that is NaN for a group) are imputed with the median of the
    corresponding aggregated column taken over the fitted group table.

    Parameters
    ----------
    group_col : list of str
        Columns whose value combinations define the groups.
    agg_col : list of str
        Columns to aggregate inside each group.
    agg_func : list of str
        Aggregation function names passed to ``GroupBy.agg``.
    """

    def __init__(self, group_col, agg_col, agg_func):
        self.group_col = group_col
        # Group column names joined without a separator; prefix of every feature name.
        self.group_col_name = ''.join(group_col)
        self.agg_col = agg_col
        self.agg_func = agg_func
        self.agg_df = None        # per-group aggregate table, set by fit()
        self.medians = None       # medians of the raw agg_col columns, set by fit()
        self.agg_medians = None   # medians of the aggregated columns, set by fit()

    def _feature_names(self):
        """Return the output column names, ordered column-major then function."""
        return [
            f'{self.group_col_name}_{agg}_{_agg_col}'
            for _agg_col in self.agg_col
            for agg in self.agg_func
        ]

    def fit(self, X, y=None):
        """Compute the per-group aggregate table and the imputation values from ``X``.

        ``y`` is accepted for scikit-learn API compatibility and ignored.
        Returns ``self``.
        """
        self.agg_df = X.groupby(self.group_col)[self.agg_col].agg(self.agg_func)
        # Flatten the (column, function) MultiIndex into plain feature names.
        self.agg_df.columns = self._feature_names()
        # Kept for backward compatibility: medians of the raw columns.
        self.medians = X[self.agg_col].median()
        # Imputation values must be keyed by the *output* column names. The raw
        # medians above are keyed by the input names, so DataFrame.fillna would
        # never match them against the aggregated columns.
        self.agg_medians = self.agg_df.median()

        return self

    def transform(self, X):
        """Return the aggregated features for each row of ``X``.

        The result has the same index as ``X`` and contains only the
        aggregated columns.
        """
        X_merged = pd.merge(X, self.agg_df, left_on=self.group_col, right_index=True, how='left')
        X_agg = X_merged.loc[:, self._feature_names()]

        # Per-column imputation; columns whose median is itself NaN stay NaN.
        return X_agg.fillna(self.agg_medians)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its aggregated features."""
        self.fit(X, y)
        X_agg = self.transform(X)
        return X_agg

In [7]:
class Preprocessor:
    """Append group-wise aggregate features to a train/test pair of frames.

    Parameters
    ----------
    agg_col : list of str
        Columns aggregated inside every group.
    agg_func : list of str
        Aggregation function names applied to each aggregated column.
    group_cols : list of list of str
        One entry per grouping; each entry lists the columns that define it.
    """

    def __init__(self, agg_col, agg_func, group_cols):
        self.agg_col = agg_col
        self.agg_func = agg_func
        self.group_cols = group_cols

    def preprocess(self, X, X_total):
        """Return ``(X, X_total)`` with the aggregate features appended as columns.

        Each extractor is fitted on the row-wise concatenation of both frames
        and then applied to each frame separately. Remaining NaN values are
        replaced with 0. The input frames are not modified.
        """
        # The stacked frame is the same for every grouping, so build it once
        # instead of re-concatenating both frames on each loop iteration.
        combined = pd.concat([X, X_total], axis=0)

        agg_train, agg_test = [], []
        for group_col in self.group_cols:
            agg_extractor = AggFeatureExtractor(group_col=group_col, agg_col=self.agg_col, agg_func=self.agg_func)
            agg_extractor.fit(combined)
            agg_train.append(agg_extractor.transform(X))
            agg_test.append(agg_extractor.transform(X_total))

        X_out = pd.concat([X] + agg_train, axis=1).fillna(0)
        X_total_out = pd.concat([X_total] + agg_test, axis=1).fillna(0)
        return X_out, X_total_out

In [8]:
# The 20 predictor columns of the data set, in their original order.
list_of_choice = [
    'MonsoonIntensity',
    'TopographyDrainage',
    'RiverManagement',
    'Deforestation',
    'Urbanization',
    'ClimateChange',
    'DamsQuality',
    'Siltation',
    'AgriculturalPractices',
    'Encroachments',
    'IneffectiveDisasterPreparedness',
    'DrainageSystems',
    'CoastalVulnerability',
    'Landslides',
    'Watersheds',
    'DeterioratingInfrastructure',
    'PopulationScore',
    'WetlandLoss',
    'InadequatePlanning',
    'PoliticalFactors',
]

In [9]:
# Every predictor is aggregated inside each group.
agg_col = list_of_choice

# Statistics computed per group for each aggregated column.
agg_func = ['mean', 'std', 'median']

# Grouping keys: six column pairs followed by five column triples.
pair_groups = [
    ['MonsoonIntensity', 'TopographyDrainage'],
    ['RiverManagement', 'Deforestation'],
    ['AgriculturalPractices', 'Encroachments'],
    ['IneffectiveDisasterPreparedness', 'DrainageSystems'],
    ['CoastalVulnerability', 'Landslides'],
    ['Watersheds', 'DeterioratingInfrastructure'],
]
triple_groups = [
    ['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement'],
    ['CoastalVulnerability', 'Landslides', 'Watersheds'],
    ['DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss'],
    ['Urbanization', 'ClimateChange', 'Watersheds'],
    ['MonsoonIntensity', 'DrainageSystems', 'PoliticalFactors'],
]
group_cols = pair_groups + triple_groups

# Append the aggregate features to both the train and the test predictors.
pp = Preprocessor(agg_col, agg_func, group_cols)
X, X_total = pp.preprocess(X, X_total)

print(f"X shape :{X.shape} , y shape :{y.shape}, X_test shape :{X_total.shape}")

X shape :(1117957, 680) , y shape :(1117957,), X_test shape :(745305, 680)


# Variance threshold feature selection

In [10]:
from sklearn.feature_selection import VarianceThreshold

def variance_threshold(df, th):
    """Return the columns of ``df`` that a ``VarianceThreshold(threshold=th)`` keeps.

    The selector is fitted on ``df`` itself; the result is a column subset of
    ``df`` in the original column order.
    """
    selector = VarianceThreshold(threshold=th)
    selector.fit(df)
    keep_mask = selector.get_support()  # boolean mask, one entry per column
    return df.loc[:, keep_mask]

In [11]:
# Select columns on the training predictors, then keep the same columns in the test predictors.
X = variance_threshold(X, 0.01)
list_name = X.columns
X_total = X_total.loc[:, list_name]

print(f"X shape :{X.shape} , y shape :{y.shape}, X_total shape :{X_total.shape}")

X shape :(1117957, 201) , y shape :(1117957,), X_total shape :(745305, 201)


# principal component analysis (PCA)

In [12]:
from sklearn.decomposition import PCA

# Summarise the selected aggregate features by their first principal component.
pca = PCA(n_components=1)
principalComponents = pca.fit_transform(X)
# Project the test rows with the component learned on the training rows.
# Re-fitting on the test set would give a different axis (and possibly a
# flipped sign), so the train and test columns would not be comparable.
principalComponents_test = pca.transform(X_total)

principalDf = pd.DataFrame(data=principalComponents,
                           columns=['principal component 1'])
principalDftest = pd.DataFrame(data=principalComponents_test,
                               columns=['principal component 1'])

# Assign the raw 1-D arrays so the values are matched by position and do not
# depend on index alignment between the frames.
X_df['principal component_1'] = principalComponents[:, 0]
X_df_test['principal component_1'] = principalComponents_test[:, 0]
print(f"X shape :{X_df.shape} , y shape :{y.shape}, X_test shape :{X_df_test.shape}")

X shape :(1117957, 21) , y shape :(1117957,), X_test shape :(745305, 21)


In [13]:
# From here on, model on the original predictors plus the principal component column.
X, X_total = X_df, X_df_test
print(f"X shape :{X.shape} , y shape :{y.shape}, X_test shape :{X_total.shape}")

X shape :(1117957, 21) , y shape :(1117957,), X_test shape :(745305, 21)


# Keras model

In [14]:
from sklearn.model_selection import KFold
import keras
from keras import layers
import tensorflow as tf

# ! pip install scikeras
from sklearn.model_selection import train_test_split

from sklearn.model_selection import StratifiedKFold, KFold, StratifiedGroupKFold, RepeatedStratifiedKFold,RepeatedKFold, cross_validate
from keras.layers import BatchNormalization, Flatten, Dense, Dropout
import keras
import keras_tuner
from tensorflow.keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import cross_val_score
from optuna.trial import TrialState

2024-06-01 07:12:44.354264: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-01 07:12:44.354494: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-01 07:12:44.519704: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [15]:
# Choose a tf.distribute strategy: TPU if one can be resolved, otherwise
# multiple GPUs, a single GPU, or the CPU.
gpus = []  # always bound, so the branches below never hit an undefined name
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver() # TPU detection
except ValueError:
    # No TPU could be resolved; fall back to whatever GPUs are visible.
    tpu = None
    gpus = tf.config.experimental.list_logical_devices("GPU")

if tpu:
    # The runtime must be connected to the TPU cluster before the TPU system
    # is initialised.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # Stable name of the strategy; the `experimental` alias is deprecated.
    strategy = tf.distribute.TPUStrategy(tpu)
    print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
elif len(gpus) > 1:
    strategy = tf.distribute.MirroredStrategy([gpu.name for gpu in gpus])
    print('Running on multiple GPUs ', [gpu.name for gpu in gpus])
elif len(gpus) == 1:
    strategy = tf.distribute.get_strategy()
    print('Running on single GPU ', gpus[0].name)
else:
    strategy = tf.distribute.get_strategy()
    print('Running on CPU')
print("Number of accelerators: ", strategy.num_replicas_in_sync)

Running on CPU
Number of accelerators:  1


# Train the model

In [16]:
input_shape = [X.shape[1]]


def build_model(input_shape, hidden_units=(128, 256, 256, 128, 64), dropout_rate=0.2):
    """Build the fully connected regression network (uncompiled).

    Architecture: batch-normalised input, then for every entry of
    ``hidden_units`` a ReLU ``Dense`` layer followed by ``BatchNormalization``
    and ``Dropout``, and finally a single sigmoid unit.

    Parameters
    ----------
    input_shape : sequence of int
        Shape of one sample, without the batch dimension.
    hidden_units : sequence of int
        Width of each hidden ``Dense`` layer, in order.
    dropout_rate : float
        Dropout rate applied after each hidden block.

    Returns
    -------
    keras.Sequential
    """
    stack = [
        # Declare the input explicitly: passing `input_shape=` to the first
        # layer makes Keras emit a warning recommending an Input object.
        keras.Input(shape=tuple(input_shape)),
        layers.BatchNormalization(),
        layers.Flatten(),
    ]
    for units in hidden_units:
        stack += [
            layers.Dense(units, activation='relu'),
            layers.BatchNormalization(),
            layers.Dropout(dropout_rate),
        ]
    stack.append(layers.Dense(1, activation='sigmoid'))
    return keras.Sequential(stack)


model = build_model(input_shape)
model.summary()

  super().__init__(**kwargs)


In [17]:
from sklearn.metrics import mean_squared_error

# K-fold training: one freshly initialised network per fold, scored by RMSE on
# the held-out part; test predictions are averaged over the folds.
predictions_kears = np.zeros(len(X_total))
fold_scores = []

FOLDs = KFold(n_splits=2, shuffle=True, random_state=42)

for fold_, (trn_idx, val_idx) in enumerate(FOLDs.split(X, y)):
    X_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # Same architecture, new weights. Training the shared `model` again would
    # start each fold from weights already fitted on that fold's validation
    # rows, which makes the validation score meaningless. `model` itself stays
    # an untrained template.
    fold_model = keras.models.clone_model(model)

    # Watch the validation metric, not the training one. min_delta has to be
    # well below the metric's magnitude (val_mse is around 4e-4 in the recorded
    # run), otherwise no epoch after the first counts as an improvement.
    early_stopping = keras.callbacks.EarlyStopping(monitor="val_mse", mode="min",
                                                   patience=5,
                                                   min_delta=1e-6,
                                                   restore_best_weights=True,)
    # Shorter patience than early stopping, so a reduced learning rate gets a
    # chance to help before training is stopped.
    reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.1,
                                                  patience=2, mode='min', verbose=1,)

    fold_model.compile(loss='mean_squared_logarithmic_error', optimizer='adam', metrics=['mse'])

    fold_model.fit(X_trn, y_trn,
                   batch_size=100,
                   epochs=20,
                   validation_data=(X_val, y_val),
                   callbacks=[reduce_lr, early_stopping],)

    # predict() returns shape (n, 1); flatten it to match y.
    y_pred = fold_model.predict(X_val).ravel()
    # RMSE via sqrt: the `squared=False` argument is deprecated in scikit-learn.
    scores = np.sqrt(mean_squared_error(y_val, y_pred))
    fold_scores.append(scores)

    # Accumulate the fold average instead of keeping only the last fold.
    predictions_kears += fold_model.predict(X_total).ravel() / FOLDs.get_n_splits()

    print(scores)

print('Mean RMSE over folds:', np.mean(fold_scores))

Epoch 1/20
[1m5590/5590[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 11ms/step - loss: 0.0026 - mse: 0.0059 - val_loss: 1.8635e-04 - val_mse: 4.1734e-04 - learning_rate: 0.0010
Epoch 2/20
[1m5590/5590[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 11ms/step - loss: 2.2947e-04 - mse: 5.1436e-04 - val_loss: 1.7576e-04 - val_mse: 3.9149e-04 - learning_rate: 0.0010
Epoch 3/20
[1m5590/5590[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 11ms/step - loss: 2.1545e-04 - mse: 4.8339e-04 - val_loss: 1.6704e-04 - val_mse: 3.7420e-04 - learning_rate: 0.0010
Epoch 4/20
[1m5590/5590[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 11ms/step - loss: 2.1043e-04 - mse: 4.7202e-04 - val_loss: 1.6946e-04 - val_mse: 3.8028e-04 - learning_rate: 0.0010
Epoch 5/20
[1m5590/5590[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 11ms/step - loss: 2.0738e-04 - mse: 4.6532e-04 - val_loss: 1.6575e-04 - val_mse: 3.7105e-04 - learning_rate: 0.0010
Epoch 6/20
[1m5587/5590[0m

# submit

In [18]:
# Fill the sample submission with the model's predictions and write it out.
submission_template_path = '/kaggle/input/playground-series-s4e5/sample_submission.csv'
sample = pd.read_csv(submission_template_path)
sample['FloodProbability'] = predictions_kears
sample.to_csv('submission.csv', index=False)
sample

Unnamed: 0,id,FloodProbability
0,1117957,0.576750
1,1117958,0.461900
2,1117959,0.456346
3,1117960,0.472944
4,1117961,0.472227
...,...,...
745300,1863257,0.477030
745301,1863258,0.456016
745302,1863259,0.624277
745303,1863260,0.554099
