In [12]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier, ExtraTreesClassifier, ExtraTreesRegressor
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
%run feature_engineering.ipynb

# Preparing ensemble
X = train.loc[:, train.columns != 'rainfall']
y = train['rainfall']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Hyperparameter tuning for Random Forest
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
rf = RandomizedSearchCV(RandomForestClassifier(), rf_param_grid, n_iter=20, cv=5, n_jobs=-1)
rf.fit(X_train, y_train)
best_rf = rf.best_estimator_

# Hyperparameter tuning for XGBoost
xgb_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.2]
}
xgb = RandomizedSearchCV(XGBClassifier(use_label_encoder=False, eval_metric='logloss'), xgb_param_grid, n_iter=20, cv=5, n_jobs=-1)
xgb.fit(X_train, y_train)
best_xgb = xgb.best_estimator_

# Adding LightGBM
lgb = LGBMClassifier(n_estimators=200)
lgb.fit(X_train, y_train)

# ExtraTrees model
et_model = ExtraTreesClassifier(n_estimators=100, random_state=42)
et_model.fit(X_train, y_train)


# Ensemble Model

# ensemble = StackingClassifier(
#     estimators=[
#         ('rf', best_rf),
#         ('xgb', best_xgb),
#         ('lgb', lgb), 
#         ('et', et_model)
#     ],
#     final_estimator=LogisticRegression(),
#     cv=5  # Cross-validation for meta-model
# )


ensemble = VotingClassifier(estimators=[
    ('rf', best_rf),
    ('xgb', best_xgb),
    ('lgb', lgb), 
    ('et', et_model)
], voting='soft', weights=[2, 3, 3, 3])  # Weighted based on model performance
    
ensemble.fit(X_train, y_train)
ensemble_acc = ensemble.score(X_test, y_test)
print(f"Ensemble Model Accuracy: {ensemble_acc:.4f}")

# Define Neural Network
class NeuralNet(nn.Module):
    def __init__(self, input_size):
        super(NeuralNet, self).__init__()
        self.layer1 = nn.Linear(input_size, 128)
        self.batchnorm1 = nn.BatchNorm1d(128)
        self.layer2 = nn.Linear(128, 64)
        self.batchnorm2 = nn.BatchNorm1d(64)
        self.layer3 = nn.Linear(64, 32)
        self.output = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
    
    def forward(self, x):
        x = self.relu(self.batchnorm1(self.layer1(x)))
        x = self.dropout(x)
        x = self.relu(self.batchnorm2(self.layer2(x)))
        x = self.dropout(x)
        x = self.relu(self.layer3(x))
        x = self.output(x)
        return torch.sigmoid(x)

# Convert data for PyTorch
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

dataset = TensorDataset(X_train_tensor, y_train_tensor)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Train Neural Network
input_size = X_train.shape[1]
model = NeuralNet(input_size)
optimizer = optim.AdamW(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5, factor=0.5)
criterion = nn.BCELoss()

def train_nn(model, dataloader, optimizer, criterion, epochs=100):
    for epoch in range(epochs):
        total_loss = 0
        for batch in dataloader:
            X_batch, y_batch = batch
            optimizer.zero_grad()
            y_pred = model(X_batch)
            loss = criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        scheduler.step(total_loss)
        if epoch % 10 == 0:
            print(f"Epoch [{epoch}/{epochs}], Loss: {total_loss:.4f}")

train_nn(model, dataloader, optimizer, criterion)

# Evaluate Neural Network
y_pred_test = model(X_test_tensor).detach().numpy()
y_pred_test = (y_pred_test > 0.5).astype(int)
nn_acc = np.mean(y_pred_test == y_test_tensor.numpy())
print(f"Neural Network Accuracy: {nn_acc:.4f}")

Our features are:
['day', 'pressure', 'maxtemp', 'temperature', 'mintemp', 'dewpoint', 'humidity', 'cloud', 'sunshine', 'winddirection', 'windspeed', 'year_group', 'temperature_range', 'seasonal_sin']
There are 91 interaction features:
['day_pressure', 'day_maxtemp', 'day_temperature', 'day_mintemp', 'day_dewpoint', 'day_humidity', 'day_cloud', 'day_sunshine', 'day_winddirection', 'day_windspeed', 'day_year_group', 'day_temperature_range', 'day_seasonal_sin', 'pressure_maxtemp', 'pressure_temperature', 'pressure_mintemp', 'pressure_dewpoint', 'pressure_humidity', 'pressure_cloud', 'pressure_sunshine', 'pressure_winddirection', 'pressure_windspeed', 'pressure_year_group', 'pressure_temperature_range', 'pressure_seasonal_sin', 'maxtemp_temperature', 'maxtemp_mintemp', 'maxtemp_dewpoint', 'maxtemp_humidity', 'maxtemp_cloud', 'maxtemp_sunshine', 'maxtemp_winddirection', 'maxtemp_windspeed', 'maxtemp_year_group', 'maxtemp_temperature_range', 'maxtemp_seasonal_sin', 'temperature_mintemp', 't

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


[LightGBM] [Info] Number of positive: 1331, number of negative: 421
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000388 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6171
[LightGBM] [Info] Number of data points in the train set: 1752, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.759703 -> initscore=1.151053
[LightGBM] [Info] Start training from score 1.151053


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 1331, number of negative: 421
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000317 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6171
[LightGBM] [Info] Number of data points in the train set: 1752, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.759703 -> initscore=1.151053
[LightGBM] [Info] Start training from score 1.151053
Ensemble Model Accuracy: 0.8402
Epoch [0/100], Loss: 27.0120
Epoch [10/100], Loss: 17.5460
Epoch [20/100], Loss: 17.6133
Epoch [30/100], Loss: 16.9113
Epoch [40/100], Loss: 16.4862
Epoch [50/100], Loss: 15.5420
Epoch [60/100], Loss: 14.9985
Epoch [70/100], Loss: 14.4968
Epoch [80/100], Loss: 14.5966
Epoch [90/100], Loss: 13.7174
Neural Network Accuracy: 0.8402
