## Imports

In [50]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import sklearn

from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, f1_score, precision_score, recall_score

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import torch

In [57]:
## Loading test and train data
# The first CSV column is used as the row index of each frame.
train_df = pd.read_csv("data/train_with_features.csv", index_col=0)

test_df = pd.read_csv("data/test_with_features.csv", index_col=0)

target_cols = ['rainfall']
# Every training column except the target and the `id` column is a model input.
non_feature_cols = target_cols + ['id']
feature_cols = [col for col in train_df.columns if col not in non_feature_cols]

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only rendered an extra "None".
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 730 entries, 0 to 729
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 730 non-null    int64  
 1   day                730 non-null    int64  
 2   pressure           730 non-null    float64
 3   maxtemp            730 non-null    float64
 4   temparature        730 non-null    float64
 5   mintemp            730 non-null    float64
 6   dewpoint           730 non-null    float64
 7   humidity           730 non-null    float64
 8   cloud              730 non-null    float64
 9   sunshine           730 non-null    float64
 10  winddirection      730 non-null    float64
 11  windspeed          730 non-null    float64
 12  rainfall           730 non-null    float64
 13  month              730 non-null    int64  
 14  day_of_month       730 non-null    int64  
 15  rainfall_lag_730   730 non-null    float64
 16  rainfall_lag_1095  730 non-null

None

## Normalizing with min max scaler because all columns have an upper bound

In [58]:
# Learn per-feature min/max from the training frame only, then apply the same
# mapping to both frames so train and test share one scale.
# NOTE(review): the scaler is fit on the full training frame before the
# cross-validation cells run, so validation-fold rows contribute to min/max.
scaler = MinMaxScaler()
scaler.fit(train_df[feature_cols].copy())

for frame in (train_df, test_df):
    # Overwrites the feature columns of the frame in place with scaled values.
    frame[feature_cols] = scaler.transform(frame[feature_cols].copy())


## Random Forest Classifier

In [78]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate

# Random forest: 5-fold cross-validation, then average the fold estimators'
# probabilities on the train and test frames.
model = RandomForestClassifier(n_estimators=2000, max_depth=10, random_state=42)

# Metrics passed to cross_validate; the XGBoost and KNN cells reuse this list.
scorings = ['accuracy', 'f1', 'recall', 'precision', 'roc_auc']

cross_val_scores = cross_validate(
    model,
    train_df[feature_cols],
    train_df[target_cols].values.flatten(),
    cv=5,
    scoring=scorings,
    n_jobs=4,
    return_estimator=True,  # keep the fitted fold models for the predictions below
)

print('Cross-validation scores:')
for scoring, score in cross_val_scores.items():
    if scoring == 'estimator':
        continue  # list of fitted models, not an array of scores
    print(f'{scoring}: {score.mean():.4f} ± {score.std():.4f}')

# The fitted fold models get their own name so that `model` keeps referring to
# the estimator configured above instead of being rebound by a loop variable.
rf_fold_models = cross_val_scores['estimator']

# Column 1 of predict_proba is the probability of the second entry in `classes_`;
# the per-fold probabilities are averaged element-wise.
train_preds = np.mean(
    [fold_model.predict_proba(train_df[feature_cols])[:, 1] for fold_model in rf_fold_models],
    axis=0,
)
test_preds = np.mean(
    [fold_model.predict_proba(test_df[feature_cols])[:, 1] for fold_model in rf_fold_models],
    axis=0,
)

# With cv=5 every fold model was fit on 4/5 of these rows, so this score is
# mostly in-sample and is not a substitute for the cross-validated ROC AUC.
print(f'ROC AUC score on train: {roc_auc_score(train_df[target_cols], train_preds):.4f}')

test_df['rf_rainfall'] = test_preds

Cross-validation scores:
fit_time: 3.9057 ± 0.2350
score_time: 0.1810 ± 0.0098
test_accuracy: 0.8644 ± 0.0139
test_f1: 0.9121 ± 0.0083
test_recall: 0.9327 ± 0.0023
test_precision: 0.8924 ± 0.0152
test_roc_auc: 0.8843 ± 0.0200
ROC AUC score on train: 0.9906


## XGBoost

In [77]:
from xgboost import XGBClassifier

# XGBoost: same evaluation procedure as the Random Forest cell.
# NOTE: relies on `cross_validate` and `scorings` defined in the Random Forest cell.
model = XGBClassifier(n_estimators=2000, max_depth=5, random_state=42)

cross_val_scores = cross_validate(
    model,
    train_df[feature_cols],
    # Flattened to a 1-D label vector, matching the Random Forest and KNN cells
    # (this cell previously passed the (n, 1) column array).
    train_df[target_cols].values.flatten(),
    cv=5,
    scoring=scorings,
    n_jobs=4,
    return_estimator=True,  # keep the fitted fold models for the predictions below
)

print('Cross-validation scores:')
for scoring, score in cross_val_scores.items():
    if scoring == 'estimator':
        continue  # list of fitted models, not an array of scores
    print(f'{scoring}: {score.mean():.4f} ± {score.std():.4f}')

# The fitted fold models get their own name so that `model` keeps referring to
# the estimator configured above instead of being rebound by a loop variable.
xgb_fold_models = cross_val_scores['estimator']

# Column 1 of predict_proba is the probability of the second entry in `classes_`;
# the per-fold probabilities are averaged element-wise.
train_preds = np.mean(
    [fold_model.predict_proba(train_df[feature_cols])[:, 1] for fold_model in xgb_fold_models],
    axis=0,
)
test_preds = np.mean(
    [fold_model.predict_proba(test_df[feature_cols])[:, 1] for fold_model in xgb_fold_models],
    axis=0,
)

# With cv=5 every fold model was fit on 4/5 of these rows, so this score is
# mostly in-sample and is not a substitute for the cross-validated ROC AUC.
print(f'ROC AUC score on train: {roc_auc_score(train_df[target_cols], train_preds):.4f}')

test_df['xgb_rainfall'] = test_preds


Cross-validation scores:
fit_time: 0.5387 ± 0.1180
score_time: 0.0179 ± 0.0043
test_accuracy: 0.8511 ± 0.0089
test_f1: 0.9033 ± 0.0051
test_recall: 0.9224 ± 0.0104
test_precision: 0.8852 ± 0.0144
test_roc_auc: 0.8597 ± 0.0179
ROC AUC score on train: 1.0000


## KNN Classifier

In [75]:
from sklearn.neighbors import KNeighborsClassifier

# K-nearest neighbours: same evaluation procedure as the Random Forest cell.
# NOTE: relies on `cross_validate` and `scorings` defined in the Random Forest cell.
model = KNeighborsClassifier(n_neighbors=100, n_jobs=4, p=1)

cross_val_scores = cross_validate(
    model,
    train_df[feature_cols],
    train_df[target_cols].values.flatten(),
    cv=5,
    scoring=scorings,
    n_jobs=4,
    return_estimator=True,  # keep the fitted fold models for the predictions below
)

print('Cross-validation scores:')
for scoring, score in cross_val_scores.items():
    if scoring == 'estimator':
        continue  # list of fitted models, not an array of scores
    print(f'{scoring}: {score.mean():.4f} ± {score.std():.4f}')

# The fitted fold models get their own name so that `model` keeps referring to
# the estimator configured above instead of being rebound by a loop variable.
knn_fold_models = cross_val_scores['estimator']

# Column 1 of predict_proba is the probability of the second entry in `classes_`;
# the per-fold probabilities are averaged element-wise.
train_preds = np.mean(
    [fold_model.predict_proba(train_df[feature_cols])[:, 1] for fold_model in knn_fold_models],
    axis=0,
)
test_preds = np.mean(
    [fold_model.predict_proba(test_df[feature_cols])[:, 1] for fold_model in knn_fold_models],
    axis=0,
)

# With cv=5 every fold model was fit on 4/5 of these rows, so this score is
# mostly in-sample and is not a substitute for the cross-validated ROC AUC.
print(f'ROC AUC score on train: {roc_auc_score(train_df[target_cols], train_preds):.4f}')

test_df['knn_rainfall'] = test_preds


Cross-validation scores:
fit_time: 0.0027 ± 0.0001
score_time: 0.0580 ± 0.0023
test_accuracy: 0.8338 ± 0.0191
test_f1: 0.8986 ± 0.0106
test_recall: 0.9764 ± 0.0023
test_precision: 0.8325 ± 0.0172
test_roc_auc: 0.8697 ± 0.0165
ROC AUC score on train: 0.8832


## Trying an Ensemble of the Predictors' Predictions

In [None]:
import os

from scipy.stats import rankdata

# Blend an external submission with the three local models and write the
# result as the submission file.
best_pub = pd.read_csv('data/best_public.csv')
# Replace the raw scores by their rank divided by the row count.
best_pub['rainfall'] = rankdata(best_pub['rainfall']) / len(best_pub)

# Pair the external predictions with test_df rows by `id`. Combining the two
# frames directly would align on the pandas index, which silently mis-pairs
# rows (or yields NaN) whenever the files differ in row order or index labels.
best_pub_by_id = best_pub.set_index('id')['rainfall']
best_pub_aligned = test_df['id'].map(best_pub_by_id)
if best_pub_aligned.isna().any():
    raise ValueError("data/best_public.csv has no prediction for some ids in test_df")

# Blend weights sum to 1.0 (1.07 - 0.005 - 0.06 - 0.005); the local models
# enter with small negative weights.
ensemble_prediction = (
    1.07 * best_pub_aligned
    - 0.005 * test_df['rf_rainfall']
    - 0.06 * test_df['xgb_rainfall']
    - 0.005 * test_df['knn_rainfall']
)

# Overwrites the existing `rainfall` column of test_df with the blend.
test_df['rainfall'] = ensemble_prediction

submission = test_df[['id', 'rainfall']]

display(submission.head(20))

# Create the output directory on a fresh checkout so to_csv does not fail.
os.makedirs('submissions', exist_ok=True)
submission.to_csv('submissions/submission.csv', index=False)

Unnamed: 0,id,rainfall
0,2190,0.964384
1,2191,0.964384
2,2192,0.964384
3,2193,0.087671
4,2194,0.022603
5,2195,0.723288
6,2196,0.806164
7,2197,0.964384
8,2198,0.884247
9,2199,0.739041


Unnamed: 0,id,rainfall
0,2190,0.962241
1,2191,0.962223
2,2192,0.963196
3,2193,0.090566
4,2194,0.021259
5,2195,0.711182
6,2196,0.793812
7,2197,0.962245
8,2198,0.877114
9,2199,0.722986
