In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
import xgboost as xgb
from sklearn.metrics import precision_recall_curve


In [30]:
test = pd.read_csv('./playground-series-s5e3/test.csv')
train = pd.read_csv('./playground-series-s5e3/train.csv')

In [4]:
train

Unnamed: 0,id,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall
0,0,1,1017.4,21.2,20.6,19.9,19.4,87.0,88.0,1.1,60.0,17.2,1
1,1,2,1019.5,16.2,16.9,15.8,15.4,95.0,91.0,0.0,50.0,21.9,1
2,2,3,1024.1,19.4,16.1,14.6,9.3,75.0,47.0,8.3,70.0,18.1,1
3,3,4,1013.4,18.1,17.8,16.9,16.8,95.0,95.0,0.0,60.0,35.6,1
4,4,5,1021.8,21.3,18.4,15.2,9.6,52.0,45.0,3.6,40.0,24.8,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2185,2185,361,1014.6,23.2,20.6,19.1,19.9,97.0,88.0,0.1,40.0,22.1,1
2186,2186,362,1012.4,17.2,17.3,16.3,15.3,91.0,88.0,0.0,50.0,35.3,1
2187,2187,363,1013.3,19.0,16.3,14.3,12.6,79.0,79.0,5.0,40.0,32.9,1
2188,2188,364,1022.3,16.4,15.2,13.8,14.7,92.0,93.0,0.1,40.0,18.0,1


In [5]:
X = train.drop(['id','rainfall'],axis=1)
y = train['rainfall']

In [20]:
X_train , X_test , y_train ,y_test = train_test_split(X,y,random_state=42,test_size=0.2)
y_test.shape

(438,)

In [7]:
Scaler = StandardScaler()
X_train_scaled = Scaler.fit_transform(X_train)
X_test_scaled = Scaler.fit_transform(X_test)

In [12]:
model_1 = RandomForestClassifier(n_estimators=100,random_state=42)

In [9]:
# hyper parameter tuning
param_grid = {
    'n_estimators': [100, 200],      
    'max_depth': [3, 5, 7],            
    'learning_rate': [0.01, 0.1, 0.2],  
    'subsample': [0.8, 1.0],           
    'colsample_bytree': [0.8, 1.0],     
    'gamma': [0, 0.1, 0.2]              
}

model = xgb.XGBClassifier(
    objective='binary:logistic', 
    eval_metric='logloss',
    random_state=42,
    n_jobs=5 # use 5 cores of cpu
)

model = GridSearchCV(
    estimator=model ,
    param_grid=param_grid,
    scoring='f1', # scoring each iteration by f1 score         
    cv=5,                 
    n_jobs=5,
    verbose= 2 # show result while train
)

model.fit(X_train_scaled, y_train)

best_model = model.best_estimator_
print("Best Parameters:", model.best_params_)
print("Best F1-Score:", model.best_score_)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, m

In [10]:
y_pred = best_model.predict(X_test_scaled)
print(f'accuracy:{accuracy_score(y_test,y_pred)}')
print(f'F1-score:{f1_score(y_test,y_pred)}')

accuracy:0.8493150684931506
F1-score:0.9011976047904192


In [27]:
# predict on test set
X_test_set = test.drop(['id'],axis=1)
X_test_set_scaled = Scaler.fit_transform(X_test_set)
prob = best_model.predict_proba(X_test_set_scaled)[:, 1]

In [28]:
#Find best threshold by precision_recall_curve
y_test_prob = best_model.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_test_prob)
f1_scores = (2 * precision * recall) / (precision + recall + 1e-9)
best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]
best_threshold 

np.float32(0.8645564)

In [29]:
predictions = (prob >= best_threshold ).astype(int)
submission = pd.DataFrame({"id": test["id"], "rainfall": predictions})
submission.to_csv("submission_xgboost_2.csv", index=False)

np.float32(0.8645564)