In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import mode

In [2]:
# Load the preprocessed training and test tables from the randomforest/ folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('randomforest/train_processed.csv', 'randomforest/test_processed.csv')
)

In [3]:
# Split off the target column and drop 'id' and 'day' from both feature tables.
Y_train = train['rainfall']
X_train = train.drop(columns=['id', 'day', 'rainfall'])
X_test = test.drop(columns=['id', 'day'])

In [5]:
# Find the best max_depth with 5-fold cross-validation.
# Both the fold split and every forest are seeded: without a fixed random_state
# the per-depth accuracies change from run to run by about as much as the gap
# between neighbouring depths, so the ranking would not be reproducible.
# The seed matches the one used for the final voting models (random_state=42).
cv = KFold(n_splits=5, shuffle=True, random_state=42)
accs = []
depth_range = range(1, 22)

for depth in depth_range:
    fold_accuracy = []
    rand_clf = RandomForestClassifier(criterion='entropy', max_depth=depth, random_state=42)

    for t_idx, v_idx in cv.split(X_train):
        # Positional indexing: the fold indices returned by KFold are row positions.
        X_train_cv, X_valid_cv = X_train.iloc[t_idx], X_train.iloc[v_idx]
        Y_train_cv, Y_valid_cv = Y_train.iloc[t_idx], Y_train.iloc[v_idx]

        # With the default warm_start=False, fit() rebuilds the forest on each
        # call, so the same estimator object can be reused across folds.
        model = rand_clf.fit(X_train_cv, Y_train_cv)
        acc = model.score(X_valid_cv, Y_valid_cv)  # mean accuracy on the held-out fold
        fold_accuracy.append(acc)

    # One number per depth: the accuracy averaged over the five folds.
    avg = np.mean(fold_accuracy)
    accs.append(avg)


df = pd.DataFrame({"Max Depth": depth_range, "Average Accuracy": accs})
print(df)

    Max Depth  Average Accuracy
0           1          0.825152
1           2          0.822424
2           3          0.831212
3           4          0.839394
4           5          0.845455
5           6          0.853333
6           7          0.856667
7           8          0.861818
8           9          0.871515
9          10          0.878182
10         11          0.885152
11         12          0.886364
12         13          0.889394
13         14          0.893030
14         15          0.890606
15         16          0.891818
16         17          0.895152
17         18          0.892424
18         19          0.896061
19         20          0.899697
20         21          0.893333


In [21]:
# Rank the depths from highest to lowest cross-validated accuracy.
sorted_df = df.sort_values('Average Accuracy', ascending=False)
print(sorted_df)

    Max Depth  Average Accuracy
19         20          0.899697
18         19          0.896061
16         17          0.895152
20         21          0.893333
13         14          0.893030
17         18          0.892424
15         16          0.891818
14         15          0.890606
12         13          0.889394
11         12          0.886364
10         11          0.885152
9          10          0.878182
8           9          0.871515
7           8          0.861818
6           7          0.856667
5           6          0.853333
4           5          0.845455
3           4          0.839394
2           3          0.831212
0           1          0.825152
1           2          0.822424


In [15]:
# Candidate depths: the range 1-21 was split into consecutive intervals
# (e.g. 1-3, 4-6, 7-9, ...) and the depth with the best cross-validated
# accuracy was picked from each interval.
depth_candidates = [3, 6, 9, 12, 14, 17, 20]

# Alternative: simply take the five depths with the highest accuracy.
# depth_candidates = [20, 19, 17, 21, 14]

# Hard voting: each forest casts one class label (0 or 1) per test row.
predictions = np.array([
    RandomForestClassifier(criterion='entropy', max_depth=depth, random_state=42)
    .fit(X_train, Y_train)
    .predict(X_test)  # X_test is the test data
    for depth in depth_candidates
])  # (n_models, n_samples)

# The most frequent label across models wins; flatten() yields a 1-D array
# regardless of whether mode() keeps the reduced axis.
voted_preds = mode(predictions, axis=0).mode.flatten()

In [12]:
# Soft voting: average the predicted class probabilities of all candidate forests.
probs = [
    RandomForestClassifier(criterion='entropy', max_depth=depth, random_state=42)
    .fit(X_train, Y_train)
    .predict_proba(X_test)  # (n_samples, n_classes)
    for depth in depth_candidates
]

avg_proba = np.mean(probs, axis=0)  # mean over models -> (n_samples, n_classes)
# Column 1 is used as the rain probability; this assumes the labels are 0/1 so
# that the second class is "rain" — TODO confirm against the training labels.
rainfall_prob = avg_proba[:, 1]

In [14]:
# Soft-voting submission: the test ids paired with the averaged probability column.
submission = test[['id']].assign(rainfall=rainfall_prob)
submission.to_csv('submission_with_prob.csv', index=False)

In [10]:
# Hard-voting submission: the test ids paired with the majority-vote labels.
submission_hv = test[['id']].assign(rainfall=voted_preds)
submission_hv.to_csv('rf_hard_voting_submission.csv', index=False)