In [6]:
import os
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import joblib

# Let DataFrame reprs show up to 100 columns before pandas starts eliding them.
pd.options.display.max_columns = 100

In [None]:
# Project root: the parent of this file's directory when `__file__` is defined,
# otherwise the parent of the current working directory.
if "__file__" in globals():
    BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
else:
    BASE_DIR = os.path.dirname(os.getcwd())
PROC_DIR = os.path.join(BASE_DIR, "data", "processed")


def _read_processed(filename):
    """Read the CSV named `filename` from PROC_DIR and return it as a DataFrame."""
    return pd.read_csv(os.path.join(PROC_DIR, filename))


X_train = _read_processed("X_train.csv")
X_test = _read_processed("X_test.csv")
# squeeze("columns") collapses a one-column frame into a Series.
y_train = _read_processed("y_train.csv").squeeze("columns")
y_test = _read_processed("y_test.csv").squeeze("columns")

# Cell output: previews of the training data plus the train/test feature shapes.
X_train.head(), y_train.head(), X_train.shape, X_test.shape




(   koi_steff  koi_slogg  koi_srad         ra        dec  koi_kepmag  \
 0     5655.0      4.482     0.804  288.96494  42.469391      14.858   
 1     6424.0      4.186     1.489  285.06335  44.167885      13.623   
 2     6049.0      4.353     1.176  292.71310  38.358212      14.818   
 3     5334.0      4.585     0.796  290.20575  37.769138      17.628   
 4     6273.0      4.442     1.044  288.02271  43.548210      15.378   
 
    koi_period  koi_time0bk  koi_duration  koi_impact  koi_depth  \
 0    4.238856   135.201464        5.2235       1.031    68002.0   
 1    1.187352   133.465454        4.9990       1.222    20703.0   
 2   13.229820   134.375700        3.5660       0.435      157.8   
 3   18.798622   138.028268        7.4782       0.524   380200.0   
 4   17.794005   365.346660        4.0910       3.720      879.4   
 
    koi_model_snr  koi_prad  koi_teq  koi_insol  
 0          374.8     38.01   1044.0     280.82  
 1          717.4     67.35   2251.0    6067.30  
 2    

In [8]:
# Fit the encoder on the training labels only, then encode both splits with it.
le = LabelEncoder().fit(y_train)
y_train_enc = le.transform(y_train)
y_test_enc = le.transform(y_test)

# Class mapping: encoded integer -> original label (handy when reading results).
label_map = dict(enumerate(le.classes_))
pd.Series(label_map)


0         CANDIDATE
1         CONFIRMED
2    FALSE POSITIVE
dtype: object

In [9]:
# Random forest classifier. class_weight="balanced" is used in case the classes
# are imbalanced; random_state is fixed so repeated fits are reproducible.
rf = RandomForestClassifier(
    n_estimators=400,
    max_depth=None,
    min_samples_leaf=2,
    n_jobs=-1,
    class_weight="balanced",
    random_state=42,
)

# Fit on the encoded training labels and predict encoded labels for the test set.
rf.fit(X_train, y_train_enc)
pred_enc = rf.predict(X_test)

# Pin the label set to every class the encoder knows. Without `labels`, both
# metrics infer the classes from y_test_enc / pred_enc, so a class that is absent
# from both would make `target_names` mismatch (ValueError in the report) and
# would silently drop a row/column from the confusion matrix.
class_ids = np.arange(len(le.classes_))
print(classification_report(y_test_enc, pred_enc, labels=class_ids, target_names=le.classes_))
print("Macierz pomyłek:\n", confusion_matrix(y_test_enc, pred_enc, labels=class_ids))


                precision    recall  f1-score   support

     CANDIDATE       0.57      0.60      0.58       375
     CONFIRMED       0.83      0.84      0.84       549
FALSE POSITIVE       0.84      0.82      0.83       916

      accuracy                           0.78      1840
     macro avg       0.75      0.75      0.75      1840
  weighted avg       0.78      0.78      0.78      1840

Macierz pomyłek:
 [[225  44 106]
 [ 52 461  36]
 [120  49 747]]


In [10]:
# 5-fold stratified cross-validation on the training split, scored with macro F1.
# Shuffling uses a fixed seed so the folds are the same on every run.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(
    estimator=rf,
    X=X_train,
    y=y_train_enc,
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
)
# Per-fold scores and their mean, both rounded to three decimals.
print("CV f1_macro:", np.round(scores, 3), "→ mean:", np.round(scores.mean(), 3))


CV f1_macro: [0.751 0.747 0.752 0.751 0.765] → mean: 0.753


In [11]:
# Both artifacts are written into the "data" directory under BASE_DIR.
MODEL_DIR = os.path.join(BASE_DIR, "data")
MODEL_PATH, ENC_PATH = (
    os.path.join(MODEL_DIR, filename)
    for filename in ("rf_koi_model.joblib", "label_encoder.joblib")
)

# Persist the fitted classifier and the label encoder used for its targets.
joblib.dump(value=rf, filename=MODEL_PATH)
joblib.dump(value=le, filename=ENC_PATH)

# Cell output: the two paths that were written.
MODEL_PATH, ENC_PATH


('c:\\Users\\nruey\\Desktop\\hackaton\\data\\rf_koi_model.joblib',
 'c:\\Users\\nruey\\Desktop\\hackaton\\data\\label_encoder.joblib')