In [35]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_validate
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer
from typing import List
from sklearn.preprocessing import StandardScaler
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt

In [2]:
# Constants

TRAIN_FILE = "data/train.csv"
VAL_FILE = "data/val.csv"
UNLABELED = "data/unlabeled_v2.csv"
TRAIN_REPORT_PATH = "reports/train_planets_report.html"
VAL_REPORT_PATH = "reports/val_planets_report.html"
UNLABELED_REPORT_PATH = "reports/unlabeled_planets_report.html"

# Features
OBJID = "objid"
RA = "ra"
DEC = "dec"
CLEAN = "clean"
ROWC = "rowc"
colc = "colc"
CLASS = "class"

In [3]:
train = pd.read_csv(TRAIN_FILE)

In [4]:
train.describe()

Unnamed: 0,objid,ra,dec,clean,rowc,colc,u_1,g_1,r_1,i_1,...,g_2,r_2,i_2,z_2,u_6,g_6,r_6,i_6,z_6,class
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,...,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,61099.655167,121.22025,5.578351,0.824167,753.527731,1047.251168,759.559275,764.900706,753.525249,756.695181,...,1048.346558,1047.275525,1046.680061,1046.565899,4.3908,5.472433,4.447067,4.327233,2.852633,1.0
std,22536.166952,61.465218,12.130847,0.380685,393.330223,545.098374,393.297413,393.314213,393.324841,393.32968,...,545.404399,545.079386,545.011597,545.210755,1.497849,1.711044,2.063332,1.973799,1.674398,0.81651
min,13209.0,0.026645,-11.245781,0.0,42.6935,8.428511,0.503521,56.2531,42.6935,46.127,...,8.537188,8.428511,6.421341,5.819234,0.0,0.0,1.0,1.0,0.0,0.0
25%,41662.5,66.260382,-0.747695,1.0,412.921475,588.844125,419.01975,424.172175,412.921475,416.167025,...,589.48905,588.86855,588.2171,588.01845,3.0,4.0,3.0,3.0,1.0,0.0
50%,61226.0,150.660919,0.75434,1.0,760.74485,1050.66,766.4302,771.73105,760.64165,764.01645,...,1052.516,1050.66,1050.4745,1050.345,3.0,5.0,4.0,4.0,3.0,1.0
75%,80523.0,164.803546,8.152256,1.0,1094.76325,1517.2165,1100.5565,1106.3355,1094.76325,1097.7805,...,1518.5055,1517.21175,1516.216,1516.13175,6.0,7.0,6.0,6.0,4.0,2.0
max,99997.0,222.221149,39.162917,1.0,1435.008,2039.311,1440.699,1447.814,1435.008,1437.885,...,2042.44,2039.311,2042.557,2041.264,6.0,8.0,8.0,8.0,8.0,2.0


In [4]:
val = pd.read_csv(VAL_FILE)
val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23333 entries, 0 to 23332
Data columns (total 44 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   objid   23333 non-null  int64  
 1   ra      23333 non-null  float64
 2   dec     23333 non-null  float64
 3   u_0     23333 non-null  object 
 4   g_0     23333 non-null  object 
 5   r_0     23333 non-null  object 
 6   i_0     23333 non-null  object 
 7   z_0     23333 non-null  object 
 8   clean   23333 non-null  int64  
 9   rowc    23333 non-null  float64
 10  colc    23333 non-null  float64
 11  rowv    23333 non-null  object 
 12  colv    23333 non-null  object 
 13  u_1     23333 non-null  float64
 14  g_1     23333 non-null  float64
 15  r_1     23333 non-null  float64
 16  i_1     23333 non-null  float64
 17  z_1     23333 non-null  float64
 18  u_2     23333 non-null  float64
 19  g_2     23333 non-null  float64
 20  r_2     23333 non-null  float64
 21  i_2     23333 non-null  float64
 22

In [6]:
val.describe()

Unnamed: 0,objid,ra,dec,clean,rowc,colc,u_1,g_1,r_1,i_1,...,g_2,r_2,i_2,z_2,u_6,g_6,r_6,i_6,z_6,class
count,23333.0,23333.0,23333.0,23333.0,23333.0,23333.0,23333.0,23333.0,23333.0,23333.0,...,23333.0,23333.0,23333.0,23333.0,23333.0,23333.0,23333.0,23333.0,23333.0,23333.0
mean,42863.978957,216.980562,31.084987,0.777997,741.844317,1078.465604,748.009178,753.269122,741.854703,745.072987,...,1077.435365,1078.485428,1077.549951,1076.907735,4.181974,5.089401,4.346891,4.076844,3.140745,0.642866
std,11264.13762,71.180074,25.173756,0.415602,390.766586,547.771683,390.770032,390.775528,390.767492,390.771065,...,548.346136,547.773619,547.735873,548.029707,1.469084,1.651456,2.051583,1.939502,1.588384,0.811292
min,849.0,71.105443,-11.092428,0.0,52.26433,65.65462,57.21036,61.03793,52.26433,54.89461,...,54.77028,65.65462,64.3897,59.41727,0.0,0.0,0.0,0.0,0.0,0.0
25%,33503.0,167.091126,4.514675,1.0,405.5805,614.33,411.2023,416.5133,405.5805,408.5551,...,612.6823,614.33,613.4899,612.4222,3.0,4.0,3.0,3.0,2.0,0.0
50%,43025.0,202.867668,40.321172,1.0,738.171,1102.78,744.5613,749.7141,738.171,741.7594,...,1100.71,1102.78,1101.917,1101.055,3.0,5.0,4.0,4.0,3.0,0.0
75%,52431.0,238.045655,51.124605,1.0,1081.838,1552.282,1088.137,1093.117,1081.794,1085.058,...,1551.778,1552.282,1551.211,1550.669,6.0,6.0,6.0,6.0,4.0,1.0
max,63296.0,359.997573,76.059262,1.0,1438.156,2046.515,1444.381,1449.539,1438.156,1441.835,...,2050.65,2046.515,2046.654,2049.122,6.0,8.0,8.0,8.0,8.0,2.0


In [5]:
val.isna().sum()

objid    0
ra       0
dec      0
u_0      0
g_0      0
r_0      0
i_0      0
z_0      0
clean    0
rowc     0
colc     0
rowv     0
colv     0
u_1      0
g_1      0
r_1      0
i_1      0
z_1      0
u_2      0
g_2      0
r_2      0
i_2      0
z_2      0
u_3      0
g_3      0
r_3      0
i_3      0
z_3      0
u_4      0
g_4      0
r_4      0
i_4      0
z_4      0
u_5      0
g_5      0
r_5      0
i_5      0
z_5      0
u_6      0
g_6      0
r_6      0
i_6      0
z_6      0
class    0
dtype: int64

In [6]:
# seems like 'na' stand for NaN and pandas does not recognize it like a missed value
def nan_filler(df: pd.DataFrame, old_nan):
    df.replace(old_nan, np.nan, inplace=True)

In [7]:
nan_filler(train, "na")
nan_filler(val, "na")

In [8]:
unlabeled = pd.read_csv(UNLABELED)
nan_filler(unlabeled, "na")

In [9]:
# check missed values
print("Train NAN\n", train.isna().sum(), "\n")
print("Val NaN\n", val.isna().sum(), "\n")
print("Unlabeled NaN\n", unlabeled.isna().sum())

Train NAN
 objid       0
ra          0
dec         0
u_0      4556
g_0      4504
r_0      4555
i_0      4596
z_0      4481
clean       0
rowc        0
colc        0
rowv      401
colv      401
u_1         0
g_1         0
r_1         0
i_1         0
z_1         0
u_2         0
g_2         0
r_2         0
i_2         0
z_2         0
u_3      5173
g_3      1720
r_3       727
i_3       580
z_3      1728
u_4      4474
g_4      4506
r_4      4484
i_4      4517
z_4      4422
u_5      4482
g_5      4451
r_5      4498
i_5      4489
z_5      4509
u_6         0
g_6         0
r_6         0
i_6         0
z_6         0
class       0
dtype: int64 

Val NaN
 objid       0
ra          0
dec         0
u_0      3455
g_0      3460
r_0      3380
i_0      3449
z_0      3499
clean       0
rowc        0
colc        0
rowv      681
colv      681
u_1         0
g_1         0
r_1         0
i_1         0
z_1         0
u_2         0
g_2         0
r_2         0
i_2         0
z_2         0
u_3      4836
g_3      1690

In [12]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 44 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   objid   30000 non-null  int64  
 1   ra      30000 non-null  float64
 2   dec     30000 non-null  float64
 3   u_0     25444 non-null  object 
 4   g_0     25496 non-null  object 
 5   r_0     25445 non-null  object 
 6   i_0     25404 non-null  object 
 7   z_0     25519 non-null  object 
 8   clean   30000 non-null  int64  
 9   rowc    30000 non-null  float64
 10  colc    30000 non-null  float64
 11  rowv    29599 non-null  object 
 12  colv    29599 non-null  object 
 13  u_1     30000 non-null  float64
 14  g_1     30000 non-null  float64
 15  r_1     30000 non-null  float64
 16  i_1     30000 non-null  float64
 17  z_1     30000 non-null  float64
 18  u_2     30000 non-null  float64
 19  g_2     30000 non-null  float64
 20  r_2     30000 non-null  float64
 21  i_2     30000 non-null  float64
 22

In [13]:
# All features must be real or binary

In [10]:
train = train.astype(np.float32)
val = val.astype(np.float32)
unlabeled = unlabeled.astype(np.float32)

# Imputers

In [11]:
def fill_numeric_knn(df: pd.DataFrame, imputer_params:dict = {"n_neighbors": 5, "metric": "nan_euclidean", "weights": "uniform"})->pd.DataFrame:
    """
    """
    knn_imputer = KNNImputer(**imputer_params)
    transformed_df = knn_imputer.fit_transform(df)
    out_df = pd.DataFrame(transformed_df)
    out_df.columns = df.columns
    return out_df

def fill_median(df: pd.DataFrame, columns: List[str])->pd.DataFrame:
    """
    """
    return df[columns].fillna(df[columns].median())
    

    

In [12]:
filled_median_train = fill_median(train, train.columns)

In [13]:
filled_knn_train = fill_numeric_knn(train)

In [14]:
y_train = train[CLASS]


In [15]:
filled_median_train.drop([CLASS], inplace=True, axis=1)


In [46]:
filled_knn_train.drop([CLASS], inplace=True, axis=1)

# Training
# Random Forest Classifier

In [16]:
rfc_params = {
    "n_estimators": 10,
    "criterion": "gini",
    "min_samples_split": 10,
    "max_features": "auto",
    "bootstrap": True,
    "n_jobs": -1
}

In [26]:
rfc = RandomForestClassifier(**rfc_params)
rfc = make_pipeline(StandardScaler(), rfc)

In [27]:
rfc_median_scores = cross_validate(rfc, filled_median_train, y_train, cv=10, scoring=["f1_micro", "f1_macro", "f1_weighted"], n_jobs=-1)
rfc_median_scores

{'fit_time': array([2.18210149, 2.21701384, 2.03900766, 2.09469986, 2.21684122,
        2.20258689, 2.20193887, 2.11784697, 1.08286643, 1.11757803]),
 'score_time': array([0.02448487, 0.03026128, 0.07103658, 0.03098369, 0.02466536,
        0.03347731, 0.02075267, 0.02546525, 0.0139327 , 0.01195264]),
 'test_f1_micro': array([0.70033333, 0.77633333, 0.73633333, 0.857     , 0.83933333,
        0.85566667, 0.87933333, 0.94533333, 0.947     , 0.82233333]),
 'test_f1_macro': array([0.69206908, 0.77332025, 0.73261296, 0.85460929, 0.83359303,
        0.85235511, 0.87862593, 0.94510566, 0.94680779, 0.82373609]),
 'test_f1_weighted': array([0.69206908, 0.77332025, 0.73261296, 0.85460929, 0.83359303,
        0.85235511, 0.87862593, 0.94510566, 0.94680779, 0.82373609])}

In [28]:
rfc_knn_score = cross_validate(rfc, filled_knn_train, y_train, cv=10, scoring=["f1_micro", "f1_macro", "f1_weighted"], n_jobs=-1)
rfc_knn_score

{'fit_time': array([1.28438544, 1.24489284, 1.33096337, 1.31504607, 1.24506521,
        1.24300528, 1.28200698, 1.23270082, 0.73749804, 0.73375869]),
 'score_time': array([0.02903032, 0.053936  , 0.04514813, 0.01926208, 0.0563848 ,
        0.03276253, 0.01933241, 0.0212791 , 0.01250267, 0.01352692]),
 'test_f1_micro': array([0.99466667, 0.999     , 0.999     , 0.99933333, 1.        ,
        1.        , 1.        , 1.        , 1.        , 0.99066667]),
 'test_f1_macro': array([0.99466386, 0.999     , 0.99900017, 0.99933333, 1.        ,
        1.        , 1.        , 1.        , 1.        , 0.99066604]),
 'test_f1_weighted': array([0.99466386, 0.999     , 0.99900017, 0.99933333, 1.        ,
        1.        , 1.        , 1.        , 1.        , 0.99066604])}

In [29]:
print("Averages for median filling")
for k, v in rfc_median_scores.items():
    print(k, np.mean(v))

Averages for median filling
fit_time 1.9472481250762939
score_time 0.028701233863830566
test_f1_micro 0.8359
test_f1_macro 0.8332835185480473
test_f1_weighted 0.8332835185480475


In [30]:
print("Averages for KNN filling")
for k, v in rfc_knn_score.items():
    print(k, np.mean(v))

Averages for KNN filling
fit_time 1.1649322748184203
score_time 0.030316495895385744
test_f1_micro 0.9982666666666666
test_f1_macro 0.9982663389702667
test_f1_weighted 0.9982663389702667


# Probability calibration

In [31]:
calibrated_rfc = CalibratedClassifierCV(base_estimator=rfc, cv=5)

In [32]:
calibrated_rfc.fit(filled_median_train, y_train)

CalibratedClassifierCV(base_estimator=Pipeline(steps=[('standardscaler',
                                                       StandardScaler()),
                                                      ('randomforestclassifier',
                                                       RandomForestClassifier(min_samples_split=10,
                                                                              n_estimators=10,
                                                                              n_jobs=-1))]),
                       cv=5)

In [34]:
y_prob = calibrated_rfc.predict_proba(filled_median_train)

In [39]:
# prob_true, prob_pred = calibration_curve(y_train[:, 1], y_prob[:, 1])

In [43]:
y_train

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
29995    2.0
29996    2.0
29997    2.0
29998    2.0
29999    2.0
Name: class, Length: 30000, dtype: float32