In [37]:
import os
import pandas as pd
import numpy as np
import sys
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer
from typing import List
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
import xgboost as xgb

In [2]:
# Constants

# Input datasets
TRAIN_FILE = "data/train.csv"
VAL_FILE = "data/val.csv"
UNLABELED = "data/unlabeled_v2.csv"

# HTML report paths, one per dataset
TRAIN_REPORT_PATH = "reports/train_planets_report.html"
VAL_REPORT_PATH = "reports/val_planets_report.html"
UNLABELED_REPORT_PATH = "reports/unlabeled_planets_report.html"

# Column names
OBJID = "objid"
RA = "ra"
DEC = "dec"
CLEAN = "clean"
ROWC = "rowc"
COLC = "colc"
# Lowercase alias kept so any existing reference to `colc` keeps working;
# prefer COLC, which follows the naming of the other constants.
colc = COLC
CLASS = "class"

In [18]:
def nan_filler(df: pd.DataFrame, old_nan):
    """Swap every occurrence of ``old_nan`` in ``df`` for NaN.

    The frame is modified in place and nothing is returned.
    """
    df.replace(to_replace=old_nan, value=np.nan, inplace=True)


def fill_numeric_knn(df: pd.DataFrame, scaler, imputer_params: dict = {"n_neighbors": 5, "metric": "nan_euclidean", "weights": "uniform"})->pd.DataFrame:
    """
    Scale a frame and fill its missing values with a KNN imputer.

    The returned values are the *scaled* ones: the scaling is applied so the
    neighbour distances are comparable across columns, and it is not inverted
    afterwards.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to scale and impute.
    scaler : callable
        Called with no arguments to build an object exposing
        ``fit_transform`` (pass the class, e.g. ``StandardScaler``, not an
        instance).
    imputer_params : dict
        Keyword arguments forwarded to ``KNNImputer``. The default dict is
        only unpacked, never modified.

    Returns
    -------
    pd.DataFrame
        Scaled and imputed values carrying the column labels and the row
        index of ``df``.

    Raises
    ------
    ValueError
        If the imputer returns a different number of columns than ``df`` has
        (e.g. KNNImputer discards columns that were entirely missing).
    """
    # Capture the labels up front: the transformers hand back bare arrays.
    columns = df.columns
    index = df.index

    scaled = scaler().fit_transform(df)
    imputed = KNNImputer(**imputer_params).fit_transform(scaled)

    if imputed.shape[1] != len(columns):
        raise ValueError(
            f"Imputer returned {imputed.shape[1]} columns for {len(columns)} input columns; "
            "columns with no observed values cannot be imputed and must be dropped first."
        )

    # Re-attach the original index so the result stays aligned with the
    # caller's other data (e.g. the target series).
    return pd.DataFrame(imputed, columns=columns, index=index)


def fill_median(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
    """
    Return the selected columns of ``df`` with missing values replaced by
    each column's median. ``df`` itself is left unchanged.
    """
    selected = df[columns]
    medians = selected.median()
    return selected.fillna(medians)


def _coerce_numeric_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with numeric-looking text columns converted to numbers."""
    converted = df.copy()
    for name in converted.select_dtypes(exclude=["number", "bool"]).columns:
        try:
            converted[name] = pd.to_numeric(converted[name])
        except (ValueError, TypeError):
            # Genuinely non-numeric column: deliberately left as it is so its
            # handling downstream is the same as before.
            continue
    return converted


def _clean_features(df: pd.DataFrame, drop_columns=None) -> pd.DataFrame:
    """Drop unwanted columns, turn "na" into NaN, make columns numeric, median-fill."""
    if drop_columns:
        df = df.drop(columns=drop_columns)
    nan_filler(df, old_nan="na")
    # A column that contained the "na" placeholder was read as text and stays
    # non-numeric after the replacement; without this conversion the median
    # fill cannot be computed for exactly the columns that need it.
    df = _coerce_numeric_columns(df)
    return fill_median(df, columns=df.columns)


def preprocess_train_dataset(file, target_column, drop_columns=None):
    """
    Load a labelled CSV and return ``(features, target)``.

    Parameters
    ----------
    file : path or file-like object accepted by ``pd.read_csv``.
    target_column : label of the target column; it is removed from the features.
    drop_columns : optional column label(s) to discard from the features.

    Returns
    -------
    (pd.DataFrame, pd.Series)
        Features with "na" placeholders replaced by the column median, and the
        target column exactly as read from the file.
    """
    df = pd.read_csv(file)
    y = df[target_column]
    features = df.drop(columns=target_column)
    return _clean_features(features, drop_columns), y


def preprocess_unlabeled_dataset(file, drop_columns=None):
    """
    Load an unlabelled CSV and return its cleaned features.

    Parameters
    ----------
    file : path or file-like object accepted by ``pd.read_csv``.
    drop_columns : optional column label(s) to discard.

    Returns
    -------
    pd.DataFrame
        Features with "na" placeholders replaced by the column median.
    """
    return _clean_features(pd.read_csv(file), drop_columns)

# Preprocess train

In [5]:
# Load the raw training table from TRAIN_FILE.
train = pd.read_csv(TRAIN_FILE)

In [7]:
# Replace the literal "na" placeholders with NaN; nan_filler modifies `train` in place.
nan_filler(train, old_nan="na")

In [8]:
# Remove the object-id column from the features. errors="ignore" keeps the
# cell safe to re-run after the column has already been removed.
train = train.drop(columns=OBJID, errors="ignore")

In [10]:
# Keep the target column aside. This has to run before the cell that drops
# CLASS from `train`; afterwards the column is no longer there to read.
y_train = train[CLASS]

In [11]:
# Remove the target column so `train` holds features only. errors="ignore"
# keeps the cell safe to re-run after the column has already been removed.
train = train.drop(columns=CLASS, errors="ignore")

In [22]:
# Cast every remaining column to float32 before scaling and imputation.
train = train.astype(np.float32)

In [23]:
# Standardise the features and KNN-impute the missing values; the result stays in scaled units.
# NOTE(review): the scaler and imputer are fitted on the whole training frame here,
# before the cross-validation cells below split it, so held-out folds influence the
# preprocessing. Consider fitting them per fold inside a pipeline.
knn_filled_train = fill_numeric_knn(train, scaler=StandardScaler)

In [26]:
# Summary statistics of the scaled, imputed features.
knn_filled_train.describe()

Unnamed: 0,ra,dec,u_0,g_0,r_0,i_0,z_0,clean,rowc,colc,...,u_5,g_5,r_5,i_5,z_5,u_6,g_6,r_6,i_6,z_6
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,...,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,0.0,-2.441406e-08,-0.003852,-0.005846,-0.000798,-0.002258,-0.006903,0.0,-1.271566e-09,5.086263e-09,...,-0.001419,-1.4e-05,-0.000999,-0.000704,-0.000299,8.138021e-09,-8.138021e-09,-1.627604e-08,-8.138021e-09,4.06901e-08
std,1.000017,1.000017,0.978777,0.981307,0.981036,0.978175,0.976819,1.000017,1.000017,1.000017,...,0.923281,0.940277,0.925754,0.929968,0.947121,1.000017,1.000017,1.000017,1.000017,1.000017
min,-1.971776,-1.386912,-4.620751,-5.761186,-6.038962,-6.067034,-5.142623,-2.164995,-1.80725,-1.905784,...,-56.100445,-155.062714,-27.730429,-94.3862,-122.925743,-2.931453,-3.198354,-1.670659,-1.685728,-1.703705
25%,-0.894177,-0.5214929,-0.803068,-0.713158,-0.585691,-0.531308,-0.535763,0.461895,-0.8659694,-0.840976,...,-0.007112,0.004596,-0.013081,-0.01811,-0.000385,-0.9285473,-0.8605612,-0.7013367,-0.6724371,-1.106466
50%,0.478989,-0.3976714,0.077681,0.094963,0.124292,0.079644,0.049623,0.461895,0.01834904,0.006253775,...,-0.003851,0.006449,-0.008883,0.000948,0.004601,-0.9285473,-0.2761129,-0.2166758,-0.1657914,0.08801318
75%,0.709084,0.212182,0.739332,0.614741,0.618085,0.689805,0.664417,0.461895,0.8675691,0.8621805,...,-0.000893,0.007995,-0.003576,0.021964,0.009481,1.074359,0.8927838,0.7526462,0.8475001,0.6852525
max,1.643248,2.768572,4.012548,5.078683,6.039103,6.619977,5.446929,0.461895,1.73262,1.819995,...,114.268509,14.439761,135.275635,55.262589,49.572327,1.074359,1.477232,1.721968,1.860791,3.07421


# Training RFC

In [27]:
# Hyper-parameters for the random forest classifier.
rfc_params = {
    "n_estimators": 10,
    "criterion": "gini",
    "min_samples_split": 10,
    # "sqrt" is what "auto" meant for classifiers; "auto" is deprecated and
    # rejected by recent scikit-learn releases.
    "max_features": "sqrt",
    "bootstrap": True,
    "n_jobs": -1,
    # Fixed seed so the bootstrap samples and feature subsets are reproducible.
    "random_state": 42,
}

In [28]:
# Random forest configured from rfc_params.
rfc = RandomForestClassifier(**rfc_params)

In [31]:
# 10-fold cross-validation of the random forest on the scaled, KNN-imputed features.
rfc_knn_score = cross_validate(
    rfc,
    knn_filled_train,
    y_train,
    cv=10,
    scoring=["f1_micro", "f1_macro", "f1_weighted"],
    n_jobs=-1,
)
rfc_knn_score

{'fit_time': array([2.20798612, 2.09253383, 2.18752551, 2.244349  , 2.30721807,
        2.41733861, 2.51049304, 2.37243605, 1.3270843 , 1.19770217]),
 'score_time': array([0.02308583, 0.07537031, 0.02883911, 0.09078646, 0.09395385,
        0.09226966, 0.03365135, 0.02631593, 0.01171017, 0.01118708]),
 'test_f1_micro': array([0.68066667, 0.80566667, 0.70533333, 0.865     , 0.87      ,
        0.85333333, 0.882     , 0.93666667, 0.93566667, 0.82866667]),
 'test_f1_macro': array([0.67055002, 0.80403115, 0.70029941, 0.8636683 , 0.86740684,
        0.84949306, 0.8816836 , 0.93652502, 0.93545677, 0.8304203 ]),
 'test_f1_weighted': array([0.67055002, 0.80403115, 0.70029941, 0.8636683 , 0.86740684,
        0.84949306, 0.8816836 , 0.93652502, 0.93545677, 0.8304203 ])}

In [32]:
# Mean of every timing and score array over the folds.
print("Averages for KNN filling")
for metric_name, fold_values in rfc_knn_score.items():
    print(metric_name, np.mean(fold_values))

Averages for KNN filling
fit_time 2.086466670036316
score_time 0.04871697425842285
test_f1_micro 0.8362999999999999
test_f1_macro 0.8339534468401137
test_f1_weighted 0.8339534468401137


# Ensemble of SVC models

In [34]:
# One-vs-rest wrapper around an SVC built with its default settings.
svc_ensemble = OneVsRestClassifier(SVC())

In [35]:
# 10-fold cross-validation of the one-vs-rest SVC on the scaled, KNN-imputed features.
svce_scores = cross_validate(
    svc_ensemble,
    knn_filled_train,
    y_train,
    cv=10,
    scoring=["f1_micro", "f1_macro", "f1_weighted"],
    n_jobs=-1,
)
svce_scores

{'fit_time': array([86.68847561, 88.22821164, 89.07271171, 92.23407173, 94.72351384,
        94.01462412, 94.57167006, 94.97331238, 65.78146601, 61.63696432]),
 'score_time': array([6.19693518, 6.53629065, 6.65872669, 6.48659801, 7.04660249,
        6.96389222, 6.88834023, 6.53894424, 4.13366747, 4.24801111]),
 'test_f1_micro': array([0.68133333, 0.75      , 0.77366667, 0.82966667, 0.90433333,
        0.89533333, 0.89433333, 0.89533333, 0.888     , 0.82933333]),
 'test_f1_macro': array([0.65611855, 0.74784338, 0.77651316, 0.82943996, 0.90349537,
        0.89434205, 0.89328074, 0.89513362, 0.88838709, 0.82885983]),
 'test_f1_weighted': array([0.65611855, 0.74784338, 0.77651316, 0.82943996, 0.90349537,
        0.89434205, 0.89328074, 0.89513362, 0.88838709, 0.82885983])}

In [36]:
# Mean of every timing and score array over the folds.
print("Averages for SVC ensemble")
for metric_name, fold_values in svce_scores.items():
    print(metric_name, np.mean(fold_values))

Averages for SVC ensemble
fit_time 86.19250214099884
score_time 6.16980082988739
test_f1_micro 0.8341333333333333
test_f1_macro 0.8313413757204511
test_f1_weighted 0.8313413757204511


# XGBoost

In [41]:
# Hyper-parameters for the XGBoost classifier.
xgb_params = dict(
    random_state=42,
    tree_method="hist",
    grow_policy="lossguide",
    n_estimators=10,
    eta=0.02,
    max_depth=3,
    min_child_weight=1,
    reg_lambda=1,
    max_bin=120,
    subsample=0.9,
)

In [42]:
# XGBoost classifier configured from xgb_params.
xgbc = xgb.XGBClassifier(**xgb_params)

In [43]:
# 10-fold cross-validation of the XGBoost classifier on the scaled, KNN-imputed features.
xgb_scores = cross_validate(
    xgbc,
    knn_filled_train,
    y_train,
    cv=10,
    scoring=["f1_micro", "f1_macro", "f1_weighted"],
    n_jobs=-1,
)
xgb_scores

{'fit_time': array([19.07109857, 49.57370162, 48.12543678, 53.22042108, 11.111938  ,
        23.04491282, 36.56133103, 18.0635879 , 31.62586641, 13.37819529]),
 'score_time': array([0.12771511, 0.15610981, 0.16035199, 0.1284709 , 0.03177285,
        0.1643877 , 0.09258199, 0.02416635, 0.00777125, 0.09301949]),
 'test_f1_micro': array([0.52633333, 0.80033333, 0.76866667, 0.85166667, 0.899     ,
        0.83833333, 0.854     , 0.907     , 0.91733333, 0.64233333]),
 'test_f1_macro': array([0.47148709, 0.79173778, 0.76208588, 0.85115433, 0.89797192,
        0.8390197 , 0.85475183, 0.90627908, 0.9166888 , 0.64049019]),
 'test_f1_weighted': array([0.47148709, 0.79173778, 0.76208588, 0.85115433, 0.89797192,
        0.8390197 , 0.85475183, 0.90627908, 0.9166888 , 0.64049019])}