In [12]:
from pathlib import Path

import pandas as pd
from joblib import dump, load
from sklearn.ensemble import AdaBoostClassifier
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.linear_model import HuberRegressor

In [13]:
# Read the three competition CSV files from the dataset directory, in the
# order train -> test -> sample submission.
basedir = Path("./dataset")
train, test, submission = (
    pd.read_csv(basedir / file_name)
    for file_name in ("train.csv", "test.csv", "sample_submission.csv")
)


In [14]:
# Per product code: the measurement columns used as regressors when the
# missing values of measurement_17 are predicted.
fill_dict = {
    "A": ["measurement_5", "measurement_6", "measurement_8"],
    "B": ["measurement_4", "measurement_5", "measurement_7"],
    "C": ["measurement_5", "measurement_7", "measurement_8", "measurement_9"],
    "D": ["measurement_5", "measurement_6", "measurement_7", "measurement_8"],
    "E": ["measurement_4", "measurement_5", "measurement_6", "measurement_8"],
    "F": ["measurement_4", "measurement_5", "measurement_6", "measurement_7"],
    "G": ["measurement_4", "measurement_6", "measurement_8", "measurement_9"],
    "H": [
        "measurement_4", "measurement_5", "measurement_7",
        "measurement_8", "measurement_9",
    ],
    "I": ["measurement_3", "measurement_7", "measurement_8"],
}

# Every measurement_* column of the test frame plus "loading", in the
# frame's own column order.
feature = [
    col
    for col in test.columns
    if col.startswith("measurement") or col == "loading"
]

In [15]:
def without_any_null(data: pd.DataFrame):
    """Return a boolean Series that is True for rows holding no missing value.

    Args:
        data: Frame whose rows are inspected.

    Returns:
        Boolean Series indexed like ``data``; True where no cell in the row
        is NaN/None.
    """
    has_missing = data.isna().any(axis=1)
    return ~has_missing

def only_target_null(data: pd.DataFrame, feats: list[str], target: str) -> pd.Series:
    """Flag rows where ``target`` is missing while every ``feats`` column is present.

    Args:
        data: Frame containing both the ``feats`` columns and ``target``.
        feats: Column names that must all be non-null in a flagged row.
        target: Column name that must be null in a flagged row.

    Returns:
        Boolean Series indexed like ``data``.
    """
    return without_any_null(data[feats]) & data[target].isna()

In [16]:
def fill_column(data: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """Fill missing ``column_name`` values with a per-product-code regression.

    For every distinct ``product_code`` a ``HuberRegressor`` is fit on the rows
    of that code where the predictor columns from ``fill_dict[code]`` and
    ``column_name`` are all present. The model then predicts ``column_name``
    for the rows of that code where only ``column_name`` is missing. Rows that
    also miss a predictor are left untouched.

    Args:
        data: Frame with a ``product_code`` column, the predictor columns and
            ``column_name``. It is modified in place.
        column_name: Column whose missing values are filled.

    Returns:
        The same ``data`` object, after all product codes have been processed.

    Raises:
        KeyError: If a product code has no entry in ``fill_dict``.
    """
    for code in data.product_code.unique():
        predictors = fill_dict[code]
        is_code = data.product_code == code

        # Rows of this code where every predictor and the target are present.
        train_fill = data.loc[is_code, predictors + [column_name]].dropna(how="any")

        # Rows of this code where only the target is missing (global mask).
        fill_rows = is_code & only_target_null(data, predictors, column_name)

        # Fitting or predicting on zero rows raises, so skip such codes.
        if train_fill.empty or not fill_rows.any():
            continue

        model_fill = HuberRegressor()
        model_fill.fit(train_fill[predictors], train_fill[column_name])

        data.loc[fill_rows, column_name] = model_fill.predict(
            data.loc[fill_rows, predictors]
        )

    # Return only once every product code has been handled.
    return data

In [17]:
def impute_data(data: pd.DataFrame):
    """Impute the ``feature`` columns with a separate KNN imputer per product code.

    A fresh ``KNNImputer(n_neighbors=5)`` is fit on, and applied to, the rows of
    each distinct ``product_code``; the result overwrites those rows' ``feature``
    columns in ``data``.

    Args:
        data: Frame with a ``product_code`` column and the ``feature`` columns.
            It is modified in place.

    Returns:
        The same ``data`` object.
    """
    for code in data.product_code.unique():
        rows = data.product_code == code
        imputer = KNNImputer(n_neighbors=5)
        data.loc[rows, feature] = imputer.fit_transform(data.loc[rows, feature])
    return data

In [18]:
def preprocess_data():
    """Build the combined, feature-engineered and imputed train+test frame.

    Concatenates the module-level ``train`` and ``test`` frames, adds the
    engineered columns, then fills ``measurement_17`` via ``fill_column`` and
    the remaining gaps via ``impute_data``.

    Returns:
        The processed combined frame (train rows first, then test rows).
    """
    combined = pd.concat([train, test])

    # 0/1 indicator columns for originally missing measurements.
    for source_col, flag_col in (
        ("measurement_3", "m3_missing"),
        ("measurement_5", "m5_missing"),
    ):
        combined[flag_col] = combined[source_col].isnull().astype(np.int8)

    # Values of measurement_2 below 15 are raised to 15.
    combined["measurement_2"] = combined["measurement_2"].clip(lower=15)
    combined["area"] = combined["attribute_2"].values * combined["attribute_3"].values
    combined["loading"] = np.log1p(combined["loading"])

    # Regression fill first, then KNN imputation on the result.
    return impute_data(fill_column(combined, "measurement_17"))

In [19]:
def scale(
    train_data: pd.DataFrame,
    val_data: pd.DataFrame,
    test_data: pd.DataFrame,
    feats: list,
):
    """Standardize the ``feats`` columns using statistics from ``train_data`` only.

    A ``StandardScaler`` is fit on ``train_data[feats]`` and applied to the
    same columns of all three frames. The inputs are not modified; scaled
    copies are returned.

    Args:
        train_data: Frame the scaler is fit on.
        val_data: Validation frame, transformed with the fitted scaler.
        test_data: Test frame, transformed with the fitted scaler.
        feats: Names of the columns to scale.

    Returns:
        Tuple ``(new_train, new_val, new_test)`` of copies with scaled columns.
    """
    scaler = StandardScaler()
    scaler.fit(train_data[feats])

    scaled_frames = []
    for frame in (train_data, val_data, test_data):
        values = scaler.transform(frame[feats])
        scaled = frame.copy()
        scaled[feats] = values
        scaled_frames.append(scaled)

    new_train, new_val, new_test = scaled_frames
    return new_train, new_val, new_test

In [20]:
# Run the preprocessing, then split the combined frame back apart:
# rows with a `failure` value go to copy_train, rows without go to copy_test.
data = preprocess_data()
copy_train = data.loc[data["failure"].notna()]
copy_test = data.loc[data["failure"].isna()]

In [21]:
# Columns that are scaled and passed to the classifiers.
feats = [
    "loading",
    "measurement_17",
    "m3_missing",
    "m5_missing",
]

In [None]:
# Average the positive-class probabilities of the saved per-fold models over
# the test rows.
save_path = Path("./model")
# glob() yields files in arbitrary order; sort so fold i always loads the same file.
model_files = sorted(save_path.glob("*joblib"))

n_splits = 5
if len(model_files) < n_splits:
    raise FileNotFoundError(
        f"Expected at least {n_splits} model files in {save_path}, "
        f"found {len(model_files)}"
    )

lr_test = np.zeros(len(copy_test))
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
X = copy_train.drop(["failure"], axis=1)
y = copy_train["failure"].astype(int)
# Use a dedicated name: rebinding the global `test` would change what
# preprocess_data() reads if that cell is run again.
test_features = copy_test.drop(["failure"], axis=1)
for fold_idx, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    print(f"Fold: {fold_idx}")
    x_train, x_val = X.iloc[train_idx], X.iloc[val_idx]

    # scale() returns copies, so test_features is left untouched across folds.
    x_train, x_val, x_test = scale(x_train, x_val, test_features, feats)

    # NOTE: joblib.load unpickles the file; only load model files you trust.
    model: AdaBoostClassifier = load(model_files[fold_idx])
    lr_test += model.predict_proba(x_test[feats])[:, 1] / n_splits

In [23]:
# Store the averaged predictions and write the id/failure submission file.
submission["failure"] = lr_test
submission.loc[:, ["id", "failure"]].to_csv("109550050.csv", index=False)