In [22]:
from pathlib import Path

import numpy as np
import pandas as pd
from colorama import Back, Fore, Style
from sklearn.ensemble import AdaBoostClassifier
from sklearn.impute import KNNImputer
from sklearn.linear_model import HuberRegressor, LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

### Read Data From CSV

In [23]:
# Load the training set, the test set and the sample submission from ./dataset.
basedir = Path("./dataset")
train, test, submission = (
    pd.read_csv(basedir / file_name)
    for file_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
# Quick sanity check: frame sizes and how many rows carry each failure label.
print(f"{train.shape=}", f"{test.shape=}")
print(f"failure 0 : {len(train.loc[train.failure == 0])}")
print(f"failure 1 : {len(train.loc[train.failure == 1])}")

### Preprocess Data

#### Global Variables

In [25]:
# Per product code: the measurement numbers whose columns serve as predictors
# when the missing values of measurement_17 are estimated by regression.
_FILL_PREDICTOR_IDS = {
    "A": (5, 6, 8),
    "B": (4, 5, 7),
    "C": (5, 7, 8, 9),
    "D": (5, 6, 7, 8),
    "E": (4, 5, 6, 8),
    "F": (4, 5, 6, 7),
    "G": (4, 6, 8, 9),
    "H": (4, 5, 7, 8, 9),
    "I": (3, 7, 8),
}

# Maps product code -> list of predictor column names ("measurement_<n>").
fill_dict = {
    product_code: [f"measurement_{number}" for number in numbers]
    for product_code, numbers in _FILL_PREDICTOR_IDS.items()
}

# Numeric model inputs: "loading" plus every measurement_* column of the test set.
feature = [
    column
    for column in test.columns
    if column == "loading" or column.startswith("measurement")
]

#### Preprocess Functions

In [26]:
def without_any_null(data: pd.DataFrame):
    """Return a boolean Series that is True for rows with no missing value.

    Args:
        data: Frame whose rows are checked across all of its columns.

    Returns:
        Boolean Series indexed like ``data``.
    """
    return data.notna().all(axis=1)

def only_target_null(data: pd.DataFrame, feats: list, target: str) -> pd.Series:
    """Flag rows where ``target`` is missing but every column in ``feats`` is present.

    Args:
        data: Frame containing the ``feats`` columns and the ``target`` column.
        feats: Column names that must all be non-null for a row to qualify.
        target: Column name that must be null for a row to qualify.

    Returns:
        Boolean Series indexed like ``data``.
    """
    predictors_present = without_any_null(data[feats])
    target_missing = data[target].isna()
    return predictors_present & target_missing

In [27]:
def fill_column(data: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """Fill missing values of ``column_name`` with one regression per product code.

    For every distinct ``product_code`` a ``HuberRegressor`` is fitted on the
    rows of that code in which ``column_name`` and all predictor columns from
    ``fill_dict[code]`` are present. The fitted model then predicts
    ``column_name`` for the rows of that code in which only ``column_name`` is
    missing. Rows that also miss a predictor column are left untouched.

    Args:
        data: Frame with a ``product_code`` column, ``column_name`` and the
            predictor columns named in ``fill_dict``. It is modified in place.
        column_name: Name of the column whose gaps are filled.

    Returns:
        The same ``data`` object, after every product code has been processed.

    Raises:
        KeyError: If a product code has no entry in ``fill_dict``.
    """
    for code in data.product_code.unique():
        correlation_col = fill_dict[code]
        is_code = data.product_code == code

        # Complete rows of this product code: predictors and target all present.
        train_fill = data.loc[is_code, correlation_col + [column_name]].dropna(
            how="any"
        )
        # Rows of this product code that can be predicted: only the target is missing.
        fill_rows = is_code & only_target_null(data, correlation_col, column_name)

        # Nothing to learn from or nothing to fill: fitting / predicting on an
        # empty frame would raise, so move on to the next product code.
        if train_fill.empty or not fill_rows.any():
            continue

        model_fill = HuberRegressor()
        model_fill.fit(train_fill[correlation_col], train_fill[column_name])

        data.loc[fill_rows, column_name] = model_fill.predict(
            data.loc[fill_rows, correlation_col]
        )

    # Return only after ALL product codes were handled (returning inside the
    # loop would stop after the first code).
    return data

In [28]:
def impute_data(data: pd.DataFrame):
    """Impute the remaining gaps in the ``feature`` columns, one product code at a time.

    A fresh ``KNNImputer`` with 5 neighbours is fitted on the rows of each
    product code and its output is written back into those rows.

    Args:
        data: Frame with a ``product_code`` column and the columns listed in
            the module-level ``feature`` list. It is modified in place.

    Returns:
        The same ``data`` object.
    """
    for code in data.product_code.unique():
        code_rows = data.product_code == code
        imputer = KNNImputer(n_neighbors=5)
        imputed_values = imputer.fit_transform(data.loc[code_rows, feature])
        data.loc[code_rows, feature] = imputed_values
    return data

In [29]:
def preprocess_data():
    """Build the combined, feature-engineered and imputed train+test frame.

    Reads the module-level ``train`` and ``test`` frames and:
      * adds ``m3_missing`` / ``m5_missing`` indicator columns (int8),
      * clips ``measurement_2`` from below at 15,
      * adds ``area`` as ``attribute_2 * attribute_3``,
      * replaces ``loading`` with ``log1p(loading)``,
      * fills ``measurement_17`` via ``fill_column`` and the remaining gaps
        via ``impute_data``.

    Returns:
        The processed frame containing the rows of ``train`` followed by the
        rows of ``test``.
    """
    # NOTE(review): concat keeps each input's own index labels, so labels
    # presumably repeat between the train and test parts - confirm before
    # relying on label-based lookups.
    combined = pd.concat([train, test])
    combined = combined.assign(
        m3_missing=lambda df: df["measurement_3"].isnull().astype(np.int8),
        m5_missing=lambda df: df["measurement_5"].isnull().astype(np.int8),
        measurement_2=lambda df: df["measurement_2"].clip(15),
        area=lambda df: df["attribute_2"].values * df["attribute_3"].values,
        loading=lambda df: np.log1p(df["loading"]),
    )
    combined = fill_column(combined, "measurement_17")
    return impute_data(combined)

In [30]:
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler

In [31]:
def scale(
    train_data: pd.DataFrame,
    val_data: pd.DataFrame,
    test_data: pd.DataFrame,
    feats: list,
):
    """Min-max scale the ``feats`` columns of three frames.

    The scaler is fitted on ``train_data`` only and then applied to the
    validation and test frames. The inputs are not modified; scaled copies
    are returned.

    Args:
        train_data: Frame used to fit the scaler.
        val_data: Validation frame, transformed with the fitted scaler.
        test_data: Test frame, transformed with the fitted scaler.
        feats: Names of the columns to scale.

    Returns:
        Tuple ``(new_train, new_val, new_test)`` of scaled copies.
    """
    scaler = MinMaxScaler()

    new_train = train_data.copy()
    new_train[feats] = scaler.fit_transform(train_data[feats])

    new_val = val_data.copy()
    new_val[feats] = scaler.transform(val_data[feats])

    new_test = test_data.copy()
    new_test[feats] = scaler.transform(test_data[feats])

    return new_train, new_val, new_test

In [32]:
data = preprocess_data()
# Rows that carry a failure label form the training part; the rest is the test part.
has_label = data.failure.notnull()
copy_train = data[has_label]
copy_test = data[~has_label]
# print(train.shape, test.shape)

### Train Model

#### StratifiedKFold

In [33]:
from joblib import dump, load
import pickle

In [34]:
# Directory into which train_model() dumps one fitted model per fold.
save_path = Path("./model")

In [35]:
def train_model(feats, n_splits: int = 5):
    """Train an AdaBoost-wrapped logistic regression with stratified K-fold CV.

    Reads the module-level ``copy_train`` and ``copy_test`` frames. In every
    fold the ``feats`` columns are min-max scaled (scaler fitted on the fold's
    training part), a model is fitted and dumped to
    ``save_path / f"model_{fold_idx}.joblib"``, and probabilities are
    predicted for the validation part and for the test frame. The mean fold
    AUC and the out-of-fold AUC are printed.

    Args:
        feats: Names of the columns used as model inputs.
        n_splits: Number of cross-validation folds. Defaults to 5.

    Returns:
        numpy array with one value per row of ``copy_test``: the predicted
        probability of class 1, averaged over all folds.
    """
    X = copy_train.drop(["failure"], axis=1)
    y = copy_train["failure"].astype(int)
    # Named differently from the global ``test`` frame to avoid shadowing it.
    test_features = copy_test.drop(["failure"], axis=1)

    lr_oof = np.zeros(len(X))
    lr_test = np.zeros(len(test_features))
    lr_auc = 0.0

    # joblib.dump does not create missing directories, so make sure the
    # target folder exists before the first fold is saved.
    save_path.mkdir(parents=True, exist_ok=True)

    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    for fold_idx, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        print(f"Fold: {fold_idx}")
        x_train, x_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # scale() returns scaled copies, so the shared test frame stays untouched.
        x_train, x_val, x_test = scale(x_train, x_val, test_features, feats)

        base_model = LogisticRegression(
            max_iter=1000, penalty="l2", solver="newton-cg", random_state=42
        )
        model = AdaBoostClassifier(base_model, n_estimators=2, learning_rate=0.05)

        model.fit(x_train[feats], y_train)
        dump(model, save_path / f"model_{fold_idx}.joblib")

        val_preds = model.predict_proba(x_val[feats])[:, 1]
        # Average over the actual number of folds instead of a hard-coded 5.
        lr_auc += roc_auc_score(y_val, val_preds) / n_splits
        lr_test += model.predict_proba(x_test[feats])[:, 1] / n_splits
        lr_oof[val_idx] = val_preds

    # Reset the terminal style after each coloured line so later output is
    # not printed in green/red.
    print(
        f"{Fore.GREEN}{Style.BRIGHT}Average auc = {round(lr_auc, 5)}{Style.RESET_ALL}"
    )
    print(
        f"{Fore.RED}{Style.BRIGHT}OOF auc = {round(roc_auc_score(y, lr_oof), 5)}{Style.RESET_ALL}"
    )
    return lr_test

In [36]:
# Candidate feature sets for train_model(), from smallest to largest.
feature0 = ["loading", "measurement_17"]

# feature0 plus the two missing-value indicator columns.
feature1 = [*feature0, "m3_missing", "m5_missing"]

# Indicators, two raw measurements, then the feature0 columns.
feature2 = ["m3_missing", "m5_missing"]
feature2 += ["measurement_1", "measurement_2"]
feature2 += ["loading", "measurement_17"]

# Indicators, two raw measurements, the feature0 columns and the area column.
feature3 = ["m3_missing", "m5_missing"]
feature3 += ["measurement_2", "measurement_3"]
feature3 += ["loading", "measurement_17", "area"]

In [None]:
# Alternative feature sets; swap the active line to compare them.
# lr = train_model(feature0)
lr = train_model(feature1)
# lr = train_model(feature2)
# lr = train_model(feature3)

In [38]:
# Store the fold-averaged test predictions and export the id/failure pairs.
submission["failure"] = lr
submission.loc[:, ["id", "failure"]].to_csv("109550050.csv", index=False)

### Other Findings

#### PCA

In [39]:
# from sklearn.decomposition import PCA

# new_feature = [f for f in test.columns if f.startswith("measurement") and f != "measurement_17"]
# extract = data[new_feature]

# pca = PCA(n_components=1)
# new_col = pca.fit_transform(extract)

# data["c1"] = new_col[:, 0]
# copy_train = data[data.failure.notnull()]
# copy_test = data[data.failure.isnull()]

#### Manual Folds

In [40]:
# folds = [
#     [["C", "D", "E"], ["A", "B"]],
#     [["B", "D", "E"], ["A", "C"]],
#     [["B", "C", "E"], ["A", "D"]],
#     [["B", "C", "D"], ["A", "E"]],
#     [["A", "D", "E"], ["B", "C"]],
#     [["A", "C", "E"], ["B", "D"]],
#     [["A", "C", "D"], ["B", "E"]],
#     [["A", "B", "E"], ["C", "D"]],
#     [["A", "B", "D"], ["C", "E"]],
#     [["A", "B", "C"], ["D", "E"]],
# ]

In [41]:
# lr_oof_1 = np.zeros(len(train))
# lr_oof_2 = np.zeros(len(train))
# lr_test = np.zeros(len(test))
# lr_auc = 0
# lr_acc = 0
# importance_list = []

# for (train_code, val_code) in folds:
#     train_idx = X["product_code"].isin(train_code)
#     val_idx = X["product_code"].isin(val_code)
#     x_train = X[train_idx]
#     y_train = y[train_idx]
#     x_val = X[val_idx]
#     y_val = y[val_idx]
#     x_test = test.copy()

#     x_train, x_val, x_test = scale(x_train, x_val, x_test, select_feature)

#     model = LogisticRegression(max_iter=500, C=0.0001, penalty="l2", solver="newton-cg")
#     model.fit(x_train[select_feature], y_train)
#     importance_list.append(model.coef_.ravel())

#     val_preds = model.predict_proba(x_val[select_feature])[:, 1]
#     lr_auc += roc_auc_score(y_val, val_preds) / 10
#     y_preds = model.predict(x_val[select_feature])
#     lr_acc += accuracy_score(y_val, y_preds) / 10
#     lr_test += model.predict_proba(x_test[select_feature])[:, 1] / 10
#     lr_oof_1[val_idx] = val_preds
#     lr_oof_2[val_idx] = y_preds

# print(
#     f"{Fore.GREEN}{Style.BRIGHT}Average auc = {round(lr_auc, 10)}, Average acc = {round(lr_acc, 10)}{Style.RESET_ALL}"
# )
# print(
#     f"{Fore.RED}{Style.BRIGHT}OOF auc = {round(roc_auc_score(y, lr_oof_1), 10)}, OOF acc = {round(accuracy_score(y, lr_oof_2), 10)}{Style.RESET_ALL}"
# )

#### Ensemble Methods

In [42]:
# select_feature = ["loading", "measurement_17", "m3_missing", "m5_missing"]

# X = copy_train.drop(["failure"], axis=1)
# y = copy_train["failure"].astype(int)
# test = copy_test.drop(["failure"], axis=1)

# lr_oof_1 = np.zeros(len(train))
# lr_oof_2 = np.zeros(len(train))
# lr2_test = np.zeros(len(test))
# lr_auc = 0
# lr_acc = 0
# importance_list = []
# lr_rate = [0.1, 0.01, 0.001]
# n_iter = [100, 300, 500]
# max_depth = [3, 5, 7, 9]
# n_est = [100, 200, 300, 400]

# param_grid = dict(max_depth=max_depth, max_iter=n_iter, learning_rate=lr_rate)


# kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# for fold_idx, (train_idx, val_idx) in enumerate(kf.split(X, y)):
#     print(f"Fold: {fold_idx}")
#     x_train, x_val = X.iloc[train_idx], X.iloc[val_idx]
#     y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
#     x_test = test.copy()

#     x_train, x_val, x_test = scale(x_train, x_val, x_test, select_feature)

#     # model = LogisticRegression(max_iter=100, penalty='l2', solver='newton-cg', random_state=42)
#     # model = RandomForestClassifier(n_estimators=5, max_depth=3, random_state=42)
#     # model = AdaBoostClassifier(model, n_estimators=100)
#     model = HistGradientBoostingClassifier(learning_rate=0.01, max_depth=3, max_iter=60)
#     # grid = GridSearchCV(model, param_grid, scoring=)
#     # grid_result = grid.fit(x_train[select_feature], y_train)
#     # print(f"Best accuracy: {grid_result.best_score_}, best parameter combination: {grid_result.best_params_}")

#     # model = AdaBoostClassifier(model, n_estimators=20, learning_rate=0.01, random_state=42)
#     # model = HistGradientBoostingClassifier(random_state=42, max_iter=50)
#     model.fit(x_train[select_feature], y_train)
#     # importance_list.append(model.coef_.ravel())

#     val_preds = model.predict_proba(x_val[select_feature])[:, 1]
#     lr_auc += roc_auc_score(y_val, val_preds) / 5
#     y_preds = model.predict(x_val[select_feature])
#     lr_acc += accuracy_score(y_val, y_preds) / 5
#     lr2_test += model.predict_proba(x_test[select_feature])[:, 1] / 5
#     lr_oof_1[val_idx] = val_preds
#     lr_oof_2[val_idx] = y_preds

# print(importance_list)
# print(
#     f"{Fore.GREEN}{Style.BRIGHT}Average auc = {round(lr_auc, 5)}, Average acc = {round(lr_acc, 5)}{Style.RESET_ALL}"
# )
# print(
#     f"{Fore.RED}{Style.BRIGHT}OOF auc = {round(roc_auc_score(y, lr_oof_1), 5)}, OOF acc = {round(accuracy_score(y, lr_oof_2), 5)}{Style.RESET_ALL}"
# )