<a href="https://colab.research.google.com/github/Batwan01/2024-Challenge/blob/main/history/24-9-9/MLP5%2BPReLU_base_ml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import (LinearRegression, Ridge, Lasso, ElasticNet, Lars, LassoLars, OrthogonalMatchingPursuit, BayesianRidge, ARDRegression, PassiveAggressiveRegressor, RANSACRegressor, HuberRegressor)
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import BaggingRegressor, ExtraTreesRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.multioutput import MultiOutputRegressor

# 재현 가능성을 위한 시드 고정
RANDOM_SEED = 18
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
# 데이터 로드 및 전처리
train_csv_path = "/content/drive/MyDrive/Colab Notebooks/contest/samsung/train.csv"
test_csv_path = "/content/drive/MyDrive/Colab Notebooks/contest/samsung/test.csv"
train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)
com = pd.concat([train, test])
com = com.drop(['x_2', 'x_6'], axis=1)

train_data = com[:40118]
X_test_df = com[40118:].drop('y', axis=1).iloc[:, 1:]

# 입력 데이터와 라벨 분리
X_train_df = train_data.drop(['ID', 'y'], axis=1)
y_train_df = train_data['y']

# 70 미만 값 제거
mask = y_train_df >= 70
X_train_df = X_train_df[mask]
y_train_df = y_train_df[mask]

# 스무딩 처리 (여기서는 가정된 변수명을 그대로 사용함)
X_train = pd.get_dummies(X_train_df, drop_first=True)
X_test = pd.get_dummies(X_test_df, drop_first=True)

# X_train과 X_test의 열 차원 맞추기
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

print(X_test.shape)
print(X_train.shape)

(4986, 9)
(40110, 9)


In [14]:
def get_stacking_ml_datasets(model, X_train_n, y_train_n, X_test_n, n_folds=5, fitting=True):
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_SEED)

    # 넘파이 배열로 변환
    if isinstance(X_train_n, pd.DataFrame):
        X_train_n = X_train_n.values
    if isinstance(X_test_n, pd.DataFrame):
        X_test_n = X_test_n.values

    if y_train_n.ndim == 1:
        train_fold_pred = np.zeros((X_train_n.shape[0], 1))
        test_pred = np.zeros((X_test_n.shape[0], 1, n_folds))
    else:
        train_fold_pred = np.zeros((X_train_n.shape[0], y_train_n.shape[1]))
        test_pred = np.zeros((X_test_n.shape[0], y_train_n.shape[1], n_folds))

    for folder_counter, (train_index, valid_index) in enumerate(kf.split(X_train_n, y_train_n)):
        X_tr = X_train_n[train_index]
        y_tr = y_train_n[train_index]

        # y_tr이 1차원이면 2차원으로 변환
        if y_tr.ndim == 1:
            y_tr = y_tr.reshape(-1, 1)

        X_te = X_train_n[valid_index]

        if fitting:
            model.fit(X_tr, y_tr)

        train_pred = model.predict(X_te)
        if train_pred.ndim == 1:
            train_pred = train_pred.reshape(-1, 1)
        train_fold_pred[valid_index] = train_pred

        test_fold_pred = model.predict(X_test_n)
        if test_fold_pred.ndim == 1:
            test_fold_pred = test_fold_pred.reshape(-1, 1)
        test_pred[:, :, folder_counter] = test_fold_pred

    test_pred_mean = np.mean(test_pred, axis=2)

    return train_fold_pred, test_pred_mean


In [15]:
# NRMSE 계산 함수
def lg_nrmse(gt, preds):
    all_nrmse = []
    for idx in range(gt.shape[1]):
        rmse = mean_squared_error(gt[:,idx], preds[:,idx], squared=False)
        nrmse = rmse / np.mean(np.abs(gt[:,idx]))
        all_nrmse.append(nrmse)
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:14])
    return score

# 여러 모델 정의
base_ml = {
    'Linear Regression': LinearRegression(n_jobs=-1),
    'Ridge': Ridge(random_state=RANDOM_SEED),
    'Lasso': Lasso(random_state=RANDOM_SEED),
    'ElasticNet': ElasticNet(random_state=RANDOM_SEED),
    #'Lars': Lars(random_state=RANDOM_SEED),
    #'LassoLars': LassoLars(random_state=RANDOM_SEED),
    #'OMP': OrthogonalMatchingPursuit(),
    'BayesianRidge': MultiOutputRegressor(BayesianRidge()),
    #'ARDRegression': MultiOutputRegressor(ARDRegression()),
    #'PAR': MultiOutputRegressor(PassiveAggressiveRegressor(random_state=RANDOM_SEED)),
    'RANSAC': RANSACRegressor(random_state=RANDOM_SEED),
    'Huber': MultiOutputRegressor(HuberRegressor()),
    #'KNN': KNeighborsRegressor(n_jobs=-1),
    'DecisionTree': DecisionTreeRegressor(random_state=RANDOM_SEED),
    #'ExtraTree': ExtraTreeRegressor(random_state=RANDOM_SEED),
    'Bagging': BaggingRegressor(n_jobs=-1, random_state=RANDOM_SEED),
    'ExtraTrees': ExtraTreesRegressor(n_jobs=-1, random_state=RANDOM_SEED),
    'RandomForest': RandomForestRegressor(n_jobs=-1, random_state=RANDOM_SEED),
    'HistGradientBoosting': MultiOutputRegressor(HistGradientBoostingRegressor(random_state=RANDOM_SEED)),
    'XGBoost': XGBRegressor(tree_method='gpu_hist', gpu_id=0, n_jobs=-1, random_state=RANDOM_SEED),
    'LightGBM': MultiOutputRegressor(LGBMRegressor(n_jobs=-1, random_state=RANDOM_SEED)),
    'CatBoost': MultiOutputRegressor(CatBoostRegressor(task_type="GPU", devices='0', verbose=False, random_state=RANDOM_SEED))
}

In [16]:
# 스태킹 데이터셋 생성
meta_ml_X_train = []
meta_ml_X_test = []
n_folds = 5
for name, model in base_ml.items():
    print(f'Running {name}...')

    # 인덱스 리셋 (기존 인덱스 문제를 방지하기 위해)
    X_train_reset = X_train.reset_index(drop=True)
    y_train_reset = y_train_df.reset_index(drop=True)

    # MultiOutputRegressor일 경우 y_train을 2차원으로 변환
    if isinstance(model, MultiOutputRegressor):
        # y_train_df의 모양이 1차원일 경우 2차원으로 변환
        if y_train_reset.ndim == 1:
            y_train_reshaped = y_train_reset.values.reshape(-1, 1)
        else:
            y_train_reshaped = y_train_reset.values

        temp_X_train, temp_X_test = get_stacking_ml_datasets(model, X_train_reset, y_train_reshaped, X_test, n_folds)
    else:
        # MultiOutputRegressor가 아닌 경우 1차원 y_train 사용
        temp_X_train, temp_X_test = get_stacking_ml_datasets(model, X_train_reset, y_train_reset.values.ravel(), X_test, n_folds)

    meta_ml_X_train.append(temp_X_train)
    meta_ml_X_test.append(temp_X_test)

# 스태킹 데이터 결합
meta_ml_X_train = np.hstack(meta_ml_X_train)
meta_ml_X_test = np.hstack(meta_ml_X_test)


Running Linear Regression...
Running Ridge...
Running Lasso...
Running ElasticNet...
Running BayesianRidge...
Running RANSAC...
Running Huber...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

Running DecisionTree...
Running Bagging...


  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)


Running ExtraTrees...


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


Running RandomForest...


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


Running HistGradientBoosting...
Running XGBoost...



    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.



    E.g. tree_method = "hist", device = "cuda"



Running LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002617 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 32088, number of used features: 9
[LightGBM] [Info] Start training from score 83.642449
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003069 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 32088, number of used features: 9
[LightGBM] [Info] Start training from score 83.652839
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002478 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 32088, number of used features: 9
[LightGB

In [17]:
# 메타 모델 학습 및 예측
meta_clf = LinearRegression()
meta_clf.fit(meta_ml_X_train, y_train_df)
prediction = meta_clf.predict(meta_ml_X_test)

# 결과값을 반올림하여 처리
result = prediction.round(3)

In [18]:
# 결과 저장 (1차원 배열 처리)
submission = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/contest/samsung/sample_submission.csv")

# 'y' 열에 예측 결과 할당 (결과가 1차원 배열일 때)
submission['y'] = result  # 'y' 컬럼에 1차원 예측 결과를 넣음

# CSV 파일로 저장
submission.to_csv("/content/drive/MyDrive/Colab Notebooks/contest/samsung/results/Stacking_Predictions.csv", index=False)


In [None]:
def find_non_matching_ids(file1, file2):
    # 두 개의 CSV 파일을 읽어옴
    df1 = pd.read_csv(file1)
    df2 = pd.read_csv(file2)

    # y 값 기준으로 내림차순 정렬
    df1_sorted = df1.sort_values(by='y', ascending=False)
    df2_sorted = df2.sort_values(by='y', ascending=False)

    # file1의 상위 10% 항목 계산
    top_10_percent_count = int(len(df1_sorted) * 0.1)
    top_10_percent_ids_df1 = set(df1_sorted.head(top_10_percent_count)['ID'])

    # file2의 상위 10% ID 추출
    top_10_percent_ids_df2 = set(df2_sorted.head(top_10_percent_count)['ID'])

    # file1의 상위 10% 중 file2의 상위 10%에 없는 ID 계산
    non_matching_ids = top_10_percent_ids_df1 - top_10_percent_ids_df2
    num_non_matching = len(non_matching_ids)

    # 결과 출력
    print(f"file1의 상위 10% 항목 개수: {top_10_percent_count}")
    print(f"file1의 상위 10% 중 file2에 없는 항목 개수: {num_non_matching}")
    print(f"file1의 상위 10% 중 file2에 없는 항목 ID: {non_matching_ids}")

    return top_10_percent_count, num_non_matching, list(non_matching_ids)

# 사용 예시
file1 ='/content/drive/MyDrive/Colab Notebooks/contest/samsung/MLP_Residual_Connection_drop_x2_x6(0.752).csv' # best 성능 파일
file2 = '/content/drive/MyDrive/Colab Notebooks/contest/samsung/MLP_Residual_Connection_18.csv'
file3 = '/content/drive/MyDrive/Colab Notebooks/contest/samsung/result/Stacking_Predictions.csv' # 측정하고자 하는 파일
top_10_percent_count, num_non_matching, non_matching_ids = find_non_matching_ids(file1, file3)
top_10_percent_count, num_non_matching, non_matching_ids = find_non_matching_ids(file2, file3)

file1의 상위 10% 항목 개수: 498
file1의 상위 10% 중 file2에 없는 항목 개수: 32
file1의 상위 10% 중 file2에 없는 항목 ID: {'TEST_1978', 'TEST_3265', 'TEST_4828', 'TEST_1510', 'TEST_1178', 'TEST_0425', 'TEST_1309', 'TEST_1853', 'TEST_4876', 'TEST_3536', 'TEST_4070', 'TEST_0037', 'TEST_4154', 'TEST_0103', 'TEST_2220', 'TEST_2538', 'TEST_1574', 'TEST_1803', 'TEST_4033', 'TEST_4043', 'TEST_0635', 'TEST_4069', 'TEST_0898', 'TEST_1909', 'TEST_4179', 'TEST_0107', 'TEST_4738', 'TEST_0935', 'TEST_3718', 'TEST_2772', 'TEST_3512', 'TEST_4551'}
file1의 상위 10% 항목 개수: 498
file1의 상위 10% 중 file2에 없는 항목 개수: 35
file1의 상위 10% 중 file2에 없는 항목 ID: {'TEST_1321', 'TEST_1978', 'TEST_3265', 'TEST_4828', 'TEST_1510', 'TEST_1178', 'TEST_0425', 'TEST_1309', 'TEST_1853', 'TEST_4876', 'TEST_3536', 'TEST_4070', 'TEST_0037', 'TEST_4154', 'TEST_0103', 'TEST_2220', 'TEST_2538', 'TEST_1574', 'TEST_4511', 'TEST_4043', 'TEST_4033', 'TEST_0635', 'TEST_0784', 'TEST_4069', 'TEST_2632', 'TEST_0898', 'TEST_1909', 'TEST_0107', 'TEST_4738', 'TEST_0935', 'TES