In [None]:
# Attach Google Drive so the CSV files under MyDrive are reachable from this runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [None]:
pip install optuna

Collecting optuna
  Downloading optuna-3.4.0-py3-none-any.whl (409 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m409.6/409.6 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.12.1-py3-none-any.whl (226 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.8/226.8 kB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.12.1 colorlog-6.7.0 optuna-3.4.0


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import lightgbm as lgb
from lightgbm import LGBMRegressor
import bisect
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import KFold, train_test_split
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, VotingRegressor
from sklearn.model_selection import GridSearchCV

# Data Loading
DATA_DIR = '/content/drive/MyDrive'
ID_COLUMN = 'SAMPLE_ID'

# NOTE(review): the saved output of this cell shows a KeyError, and the only key
# lookups in these lines are the column drops. errors='ignore' keeps the load
# working when a file has no SAMPLE_ID column; when the column is present the
# result is unchanged. Confirm which file was missing it.
train = pd.read_csv(f'{DATA_DIR}/train.csv').drop(columns=[ID_COLUMN], errors='ignore')
test = pd.read_csv(f'{DATA_DIR}/test.csv').drop(columns=[ID_COLUMN], errors='ignore')

KeyError: ignored

In [None]:
# Date Preprocessing
# Expand the ATA timestamp into calendar components, then discard the raw column.
DATE_PARTS = ('year', 'month', 'day', 'hour', 'minute', 'weekday')

for frame in (train, test):
    frame['ATA'] = pd.to_datetime(frame['ATA'])
    for part in DATE_PARTS:
        # Each part name is also the name of the .dt accessor attribute.
        frame[part] = getattr(frame['ATA'].dt, part)
    frame.drop(columns='ATA', inplace=True)

In [None]:
# Encoding
categorical_features = ['ARI_CO', 'ARI_PO', 'SHIP_TYPE_CATEGORY', 'ID', 'SHIPMANAGER', 'FLAG']
UNSEEN_LABEL = '-1'  # sentinel category for test values that never occur in train
encoders = {}
for feature in tqdm(categorical_features, desc="Encoding features"):
    # Work on string views of both frames so the membership test below compares
    # strings with strings (raw numeric or NaN values would never match the
    # string classes and would all be treated as unseen).
    train_values = train[feature].astype(str)
    test_values = test[feature].astype(str)

    # Fit ONE mapping on the train categories plus the sentinel. Adding the
    # sentinel to classes_ only after train has been encoded would shift the
    # integer code of every category that sorts after it, so train and test
    # would disagree on the same category.
    le = LabelEncoder()
    le.fit(train_values.tolist() + [UNSEEN_LABEL])

    # Mapping unseen data in test set to '-1'
    is_known = test_values.isin(le.classes_)
    test_values = test_values.where(is_known, UNSEEN_LABEL)

    train[feature] = le.transform(train_values)
    test[feature] = le.transform(test_values)
    encoders[feature] = le

In [None]:
# Missing Values Handling
# Compute the training-set column means once and reuse them for both frames, so
# test is imputed with exactly the statistics used for train. numeric_only=True
# keeps this working if any non-numeric column is still present.
train_means = train.mean(numeric_only=True)
train = train.fillna(train_means)
test = test.fillna(train_means)

In [None]:
# Splitting features and target
TARGET_COLUMN = 'CI_HOUR'
y_train = train[TARGET_COLUMN]
X_train = train.drop(columns=[TARGET_COLUMN])

In [None]:
def train_and_evaluate(models, X_train, y_train, normalize=True):
    """Fit a voting ensemble and report the feature importances of its members.

    Parameters
    ----------
    models : list of (str, estimator) tuples
        Named regressors passed to ``VotingRegressor``.
    X_train : pandas.DataFrame
        Training features; the column labels are used as plot tick labels.
    y_train : array-like
        Training target.
    normalize : bool, default True
        Rescale each model's importances so they sum to 100 before they are
        plotted and averaged. The member libraries report importances on
        different scales (e.g. percentages, fractions, or raw split counts), so
        an unnormalised mean is dominated by whichever model uses the largest
        numbers. Pass ``False`` to average the raw values instead.

    Returns
    -------
    ensemble_model : VotingRegressor
        The fitted ensemble.
    avg_feature_importance : numpy.ndarray
        Per-feature importance averaged over the ensemble members, in the
        column order of ``X_train``.
    """
    ensemble_model = VotingRegressor(models)
    ensemble_model.fit(X_train, y_train)

    n_features = X_train.shape[1]
    positions = list(range(n_features))

    feature_importances = []
    # The fitted sub-models are exposed through ensemble_model.estimators_.
    for (model_name, _), trained_model in zip(models, ensemble_model.estimators_):
        print(f'Model Tune for {model_name}.')

        if isinstance(trained_model, CatBoostRegressor):
            importances = trained_model.get_feature_importance()
        else:
            importances = trained_model.feature_importances_
        importances = np.asarray(importances, dtype=float)

        if normalize:
            total = importances.sum()
            # An all-zero vector cannot be rescaled; leave it untouched.
            if total > 0:
                importances = importances * (100.0 / total)

        feature_importances.append(importances)

        sorted_idx = importances.argsort()
        fig, ax = plt.subplots(figsize=(10, len(X_train.columns)))
        ax.set_title(f"Feature Importances ({model_name})")
        ax.barh(positions, importances[sorted_idx], align='center')
        ax.set_yticks(positions)
        ax.set_yticklabels(X_train.columns[sorted_idx])
        ax.set_xlabel('Importance')
        plt.show()
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)

    avg_feature_importance = np.mean(feature_importances, axis=0)

    return ensemble_model, avg_feature_importance

X_train = train.drop(columns='CI_HOUR')
y_train = train['CI_HOUR']


# Define the models used in the ensemble.
# Each entry is a (name, estimator) pair. The entries must be separated by
# commas: without them Python parses the adjacent parentheses as a call on the
# preceding tuple and raises "TypeError: 'tuple' object is not callable".
models = [
    ("Catboost", CatBoostRegressor()),
    ('LGBM', LGBMRegressor()),
    ('XGBoost', XGBRegressor()),
    ('AdaBoost', AdaBoostRegressor()),
]

# Fit the ensemble and collect the averaged feature importances.
ensemble_model, avg_feature_importance = train_and_evaluate(models, X_train, y_train)

In [None]:
# Drop every feature whose averaged importance falls below the cut-off.
threshold = 5 # Your Threshold
is_low_importance = avg_feature_importance < threshold
low_importance_features = X_train.columns[is_low_importance]

# Apply the same column removal to train and test so their layouts stay aligned.
X_train_reduced, X_test_reduced = (
    frame.drop(columns=low_importance_features) for frame in (X_train, test)
)

In [None]:
# Predicting
# Hold out 30% of the reduced training data, refit the ensemble on the rest,
# and score it on the held-out part.
holdout_split = train_test_split(
    X_train_reduced,
    y_train,
    test_size=0.3,
    random_state=42,
)
X_train_split, X_val_split, y_train_split, y_val_split = holdout_split

# fit() returns the estimator itself, so rebinding keeps the same object.
ensemble_model = ensemble_model.fit(X_train_split, y_train_split)
y_pred = ensemble_model.predict(X_val_split)

mae = mean_absolute_error(y_val_split, y_pred)
print(f'Mean Absolute Error (MAE) on validation set: {mae}')

Learning rate set to 0.099411
0:	learn: 169.3467794	total: 80.3ms	remaining: 1m 20s
1:	learn: 167.1260634	total: 128ms	remaining: 1m 3s
2:	learn: 165.3284005	total: 158ms	remaining: 52.7s
3:	learn: 163.8711872	total: 191ms	remaining: 47.6s
4:	learn: 162.6569686	total: 222ms	remaining: 44.3s
5:	learn: 161.6276500	total: 257ms	remaining: 42.6s
6:	learn: 160.5851973	total: 289ms	remaining: 41s
7:	learn: 159.7741566	total: 322ms	remaining: 39.9s
8:	learn: 159.0857969	total: 358ms	remaining: 39.5s
9:	learn: 158.4729961	total: 393ms	remaining: 38.9s
10:	learn: 157.8711391	total: 424ms	remaining: 38.1s
11:	learn: 157.3993655	total: 458ms	remaining: 37.7s
12:	learn: 156.9361301	total: 490ms	remaining: 37.2s
13:	learn: 156.5639761	total: 531ms	remaining: 37.4s
14:	learn: 156.2169997	total: 578ms	remaining: 37.9s
15:	learn: 155.8974571	total: 613ms	remaining: 37.7s
16:	learn: 155.6269386	total: 645ms	remaining: 37.3s
17:	learn: 155.3821697	total: 685ms	remaining: 37.3s
18:	learn: 155.1584987	tot

In [None]:
# Final prediction and submission file.
# The ensemble was last refit on the reduced feature set, so it must be given
# the test frame with the same low-importance columns removed; passing the full
# `test` frame would present a different set of features than it was fit on.
final_predictions = ensemble_model.predict(X_test_reduced)
submit = pd.read_csv('/content/drive/MyDrive/sample_submission.csv')
submit['CI_HOUR'] = final_predictions
submit.to_csv('/content/drive/MyDrive/ensemble_model_submit.csv', index=False)