In [None]:
# Install AutoGluon quietly; the notebook imports TabularPredictor from it further down.
!pip -q install autogluon

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk every file under /kaggle/input. The inner body is `pass`, so nothing is
# printed or collected; this loop has no effect beyond traversing the directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        pass
    
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

In [None]:
# Read the training set, the test set and the sample-submission template, in that order.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/hyundai-ai-challenge/train.csv',
        '/kaggle/input/hyundai-ai-challenge/test.csv',
        '/kaggle/input/hyundai-ai-challenge/sample_submission.csv',
    )
)

In [None]:
# Columns removed from both frames once the calendar features exist.
# Other drop lists were tried earlier (e.g. also dropping BN and AIR_TEMPERATURE,
# or dropping only SAMPLE_ID / ATA / U_WIND / V_WIND).
drop_col = [
    'SAMPLE_ID', 'ATA', 'U_WIND', 'V_WIND', 'ID',
    'SHIPMANAGER', 'FLAG', 'DRAUGHT', 'BREADTH', 'DEPTH',
]

for df in (train, test):
    # Parse ATA so the .dt accessor can be used on it.
    df['ATA'] = pd.to_datetime(df['ATA'])

    # Expand the timestamp into separate calendar features.
    df['year'] = df['ATA'].dt.year
    df['month'] = df['ATA'].dt.month
    df['day'] = df['ATA'].dt.day
    df['weekday'] = df['ATA'].dt.weekday

    # Drop the raw timestamp together with the other unused columns.
    df.drop(columns=drop_col, inplace=True)

In [None]:
# Row-wise minimum across the three oil columns, added to both frames.
oil = ['DUBAI', 'BRENT', 'WTI']
for df in (train, test):
    df['min_oil'] = df[oil].min(axis=1)

In [None]:
# Find the distinct values of the 'LENGTH' column
unique_values = train['LENGTH'].unique()

# Print the result
print("Unique values in 'LENGTH':", unique_values)

In [None]:
# Training rows whose 'LENGTH' is missing; shown as the cell output.
missing_rows = train.loc[train['LENGTH'].isna()]
missing_rows

In [None]:
# Keep only training rows that have a 'LENGTH' value. The explicit .copy() makes
# `train` an independent frame, so the in-place column assignments performed in
# later cells do not raise pandas' SettingWithCopyWarning.
train = train[train['LENGTH'].notna()].copy()

In [None]:
# Columns cast to int32 in both frames. An earlier version of this list also
# contained DRAUGHT, BREADTH and DEPTH.
int_col = ['LENGTH', 'BUILT', 'DEADWEIGHT', 'GT', 'ATA_LT']
for df in (train, test):
    df[int_col] = df[int_col].astype('int32')

In [None]:
def impute_missing_values(df, change_col, missing_col):
    """Fill NaNs in ``missing_col`` with the mean of the row's group.

    Groups are defined by ``change_col``. Each group mean is computed from the
    rows where ``missing_col`` is present. Rows whose group has no observed
    value are left as NaN, so a later call with a coarser grouping can still
    fill them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place.
    change_col : str or list of str
        Column name(s) to group by.
    missing_col : str
        Column whose missing values are filled.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Positional mask: independent of the index labels, its name, or duplicates.
    is_missing = df[missing_col].isna().to_numpy()
    if not is_missing.any():
        return df

    keys = [change_col] if isinstance(change_col, str) else list(change_col)
    imputed_col = 'Imputed_' + missing_col

    # Per-group mean over the observed rows only.
    group_means = (
        df.loc[~is_missing]
        .groupby(keys)[missing_col]
        .mean()
        .reset_index(name=imputed_col)
    )

    # A left merge keeps the left rows in their original order and, because the
    # group keys are unique on the right, also keeps their count. The result can
    # therefore be written back by position.
    looked_up = (
        df.loc[is_missing, keys]
        .reset_index(drop=True)
        .merge(group_means, on=keys, how='left')
    )
    df.loc[is_missing, missing_col] = looked_up[imputed_col].to_numpy()

    return df

In [None]:
def impute_test_missing_values(train_df, test_df, change_col, missing_col):
    """Fill NaNs in ``test_df[missing_col]`` with group means taken from ``train_df``.

    Group means are computed on ``train_df`` grouped by ``change_col`` and
    looked up for every test row where ``missing_col`` is NaN. Test rows whose
    group does not occur in the training data (or whose training mean is NaN)
    are left as NaN, so a later call with a coarser grouping can still fill
    them.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Source of the group means. It is not modified.
    test_df : pandas.DataFrame
        Frame to impute. It is modified in place.
    change_col : str or list of str
        Column name(s) to group by; must exist in both frames.
    missing_col : str
        Column whose missing values are filled.

    Returns
    -------
    pandas.DataFrame
        The same ``test_df`` object that was passed in.
    """
    # Positional mask: independent of the index labels, its name, or duplicates.
    is_missing = test_df[missing_col].isna().to_numpy()
    if not is_missing.any():
        return test_df

    keys = [change_col] if isinstance(change_col, str) else list(change_col)
    imputed_col = 'Imputed_' + missing_col

    # Group means come from the training data only.
    group_means = (
        train_df.groupby(keys)[missing_col]
        .mean()
        .reset_index(name=imputed_col)
    )

    # A left merge keeps the missing test rows in order and count (the group keys
    # are unique on the right), so the values can be written back by position.
    looked_up = (
        test_df.loc[is_missing, keys]
        .reset_index(drop=True)
        .merge(group_means, on=keys, how='left')
    )
    test_df.loc[is_missing, missing_col] = looked_up[imputed_col].to_numpy()

    return test_df

In [None]:
# One entry per column to impute: (column, message printed when finished, cascade
# of groupings). Each cascade runs from the most specific grouping to the coarsest;
# a value still missing after one level gets another chance at the next level.
imputation_plan = [
    (
        'AIR_TEMPERATURE',
        "temp done",
        [['ARI_PO', 'year', 'month', 'day', 'ATA_LT'], ['ARI_PO', 'month', 'day', 'ATA_LT'],
         ['ARI_PO', 'month', 'ATA_LT'], ['ARI_PO', 'month'], ['ARI_PO'], ['month'], ['year']],
    ),
    (
        'BN',
        "bn done",
        [['ARI_PO', 'year', 'month', 'day', 'ATA_LT'], ['ARI_PO', 'year', 'month', 'day'],
         ['ARI_PO', 'month', 'ATA_LT'], ['ARI_PO', 'year', 'month'], ['ARI_PO'], ['month'], ['year']],
    ),
]

for target_col, done_message, grouping_columns_list in imputation_plan:
    for grouping_columns in grouping_columns_list:
        # Train is imputed first, so the test lookup uses the updated training values.
        train = impute_missing_values(train, grouping_columns, target_col)
        test = impute_test_missing_values(train, test, grouping_columns, target_col)

    print(done_message)



In [None]:
def fill_na_with_mean(train_df, test_df):
    """Fill remaining NaNs in numeric columns with the training column means.

    Both frames are filled in place with the per-column means of ``train_df``.
    Only numeric columns are averaged; non-numeric columns are left untouched.
    This avoids the ``TypeError`` that ``DataFrame.mean()`` raises on
    string/object columns in recent pandas versions.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data; the source of the means. Modified in place.
    test_df : pandas.DataFrame
        Test data; filled with the training means. Modified in place.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, test_df)``, the same objects that were passed in.
    """
    # Compute the means once, from the numeric training columns only.
    train_means = train_df.mean(numeric_only=True)

    if train_df.isna().any().any():  # Any NaN left in the train dataset?
        train_df.fillna(train_means, inplace=True)
        print("NaN values in train data have been replaced with column means.")
    else:
        print("No NaN values in train data.")

    if test_df.isna().any().any():  # Any NaN left in the test dataset?
        # Use the training means so that no test statistics leak into the fill values.
        test_df.fillna(train_means, inplace=True)
        print("NaN values in test data have been replaced with column means from train data.")
    else:
        print("No NaN values in test data.")

    return train_df, test_df

# Fallback after the group-wise imputation: fill whatever NaNs are still left.
train, test = fill_na_with_mean(train, test)


 * 14  AIR_TEMPERATURE     213075 non-null  float64
 * 15  BN                  213954 non-null  float64

In [None]:
# Preview the first 10 rows of the processed training frame.
train.head(10)

In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Column dtypes and non-null counts of the test frame.
test.info()

In [None]:
# from tqdm import tqdm
# from sklearn.preprocessing import LabelEncoder
# import lightgbm as lgb
# from sklearn.ensemble import RandomForestRegressor
# import bisect
# import matplotlib.pyplot as plt

In [None]:
# # Categorical 컬럼 인코딩
# categorical_features = ['ARI_CO', 'ARI_PO', 'SHIP_TYPE_CATEGORY', 'ID', 'SHIPMANAGER', 'FLAG']
# encoders = {}

# for feature in tqdm(categorical_features, desc="Encoding features"):
#     le = LabelEncoder()
#     train[feature] = le.fit_transform(train[feature].astype(str))
#     le_classes_set = set(le.classes_)
#     test[feature] = test[feature].map(lambda s: '-1' if s not in le_classes_set else s)
#     le_classes = le.classes_.tolist()
#     bisect.insort_left(le_classes, '-1')
#     le.classes_ = np.array(le_classes)
#     test[feature] = le.transform(test[feature].astype(str))
#     encoders[feature] = le

In [None]:
# def train_and_evaluate(model, model_name, X_train, y_train):
#     print(f'Model Tune for {model_name}.')
#     model.fit(X_train, y_train)
    
#     feature_importances = model.feature_importances_
#     sorted_idx = feature_importances.argsort()

    
#     plt.figure(figsize=(10, len(X_train.columns)))
#     plt.title(f"Feature Importances ({model_name})")
#     plt.barh(range(X_train.shape[1]), feature_importances[sorted_idx], align='center')
#     plt.yticks(range(X_train.shape[1]), X_train.columns[sorted_idx])
#     plt.xlabel('Importance')
#     plt.show()
    
#     return model, feature_importances

# X_train = train.drop(columns='CI_HOUR')
# y_train = train['CI_HOUR']

# # Model Tune for LGBM
# lgbm_model, lgbm_feature_importances = train_and_evaluate(lgb.LGBMRegressor(), 'LGBM', X_train, y_train)
# rf_model, rf_feature_importances = train_and_evaluate(RandomForestRegressor(), 'Random Forest', X_train, y_train)

In [None]:
from autogluon.tabular import TabularDataset, TabularPredictor

In [None]:
# Time budget passed to the AutoGluon fit, in seconds: 11 hours.
time_limit = 11 * 3600
# For a very short trial run, use instead: time_limit = int(3600 * 0.01)

# Model types excluded from the AutoGluon fit.
excluded_model_types = ['KNN', 'NN_TORCH']

* Available AutoGluon `presets` values: ['best_quality', 'high_quality', 'good_quality', 'medium_quality', 'optimize_for_deployment', 'ignore_text', 'interpretable', 'experimental_best_quality', 'experimental_extreme_quality', 'experimental_zeroshot_hpo', 'experimental_zeroshot_hpo_hybrid']

In [None]:
# Build a regression predictor for CI_HOUR, scored by MAE, and fit it on the
# training frame using the time budget and model exclusions configured above.
predictor = TabularPredictor(
    label='CI_HOUR',
    problem_type='regression',
    eval_metric='mae',
    sample_weight='auto_weight',
).fit(
    train,
    presets="good_quality",
    fit_weighted_ensemble=True,
    excluded_model_types=excluded_model_types,
    num_gpus=2,  # hard-coded GPU count; the runtime must provide two GPUs
    time_limit=time_limit,
)

In [None]:
# predictor.feature_importance(train)

In [None]:
# Predict the target for every row of the test frame with the fitted predictor.
preds = predictor.predict(test)

In [None]:
# Delete everything under /kaggle/working before the submission file is written there.
# NOTE(review): presumably this also removes the models AutoGluon saved during fit — confirm
# that `predictor` is not needed after this point.
!rm -rf /kaggle/working/*

In [None]:
# Attach the predictions to the sample-submission frame.
# NOTE(review): assigning a Series aligns on index labels, so this presumably relies on
# `preds` and `submission` sharing the same default index — confirm.
submission['CI_HOUR']=preds
# Floor the predictions at zero: negative values are replaced by 0.0.
submission['CI_HOUR'] = submission['CI_HOUR'].clip(lower=0.0)
# Write the submission file without the index column.
submission.to_csv('/kaggle/working/submission.csv', index=False)

In [None]:
# Display the final submission frame as the cell output.
submission