In [1]:
# Mount Google Drive into the Colab filesystem so the data files can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): this repeats the mount from the first cell; when the drive is
# already mounted the call only reports "Drive already mounted". Kept so this
# cell can also be run on its own — confirm whether the duplicate is wanted.
from google.colab import drive
drive.mount('/content/drive')

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_squared_error
from scipy.stats import zscore

# Locations of the competition CSV files on the mounted Drive.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/contest/samsung'
train_path = DATA_DIR + '/train.csv'
test_path = DATA_DIR + '/test.csv'

# Read the training and test tables into DataFrames.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Split the training table into features (X) and target (y); the 'ID' column
# is dropped from the features of both tables.
X_train = train_df.drop(columns=['ID', 'y'])
y_train = train_df['y']

X_test = test_df.drop(columns=['ID'])

# Remove outlier rows using per-column z-scores of the training features.
# nan_policy='omit' computes each column's mean/std over its non-missing values;
# with the default ('propagate') a single NaN turns the whole column's z-scores
# into NaN, which would then reject every row below.
z_scores = np.abs(zscore(X_train.to_numpy(dtype=float), nan_policy='omit'))
threshold = 3  # a value whose |z-score| reaches 3 is treated as an outlier

# Keep a row unless at least one feature is at or beyond the threshold.
# Written as "not any(z >= threshold)" rather than "all(z < threshold)" so that
# NaN z-scores (missing values, or constant columns where std == 0) do not count
# as outliers; on data without NaN z-scores both forms select the same rows.
filtered_entries = ~(z_scores >= threshold).any(axis=1)

# Training set with the outlier rows removed.
X_train_filtered = X_train[filtered_entries]
y_train_filtered = y_train[filtered_entries]

# Candidate hyperparameter values explored by the grid search.
param_grid = dict(
    num_leaves=[31, 61, 91],
    learning_rate=[0.001, 0.01, 0.05],
    n_estimators=[100, 500, 1000],
    max_depth=[-1, 10, 20],
)

# 5-fold cross-validation splitter with shuffling and a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Grid search over param_grid for a GBDT LightGBM regressor, scored by
# negated RMSE on the folds produced by kf.
base_regressor = lgb.LGBMRegressor(boosting_type='gbdt')
grid_search = GridSearchCV(
    base_regressor,
    param_grid,
    scoring='neg_root_mean_squared_error',
    cv=kf,
    n_jobs=-1,
    verbose=1,
)

# Run the grid search on the outlier-filtered training data.
grid_search.fit(X_train_filtered, y_train_filtered)

# Report the best hyperparameter combination found.
print(f"Best parameters: {grid_search.best_params_}")

# Use the best estimator exposed by the search; no additional training is
# performed in this cell.
best_model = grid_search.best_estimator_

# RMSE on the same filtered rows the search was fitted on, i.e. an in-sample
# figure rather than a validation score.
# The square root is taken explicitly instead of passing squared=False, because
# that argument was deprecated in scikit-learn 1.4 and removed in 1.6.
train_pred = best_model.predict(X_train_filtered)
rmse = float(np.sqrt(mean_squared_error(y_train_filtered, train_pred)))
print(f'Training RMSE: {rmse}')

# Predict the target for every row of the test features.
y_pred = best_model.predict(X_test)

# Cut-off at the 67th percentile of the predictions; predictions at or above
# it are flagged as the top 33%.
threshold = np.percentile(y_pred, 67)
top_33_percent_mask = np.greater_equal(y_pred, threshold)

# Report the cut-off value and how many test rows are flagged.
top_33_count = np.count_nonzero(top_33_percent_mask)
print(f"Top 33% threshold: {threshold:.4f}")
print(f"Number of samples in top 33%: {top_33_count}")

# Build a two-column frame: the test IDs plus the predicted y values.
predictions_df = test_df[['ID']].assign(y_pred=y_pred)

# Write the predictions to Drive as CSV (without the index) and confirm the path.
output_path = '/content/drive/MyDrive/Colab Notebooks/contest/samsung/result_Z-score.csv'
predictions_df.to_csv(output_path, index=False)
print(f'Predictions saved to {output_path}')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Fitting 5 folds for each of 81 candidates, totalling 405 fits
