## 1. EDA

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import mean_absolute_error

In [3]:
# 1. Load the training data, the test data and the sample submission.
train_df, test_df, submission_df = (
    pd.read_csv(csv_path)
    for csv_path in ("data/train.csv", "data/test.csv", "data/sample_submission.csv")
)

# Keep the original test ids for the submission file, then remove the column
# so it is not treated as a feature in the following steps.
test_ids = test_df["id"].copy()
test_df = test_df.drop(columns="id")

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training columns.
train_df.describe()

Unnamed: 0,id,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
count,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0
mean,7499.5,1.316742,1.023813,0.347326,23.372701,10.104499,5.040622,6.704216,9.9668
std,4330.271354,0.287869,0.237697,0.091335,12.754705,5.691158,2.805236,3.598253,3.238065
min,0.0,0.1875,0.15,0.0,0.056699,0.028349,0.014175,0.042524,1.0
25%,3749.75,1.15,0.8875,0.2875,13.37742,5.69825,2.820775,3.827183,8.0
50%,7499.5,1.375,1.075,0.3625,23.657658,9.879801,4.904464,6.80388,10.0
75%,11249.25,1.5375,1.2,0.4125,32.205032,14.033003,7.002326,9.07184,11.0
max,14999.0,1.95,1.575,0.7,80.101512,48.477645,19.220961,24.564842,29.0


## 2. 데이터 전처리

In [74]:
# Label-encode the 'Sex' column.
# The encoder is fitted on the training column only and then reused for the
# test column, so both frames share the same label -> integer mapping.
label_encoder = LabelEncoder().fit(train_df["Sex"])
train_df["Sex"] = label_encoder.transform(train_df["Sex"])
test_df["Sex"] = label_encoder.transform(test_df["Sex"])

In [75]:
# Replace Height values of exactly 0 with the mean of the positive training heights.
# The same training-derived mean is used for the test frame.
height_mean = train_df.loc[train_df["Height"] > 0, "Height"].mean()
for frame in (train_df, test_df):
    frame.loc[frame["Height"] == 0, "Height"] = height_mean

In [76]:
# Add engineered features to both the training and the test frame.
# Weight-ratio features (Shucked / Viscera / Edible weight over Weight) are
# currently disabled and therefore not computed here.
for frame in (train_df, test_df):
    # Product of the three size measurements.
    frame["Volume"] = frame["Length"] * frame["Diameter"] * frame["Height"]
    # Total weight per unit of that volume.
    # NOTE(review): a zero Volume would give inf here — confirm Length and
    # Diameter are strictly positive in the test data.
    frame["Density"] = frame["Weight"] / frame["Volume"]

---
추가
---

In [77]:
# vif 계산
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Helper that computes the variance inflation factor of every column.
def calculate_vif(df):
    """Return a frame with one row per column of ``df`` and its VIF.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix; its values are passed unchanged to
        ``variance_inflation_factor``.

    Returns
    -------
    pandas.DataFrame
        Columns ``"Feature"`` (column name) and ``"VIF"`` (its VIF), in the
        same order as ``df.columns``.

    NOTE(review): no intercept column is added before the VIFs are computed —
    confirm whether a constant term should be included in the design matrix.
    """
    design = df.values
    scores = [
        variance_inflation_factor(design, col_idx)
        for col_idx in range(design.shape[1])
    ]
    return pd.DataFrame({"Feature": df.columns, "VIF": scores})

---

In [78]:
# IQR-based outlier removal helper (applied to the training data below).
def remove_outliers_iqr(df, cols, threshold=3.0):
    """Drop every row that has an IQR outlier in at least one of ``cols``.

    For each column in ``cols`` the fences are ``Q1 - threshold * IQR`` and
    ``Q3 + threshold * IQR``; a row is removed when any of its values lies
    strictly outside its column's fences.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    cols : list-like of column labels
        Columns checked for outliers.
    threshold : float, default 3.0
        Multiplier applied to the IQR when building the fences.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` without outliers, with the original index labels.
    """
    subset = df[cols]
    q1, q3 = subset.quantile(0.25), subset.quantile(0.75)
    fence = threshold * (q3 - q1)
    below = subset < (q1 - fence)
    above = subset > (q3 + fence)
    # Comparisons against NaN are False, so rows with missing values are kept.
    outlier_rows = (below | above).any(axis=1)
    return df.loc[~outlier_rows]

# Filter the training rows using every float64 column (this includes the
# engineered Volume and Density columns added earlier).
num_cols = train_df.select_dtypes(include="float64").columns
train_df = remove_outliers_iqr(df=train_df, cols=num_cols, threshold=3.0)

In [79]:
# Drop the 'id' column first, then remove duplicate rows.
# 'id' looks like a unique row identifier (0..14999 in the describe() output),
# so de-duplicating while it is still present could never match two rows.
train_df = train_df.drop(columns=["id"]).drop_duplicates()

#### X, y 분리 / train-valid 분리

In [80]:
# Separate the features from the target column.
X = train_df.drop(columns=["Age"])
y = train_df["Age"]

# Hold out 10% of the rows for validation (9:1 train/validation split).
# The ratio is a named constant so the comment and the code cannot drift apart.
VALID_SIZE = 0.1
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=VALID_SIZE, random_state=42
)


---
추가
---

In [81]:
# train_df.drop(["Length", "Diameter", "Height", "Weight","Volume", "Shucked Weight", "Viscera Weight"], axis=1, inplace=True)

---

In [82]:
# 3. Scaling: fit the Min-Max scaler on the training split only, then apply
# the fitted transform to both the training and the validation split.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

In [83]:
# Rebuild the full feature matrix and target from the cleaned training frame.
X = train_df.drop(columns=["Age"])
y = train_df["Age"]

# Multicollinearity check: repeatedly drop the feature with the largest VIF
# until every remaining VIF is below the threshold.
# NOTE(review): the reduced X is only printed below; X_train / X_valid were
# split from the full feature set and are what the model is fitted on —
# confirm whether this selection is meant to be applied to the model inputs.
vif_threshold = 10  # commonly 5-10 is used as the cut-off

# Stop once a single feature is left: a VIF describes collinearity with the
# other features, so there is nothing left to compare against.
while X.shape[1] > 1:
    vif_df = calculate_vif(X)
    max_vif = vif_df["VIF"].max()

    if max_vif < vif_threshold:
        break  # every remaining feature is below the threshold

    # Remove the feature with the highest VIF (first one on ties).
    remove_feature = vif_df.loc[vif_df["VIF"].idxmax(), "Feature"]
    print(f"Removing feature due to high VIF ({max_vif:.2f}): {remove_feature}")
    X = X.drop(columns=[remove_feature])

# Show the features that survived the VIF check.
print("Final Features after VIF check:", X.columns.tolist())

Removing feature due to high VIF (1427.93): Length
Removing feature due to high VIF (429.01): Weight
Removing feature due to high VIF (228.23): Diameter
Removing feature due to high VIF (214.31): Volume
Removing feature due to high VIF (62.77): Viscera Weight
Removing feature due to high VIF (57.63): Height
Removing feature due to high VIF (25.47): Shell Weight
Final Features after VIF check: ['Sex', 'Shucked Weight', 'Density']


## 3. 모델 학습

In [None]:
# 4. Grid search for the best hyperparameters, scored by negated MAE with
# K-fold cross-validation.
# NOTE(review): the notes at the end of this notebook state "CV = 5", but this
# cell runs 10 folds — confirm which value is intended. The fold count is a
# named constant so the comment and the code cannot disagree.
CV_FOLDS = 10

# 2 * 4 * 3 * 4 = 96 parameter combinations, each fitted CV_FOLDS times.
param_grid = {
    "n_estimators": [200, 500],
    "max_depth": [5, 10, 15, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [4, 6, 8, 10]
}

rf = RandomForestRegressor(random_state=42, n_jobs=-1)
grid_search = GridSearchCV(
    rf,
    param_grid,
    cv=CV_FOLDS,
    scoring="neg_mean_absolute_error",
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train_scaled, y_train)

best_params = grid_search.best_params_
print("Best Parameters:", best_params)

In [11]:
# 5. Train a random forest with the best hyperparameters from the grid search.
best_rf = RandomForestRegressor(random_state=42, n_jobs=-1, **best_params)
best_rf.fit(X_train_scaled, y_train)

#### 특성(변수) 선택

In [20]:
# 6. Feature-importance analysis: collect the features whose importance is
# below the threshold so the next cell can drop them. This cell must define
# `low_importance_features`; without it the following cell raises NameError.
# NOTE(review): a 0.05 cut-off was also drafted for this cell; 0.01 is used
# because it matches this cell's recorded output — confirm the intended value.
IMPORTANCE_THRESHOLD = 0.01

feature_importances = best_rf.feature_importances_
# Label the importances with the columns the model was fitted on (X_train),
# not with X, which the VIF loop has reduced to a different set of columns.
feature_names = X_train.columns
assert len(feature_names) == len(feature_importances), (
    "best_rf was fitted on a different number of features than X_train has"
)

low_importance_features = [
    name
    for name, importance in zip(feature_names, feature_importances)
    if importance < IMPORTANCE_THRESHOLD
]
print(f"Removing Features with Importance < {IMPORTANCE_THRESHOLD}:", low_importance_features)

Removing Features with Importance < 0.01: []


In [None]:
# Remove the low-importance features from every frame that is fed to the model,
# so training, validation and test data keep the same columns.
X_train = X_train.drop(low_importance_features, axis="columns")
X_valid = X_valid.drop(low_importance_features, axis="columns")
test_df = test_df.drop(low_importance_features, axis="columns")

In [14]:
# Re-fit the scaler on the (possibly reduced) training features, then apply
# the same fitted transform to the training, validation and test data.
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
test_scaled = scaler.transform(test_df)

In [15]:
# 7. Refit the model on the re-scaled training data and report the mean
# absolute error on the validation split.
best_rf.fit(X_train_scaled, y_train)
y_pred = best_rf.predict(X_valid_scaled)
mae = mean_absolute_error(y_true=y_valid, y_pred=y_pred)
print(f"Validation MAE: {mae:.4f}")

Validation MAE: 1.2804


In [16]:
# 8. Predict the target for the scaled test data.
test_preds = best_rf.predict(test_scaled)

In [None]:
# 9. Build the submission file, keeping the original test ids.
submission = pd.DataFrame({"id": test_ids, "Age": np.round(test_preds, 3)})

# Create the output directory first so a missing folder does not fail the
# write after the whole pipeline has already run.
submission_path = Path("download/sample_submission.csv")
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)
print("sample_submission.csv 파일 생성 완료!")


- CV = 5 고정 (주의: 현재 GridSearchCV 셀은 10-fold 교차 검증으로 실행되도록 작성되어 있어 이 항목과 일치하지 않음 — 확인 필요)
- 정규화: minmaxScaler
- 그리드서치 하이퍼파라미터 범위 수정
    - `"n_estimators"`: [200, 500]
    - `"max_depth"` : [5, 10, 15, 20]
    - `"min_samples_split"` : [2, 5, 10]
    - `"min_samples_leaf"` : [4, 6, 8, 10]
- 이상치 제거 기준 완화 (1.5 -> 3.0)
- 캐글 제출 파일 형식 조정
- train/validation split 비율 조정 (0.2 -> 0.1)