선형회귀 개선2, Cross Validation 적용  
mae = 1.1453

## 1. EDA

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import mean_absolute_error

from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [2]:
# 1. 데이터 로드
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")
submission_df = pd.read_csv("data/sample_submission.csv")

# test.csv에서 원본 id 저장
test_ids = test_df["id"].copy()
test_df = test_df.drop(columns=["id"])  # 이후 분석을 위해 삭제

In [3]:
train_df.describe()

Unnamed: 0,id,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
count,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0
mean,7499.5,1.316742,1.023813,0.347326,23.372701,10.104499,5.040622,6.704216,9.9668
std,4330.271354,0.287869,0.237697,0.091335,12.754705,5.691158,2.805236,3.598253,3.238065
min,0.0,0.1875,0.15,0.0,0.056699,0.028349,0.014175,0.042524,1.0
25%,3749.75,1.15,0.8875,0.2875,13.37742,5.69825,2.820775,3.827183,8.0
50%,7499.5,1.375,1.075,0.3625,23.657658,9.879801,4.904464,6.80388,10.0
75%,11249.25,1.5375,1.2,0.4125,32.205032,14.033003,7.002326,9.07184,11.0
max,14999.0,1.95,1.575,0.7,80.101512,48.477645,19.220961,24.564842,29.0


## 2. 데이터 전처리

In [4]:
# 'Sex' 라벨 인코딩
label_encoder = LabelEncoder()
train_df["Sex"] = label_encoder.fit_transform(train_df["Sex"])
test_df["Sex"] = label_encoder.transform(test_df["Sex"])

In [5]:
# Height가 0인 경우 평균값으로 대체
height_mean = train_df.loc[train_df["Height"] > 0, "Height"].mean()
train_df.loc[train_df["Height"] == 0, "Height"] = height_mean
test_df.loc[test_df["Height"] == 0, "Height"] = height_mean

In [6]:
# IQR 기반 이상치 제거 함수 정의 및 적용
def remove_outliers_iqr(df, cols, threshold=1.5):  # threshold=3.0에서 변경
    Q1 = df[cols].quantile(0.25)
    Q3 = df[cols].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - threshold * IQR
    upper_bound = Q3 + threshold * IQR
    return df[~((df[cols] < lower_bound) | (df[cols] > upper_bound)).any(axis=1)]

num_cols = train_df.select_dtypes(include=["float64"]).columns
train_df = remove_outliers_iqr(train_df, num_cols, threshold=1.5)

In [7]:
# 중복 데이터 제거 및 'id' 컬럼 삭제
train_df = train_df.drop_duplicates().drop(columns=["id"])

#### X, y 분리 / train-valid 분리

In [8]:
# X, y 분리
X = train_df.drop(columns=["Age"])
y = train_df["Age"]

In [9]:
# Train/Validation Split (8:2 비율)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# 3. 스케일링
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
test_scaled = scaler.transform(test_df)  # test data도 스케일링 적용

In [11]:
# 4. PCA 적용 (95% 이상의 분산을 설명하는 주성분 선택)
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_valid_pca = pca.transform(X_valid_scaled)
test_pca = pca.transform(test_scaled)  # test data에도 pca 적용

## 3. 모델 학습

In [12]:
def separate_train(df): # 'count' 피쳐를 y로 구분
    X = df.drop(['Age', 'Sex'], axis=1)
    y = df['Age']
    return X, y

In [13]:
# # 5. 선형회귀 1
model = LinearRegression()
model.fit(X_train, y_train)

In [14]:
# 6. 교차검증 (Cross Validation)
from sklearn.model_selection import cross_val_score

# 교차 검증으로 성능 평가
cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='neg_mean_absolute_error')
print(f"Cross-validation MAE: {np.mean(cv_scores)} ± {np.std(cv_scores)}")

Cross-validation MAE: -1.1380660093622723 ± 0.013623053368536756


In [18]:
# 7. 모델 평가
y_prediction = model.predict(X_valid)
mae = mean_absolute_error(y_valid, y_prediction)
print(f"Validation MAE: {mae:.4f}")

Validation MAE: 1.1453


In [16]:
# 8. 테스트 데이터 예측
test_preds = model.predict(test_df)

In [17]:
# 9. 제출 파일 생성 (원래 id 유지)
submission = pd.DataFrame({"id": test_ids, "Age": np.round(test_preds, 3)})
submission.to_csv("download/sample_submission.csv", index=False)
print("sample_submission.csv 파일 생성 완료!")

sample_submission.csv 파일 생성 완료!
