In [None]:
# 라이브러리 불러오기

from datasets import load_dataset
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
print("✅ 라이브러리 불러오기 완료")

In [None]:

# Data loading: reuse the cached, preprocessed frame when it exists on disk;
# otherwise download the dataset from the HuggingFace Hub, preprocess it, and
# cache the result for later runs.
import os
import pandas as pd
from datasets import load_dataset
from sklearn.preprocessing import LabelEncoder

data_path = "./data/student_data.pkl"

if not os.path.exists(data_path):
    print("🔄 HuggingFace에서 데이터를 처음 다운로드합니다...")
    dataset = load_dataset("neuralsorcerer/student-performance")
    df = dataset["train"].to_pandas()

    # Keep only the columns used below and drop rows with missing values.
    cols = ["Gender", "ParentalEducation", "TestScore_Math", "TestScore_Reading", "TestScore_Science"]
    df = df[cols].dropna()

    # Encode the categorical columns as integer codes.
    le_gender = LabelEncoder()
    le_parent = LabelEncoder()
    df["Gender"] = le_gender.fit_transform(df["Gender"])
    df["ParentalEducation"] = le_parent.fit_transform(df["ParentalEducation"])

    # Create the cache directory first: to_pickle does not create missing
    # parent directories, so a fresh checkout would otherwise fail here.
    os.makedirs(os.path.dirname(data_path), exist_ok=True)
    df.to_pickle(data_path)
    print(f"✅ 전처리 완료 및 저장됨: {data_path}")
else:
    print(f"📁 기존 전처리된 데이터를 불러옵니다: {data_path}")
    # NOTE: unpickling can execute arbitrary code; only load cache files that
    # this notebook wrote itself.
    df = pd.read_pickle(data_path)

df.head()


In [None]:
# Label-encode the nominal columns in place, one encoder per column.
# NOTE(review): the loading cell appears to have label-encoded these columns
# already, so this pass likely re-fits on integer codes — confirm whether this
# cell is still needed.
le_gender, le_parent = LabelEncoder(), LabelEncoder()
for column, encoder in (("Gender", le_gender), ("ParentalEducation", le_parent)):
    df[column] = encoder.fit_transform(df[column])
df.head()

In [None]:

# Train/test split: reuse the cached split only when all four pickle files
# exist; otherwise split the data and write all four files.
import os

split_dir = "./data"
X_train_path = os.path.join(split_dir, "X_train.pkl")
X_test_path = os.path.join(split_dir, "X_test.pkl")
y_train_path = os.path.join(split_dir, "y_train.pkl")
y_test_path = os.path.join(split_dir, "y_test.pkl")
split_paths = [X_train_path, X_test_path, y_train_path, y_test_path]

# Checking every file (not just X_train) means a partially written cache is
# regenerated instead of failing in the reuse branch.
if not all(os.path.exists(path) for path in split_paths):
    from sklearn.model_selection import train_test_split
    X = df[["Gender", "ParentalEducation"]]
    y = df["TestScore_Math"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # to_pickle does not create missing parent directories.
    os.makedirs(split_dir, exist_ok=True)
    X_train.to_pickle(X_train_path)
    X_test.to_pickle(X_test_path)
    y_train.to_pickle(y_train_path)
    y_test.to_pickle(y_test_path)
    print("✅ 데이터 분할 및 저장 완료")
else:
    # NOTE: unpickling can execute arbitrary code; only load cache files that
    # this notebook wrote itself.
    X_train = pd.read_pickle(X_train_path)
    X_test = pd.read_pickle(X_test_path)
    y_train = pd.read_pickle(y_train_path)
    y_test = pd.read_pickle(y_test_path)
    print("📁 분할된 데이터를 재사용합니다.")


In [None]:
# Mean of each test score within every (encoded) gender group.
score_columns = ["TestScore_Math", "TestScore_Reading", "TestScore_Science"]
df.groupby("Gender")[score_columns].mean()

In [None]:
# Box plot of the math score distribution for each encoded gender value.
fig, ax = plt.subplots()
sns.boxplot(data=df, x="Gender", y="TestScore_Math", ax=ax)
ax.set_title("성별에 따른 수학 점수 분포")
ax.set_xlabel("Gender (0: Female, 1: Male)")
ax.set_ylabel("TestScore_Math")
fig.tight_layout()
plt.show()

In [None]:
# Fit a linear regression of the math score on the two encoded categorical
# features and predict on the held-out 20% of rows.
# NOTE(review): this recomputes the split and overwrites any X_train/X_test/
# y_train/y_test loaded by the caching cell — confirm which one is intended.
feature_columns = ["Gender", "ParentalEducation"]
X, y = df[feature_columns], df["TestScore_Math"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Report RMSE and R² on the test split.
# RMSE is taken as the square root of the MSE instead of passing
# `squared=False`: that argument was deprecated in scikit-learn 1.4 and removed
# in 1.6, where it raises a TypeError.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print("RMSE:", rmse)
print("R²:", r2_score(y_test, y_pred))

In [None]:
# Scatter of predicted vs. actual math scores; the dashed red diagonal spans
# the observed score range and marks where prediction equals the actual value.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
score_min, score_max = y_test.min(), y_test.max()
ax.plot([score_min, score_max], [score_min, score_max], "r--")
ax.set_xlabel("실제 수학 점수")
ax.set_ylabel("예측된 수학 점수")
ax.set_title("예측 vs 실제 수학 점수")
fig.tight_layout()
plt.show()