In [None]:
!pip install lightgbm shap

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import lightgbm as lgb
import joblib
from google.cloud import bigquery, storage
import shap

In [None]:
# --- Settings (replace the placeholder values to match your environment) ---

# Google Cloud project used for both the BigQuery and the Cloud Storage client.
PROJECT_ID = "PROJECT_ID"

# Cloud Storage bucket that receives the pickled models and the feature list.
GCS_BUCKET_NAME = "GCS_BUCKET_NAME"

# Table reference interpolated into the SELECT statement; it is prefixed
# with the project id.
BQ_TABLE_PATH = f"{PROJECT_ID}.BQ_TABLE_PATH"

In [None]:
# --- Load the data from BigQuery ---
# Every column of the configured table is selected and materialised as a
# pandas DataFrame named `df`.
client = bigquery.Client(project=PROJECT_ID)
sql = f"SELECT * FROM `{BQ_TABLE_PATH}`"
query_job = client.query(sql)
df = query_job.to_dataframe()

In [None]:
# --- Define the target variable and the feature matrix ---
# `math_score` is the value to predict; every remaining column is a feature.
y = df['math_score']
X = df.drop(columns=['math_score'])

In [None]:
# --- Split the data ---
# 20% of the rows are held out for evaluation; the seed is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# --- Preprocessing (one-hot encoding) ---
# The categorical columns and the final feature layout are derived from the
# training split only; the test split is then forced into that same layout.
categorical_cols = X_train.select_dtypes(include=['object']).columns
X_train_encoded = pd.get_dummies(X_train, columns=categorical_cols, drop_first=True)
final_columns = X_train_encoded.columns

# Encode the test split WITHOUT drop_first and align it to the training
# columns. Encoding both splits independently with drop_first=True breaks as
# soon as a category is missing from one split: the frames end up with
# different columns, and drop_first may even discard a different baseline
# level in the test split. After the reindex:
#   * dummy columns seen in training but absent from the test split are 0,
#   * the training baseline level and categories unseen in training are
#     dropped, so such rows are encoded as all-zero (the baseline).
X_test_encoded = pd.get_dummies(X_test, columns=categorical_cols).reindex(
    columns=final_columns, fill_value=0
)

In [None]:
# Containers filled by the training cell, both keyed by model name:
# `results` holds the R^2 score, `models` holds the fitted estimator.
results = {}
models = {}

# Show the encoded feature columns. A bare expression is only rendered when
# it is the last statement of the cell, so it has to come after the
# assignments above.
final_columns

In [None]:
# --- Train and evaluate the models ---
# Every candidate is fitted on the encoded training split and scored with
# R^2 on the encoded test split. Scores are collected in `results` and the
# fitted estimators in `models`, both keyed by model name.
candidates = {
    "LinearRegression": LinearRegression(),
    "LightGBM": lgb.LGBMRegressor(random_state=42),
}
predictions = {}

for model_name, estimator in candidates.items():
    estimator.fit(X_train_encoded, y_train)
    predictions[model_name] = estimator.predict(X_test_encoded)
    results[model_name] = r2_score(y_test, predictions[model_name])
    models[model_name] = estimator

# Keep the estimators and predictions reachable under their individual names
# (`lgb_model` is what the SHAP cell explains).
lr_model, lgb_model = candidates["LinearRegression"], candidates["LightGBM"]
lr_pred, lgb_pred = predictions["LinearRegression"], predictions["LightGBM"]

print("モデル比較結果:", results)

In [None]:
# --- Compute SHAP values (for the LightGBM model) ---
# Build a tree explainer around the fitted LightGBM regressor and evaluate
# it on the encoded test features.
explainer = shap.TreeExplainer(lgb_model)
shap_values = explainer(X_test_encoded)

# Visualisation: summary plot of the SHAP values over the same features.
shap.summary_plot(shap_values, features=X_test_encoded, show=False)

In [None]:
# --- Save the models and the column information ---
storage_client = storage.Client(project=PROJECT_ID)
bucket = storage_client.bucket(GCS_BUCKET_NAME)


def _dump_and_upload(obj, filename):
    """Pickle ``obj`` to the local path ``filename`` and upload that file.

    The file is uploaded to ``models/math_predictor/v1/<filename>`` in
    ``bucket``. Returns the blob handle used for the upload.
    """
    joblib.dump(obj, filename)
    target = bucket.blob(f"models/math_predictor/v1/{filename}")
    target.upload_from_filename(filename)
    return target


# One pickle per fitted model, named after its key in `models`.
for model_name, fitted_model in models.items():
    blob = _dump_and_upload(fitted_model, f"{model_name}.pkl")

# The encoded training column names are stored next to the models.
blob_list = _dump_and_upload(list(final_columns), "feature_list.pkl")

print("モデルと特徴量リストが GCS へアップロードされました")