In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from google.cloud import bigquery
import joblib
from google.cloud import storage

In [None]:
# Configuration values (change these to match your environment).
# Google Cloud project passed to the BigQuery and Cloud Storage clients.
PROJECT_ID = "PROJECT_ID"
# Table reference interpolated into the SELECT statement; built from PROJECT_ID.
BQ_TABLE_PATH = f"{PROJECT_ID}.DATASET_NAME.TABLE_NAME"
# Cloud Storage bucket that receives the saved artifacts.
GCS_BUCKET_NAME = "GCS_BUCKET_NAME"
# gs:// location shown in the final upload message.
MODEL_GCS_PATH = f"gs://{GCS_BUCKET_NAME}/models/math_predictor/v1/"
# Local file name of the serialized model; also used as the object's base name in the bucket.
MODEL_FILENAME = "math_score_model.pkl"

In [None]:
# 1. Load the data from BigQuery
# Select every column of the configured table.
sql = f"SELECT * FROM `{BQ_TABLE_PATH}`"
client = bigquery.Client(project=PROJECT_ID)
# Run the query, then materialize the result as a pandas DataFrame.
query_job = client.query(sql)
df = query_job.to_dataframe()

In [None]:
# 2. Define the features and the target variable
# Target: the 'math_score' column.
y = df['math_score']
# Features: every remaining column of the loaded frame.
X = df.drop(columns='math_score')

In [None]:
# 3. Split the data
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# 4. Preprocessing (one-hot encoding)
# Columns with object dtype in the training split are treated as categorical.
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Training split: the first level of each categorical column is dropped and
# acts as the baseline.
X_train_encoded = pd.get_dummies(X_train, columns=categorical_cols, drop_first=True)

# The training columns define the feature layout the model is fitted on
# (saved later so the same layout can be rebuilt).
final_columns = X_train_encoded.columns

# Test split: encoding it independently with drop_first=True can yield a
# different column set (a level missing from one split) or drop a different
# baseline level, which breaks or silently misaligns model.predict().
# Instead, encode every level present in the test split and then align to the
# training layout: columns the training split never produced are discarded,
# and training columns absent from the test split are added as all-zero.
X_test_encoded = pd.get_dummies(
    X_test, columns=categorical_cols, drop_first=False
).reindex(columns=final_columns, fill_value=0)

In [None]:
# 5. Train and evaluate the model
# fit() returns the fitted estimator, so construction and fitting are chained.
model = LinearRegression().fit(X_train_encoded, y_train)

# R^2 on the data the model was fitted on.
y_pred_train = model.predict(X_train_encoded)
train_r2 = r2_score(y_train, y_pred_train)

# R^2 on the held-out split.
y_pred_test = model.predict(X_test_encoded)
test_r2 = r2_score(y_test, y_pred_test)

print(f"train_R2 スコア: {train_r2:.4f}")
print(f"test_R2 スコア: {test_r2:.4f}")

In [None]:
# --- 6. Save the model and column information ---
# Local file name (and object base name) for the list of encoded feature columns.
FEATURE_LIST_FILENAME = 'feature_list.pkl'

# Derive the object prefix from MODEL_GCS_PATH instead of repeating the path
# literal, so the files are uploaded to the same location the final message
# reports. Fail before any upload if the configured path is not in the bucket.
bucket_uri_prefix = f"gs://{GCS_BUCKET_NAME}/"
if not MODEL_GCS_PATH.startswith(bucket_uri_prefix):
    raise ValueError(
        f"MODEL_GCS_PATH ({MODEL_GCS_PATH!r}) must start with {bucket_uri_prefix!r}"
    )
gcs_prefix = MODEL_GCS_PATH[len(bucket_uri_prefix):].strip("/")
# Avoid a leading "/" in object names when the path points at the bucket root.
object_prefix = f"{gcs_prefix}/" if gcs_prefix else ""

# Serialize the fitted model and the training column layout to local files.
joblib.dump(model, MODEL_FILENAME)
feature_list = list(final_columns)
joblib.dump(feature_list, FEATURE_LIST_FILENAME)

# Upload both files to the bucket under the derived prefix.
storage_client = storage.Client(project=PROJECT_ID)
bucket = storage_client.bucket(GCS_BUCKET_NAME)
blob = bucket.blob(f"{object_prefix}{MODEL_FILENAME}")
blob.upload_from_filename(MODEL_FILENAME)
blob_list = bucket.blob(f"{object_prefix}{FEATURE_LIST_FILENAME}")
blob_list.upload_from_filename(FEATURE_LIST_FILENAME)

print(f"モデルと特徴量リストが GCS へアップロードされました: {MODEL_GCS_PATH}")