In [None]:
import argparse
import os
import shutil

import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [None]:
# Notebook configuration: where the training CSV lives and where generated
# files (plots, exported model) are written.
input_data = "../data/raw/nyc_taxi_dataset.csv"
output_dir = "outputs"

# Echo the configured paths, one message per line, so they are visible in the
# cell output (the messages themselves are intentionally kept in Japanese).
lines = [
    f"学習データのパス: {input_data}",
    f"出力フォルダのパス: {output_dir}",
]
print(*lines, sep="\n")

In [None]:
# Enable MLflow automatic logging. `log_models=False` is passed here; the
# fitted model is instead written explicitly with `mlflow.sklearn.save_model`
# in a later cell of this notebook.
mlflow.autolog(log_models=False)

In [None]:
# Read the training CSV configured in `input_data` into a DataFrame.
df = pd.read_csv(filepath_or_buffer=input_data)

In [None]:
# Preview the first five rows as a quick sanity check of the loaded data.
df.head(n=5)

In [None]:
# Build the feature matrix X (every column except the label) and the label
# vector y (the "totalAmount" column).
target_column = "totalAmount"
X = df.drop(columns=[target_column])
y = df[target_column]

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
split_options = {"test_size": 0.30, "random_state": 0}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Record how many rows the model is trained on.
mlflow.log_metric("Train samples", X_train.shape[0])

# Fit an ordinary least-squares linear regression on the training split.
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
# Record how many rows the model is evaluated on.
mlflow.log_metric("Test samples", X_test.shape[0])

# Score the held-out split: predictions, then MSE, RMSE (square root of the
# MSE) and the R^2 coefficient of determination.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Log the accuracy metrics to MLflow, one call per metric, in a fixed order.
for metric_name, metric_value in (("mse", mse), ("rmse", rmse), ("r2", r2)):
    mlflow.log_metric(metric_name, metric_value)

In [None]:
# Make sure the configured output folder exists, and derive the image path from
# `output_dir` so the plot always lands next to the other generated files
# (previously "./outputs" was hard-coded here, independent of the config cell).
os.makedirs(output_dir, exist_ok=True)
plot_path = os.path.join(output_dir, "actuals_vs_predictions.png")

# Scatter the actual vs. predicted values of the test split. The red line plots
# the actual values against themselves (y = x), i.e. where a perfect prediction
# would fall.
fig, ax = plt.subplots(figsize=(10, 7))
ax.scatter(y_test, y_pred)
ax.plot(y_test, y_test, color="r")
ax.set_title("Actual VS Predicted Values (Test set)")
ax.set_xlabel("Actual Values")
ax.set_ylabel("Predicted Values")
fig.savefig(plot_path)

# Attach the saved image to the MLflow run as an artifact.
mlflow.log_artifact(plot_path)

In [None]:
# Export the fitted model in MLflow's scikit-learn format under
# <output_dir>/models.
model_path = os.path.join(output_dir, "models")

# Always hand `save_model` a target that does not exist yet: drop any export
# left over from a previous run so re-running this cell is idempotent.
# NOTE(review): the previous version pre-created `model_path` when it was
# missing; as far as I know older MLflow releases reject any existing target
# path (newer ones only a non-empty one), so the folder is no longer
# pre-created here — confirm against the MLflow version in use.
if Path(model_path).exists():
    shutil.rmtree(model_path)

# Only the parent folder needs to exist; the model folder itself is left for
# `save_model` to create.
os.makedirs(output_dir, exist_ok=True)

mlflow.sklearn.save_model(model, model_path)

In [None]:
# Launch the MLflow UI (optional): uncomment the line below to browse the
# runs recorded under ./mlruns.
#!mlflow ui  --backend-store-uri ./mlruns