In [10]:
from pathlib import Path

import seaborn as sns

# Load seaborn's "diamonds" example dataset; `diamonds` stays in the kernel
# namespace so later cells can inspect it.
diamonds = sns.load_dataset("diamonds")

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing the raw CSV.
Path("data").mkdir(parents=True, exist_ok=True)
diamonds.to_csv("data/diamonds.csv", index=False)

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler


def preprocess_data(
    data_path,
    test_size=0.2,
    target_name="price",
    train_path="data/train.csv",
    test_path="data/test.csv",
    random_state=42,
):
    """
    Loads data, splits into train/test, standardizes numeric features and
    ordinal-encodes the remaining (categorical) features, then saves the
    preprocessed features together with the target as CSV files.

    Both transformers are fitted on the training split only and merely applied
    to the test split, so no test-set statistics leak into preprocessing.
    In the written CSVs the numeric columns come first, then the categorical
    columns, then the target.

    Args:
        data_path: Path to the CSV data file.
        test_size: Proportion of data for the test set (default: 0.2).
        target_name: Name of the target column (default: "price").
        train_path: Where to write the training CSV (default: "data/train.csv").
        test_path: Where to write the testing CSV (default: "data/test.csv").
        random_state: Seed for the train/test split (default: 42).

    Raises:
        KeyError: If `target_name` is not a column of the loaded data.
    """
    # Function-scope import keeps this definition self-contained within its cell.
    from pathlib import Path

    # Read data
    data = pd.read_csv(data_path)
    if target_name not in data.columns:
        raise KeyError(f"Target column {target_name!r} not found in {data_path}")

    # Separate features and target (target kept as a one-column DataFrame)
    features = data.drop(columns=[target_name])
    target = data[[target_name]]

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    # Every numeric dtype is scaled; everything else is treated as categorical.
    numeric_features = list(X_train.select_dtypes(include="number").columns)
    categorical_features = [
        col for col in X_train.columns if col not in numeric_features
    ]

    # NOTE(review): OrdinalEncoder is used with default settings, so the integer
    # codes presumably follow sorted category values rather than any domain
    # ordering — confirm whether explicit `categories` are wanted.
    stages = [
        (numeric_features, Pipeline([("scaler", StandardScaler())])),
        (categorical_features, Pipeline([("ordinal", OrdinalEncoder())])),
    ]

    def _transform(split, fit):
        """Run each non-empty column group through its pipeline and join them."""
        parts = []
        for columns, pipeline in stages:
            if not columns:
                # A transformer cannot be fitted on zero columns; skip the group.
                continue
            run = pipeline.fit_transform if fit else pipeline.transform
            parts.append(pd.DataFrame(run(split[columns]), columns=columns))
        return pd.concat(parts, axis=1).reset_index(drop=True)

    # Fit on the training split, then reuse the fitted pipelines on the test split
    X_train_preprocessed = _transform(X_train, fit=True)
    X_test_preprocessed = _transform(X_test, fit=False)

    # Combine features and target into single dataframes; the target index is
    # reset so rows align positionally with the re-indexed features.
    train_data = pd.concat(
        [X_train_preprocessed, y_train.reset_index(drop=True)], axis=1
    )
    test_data = pd.concat([X_test_preprocessed, y_test.reset_index(drop=True)], axis=1)

    # Save preprocessed data with targets, creating output folders if needed
    for frame, destination in ((train_data, train_path), (test_data, test_path)):
        Path(destination).parent.mkdir(parents=True, exist_ok=True)
        frame.to_csv(destination, index=False)


# Preprocess the raw diamonds CSV, using the function's default split settings
data_path = "data/diamonds.csv"
preprocess_data(data_path=data_path)

print("Preprocessing complete! Train and test data with targets saved as CSV files.")

Preprocessing complete! Train and test data with targets saved as CSV files.


In [11]:
# Sanity check: (rows, columns) of the raw dataset held in `diamonds`
diamonds.shape

(53940, 10)

In [2]:
import json

import pandas as pd
from joblib import dump
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


def train_evaluate_save(
    train_data_path="data/train.csv",
    test_data_path="data/test.csv",
    target_name="price",
    model_path="models/model.joblib",
    metrics_path="metrics.json",
    random_state=42,
):
    """
    Loads train/test data, trains an SGD regressor, evaluates it on the test
    set, and saves the model and the metrics (RMSE) as JSON.

    The input CSVs are used as-is: no preprocessing happens here, so they are
    expected to contain only numeric feature columns plus the target.

    Args:
        train_data_path: Path to training data CSV (default: "data/train.csv").
        test_data_path: Path to testing data CSV (default: "data/test.csv").
        target_name: Name of the target column (default: "price").
        model_path: Path to save the trained model (default: "models/model.joblib").
        metrics_path: Path to save the evaluation metrics (default: "metrics.json").
        random_state: Seed for the SGD optimizer so repeated runs report the
            same RMSE (default: 42).
    """
    # Function-scope import keeps this definition self-contained within its cell.
    from pathlib import Path

    # Load data
    train_data = pd.read_csv(train_data_path)
    test_data = pd.read_csv(test_data_path)

    # Separate features and target
    X_train = train_data.drop(columns=[target_name])
    y_train = train_data[target_name]
    X_test = test_data.drop(columns=[target_name])
    y_test = test_data[target_name]

    # Create the SGD regressor; seeding it makes training reproducible
    model = SGDRegressor(loss="squared_error", random_state=random_state)

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions on test set
    y_pred = model.predict(X_test)

    # Calculate RMSE as sqrt(MSE). The `squared=False` shortcut is deprecated
    # since scikit-learn 1.4 and slated for removal in 1.6, so it is avoided.
    # float() stores a plain Python number in the metrics dict.
    rmse = float(mean_squared_error(y_test, y_pred) ** 0.5)

    # Neither joblib.dump nor open() creates missing folders
    for output_path in (model_path, metrics_path):
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    # Save the model
    dump(model, model_path)

    # Save metrics as JSON
    metrics = {"rmse": rmse}
    with open(metrics_path, "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=4)

    print(f"Model trained and saved to: {model_path}")
    print(f"Test set RMSE: {rmse:.4f}")
    print(f"Metrics saved to: {metrics_path}")


# Run training/evaluation/saving with the default file paths
train_evaluate_save()

Model trained and saved to: models/model.joblib
Test set RMSE: 1357.0203
Metrics saved to: metrics.json
