In [None]:
!pip install numpy

In [None]:
!pip install pandas

In [None]:
!pip install seaborn

In [None]:
!pip install scikit-learn

In [None]:
!pip install xgboost

In [None]:
!pip install sklearn_quantile

In [None]:
!pip install dagshub 'mlflow>=2,<3' --force-reinstall

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import f_classif

from sklearn.ensemble import RandomForestRegressor
from sklearn_quantile import RandomForestQuantileRegressor
from xgboost import XGBRegressor

### Training & Testing

In [None]:
def load_and_preprocess(csv_path, price_to_idr_rate=16000):
    """Load the car price CSV and build a one-hot encoded modelling frame.

    Steps performed:
      1. Read the CSV and drop the ``car_ID`` and ``symboling`` columns.
      2. Lower-case ``CarName``, take its first word as ``brand``, fix a set of
         known brand misspellings, then drop ``CarName``.
      3. Multiply ``price`` by ``price_to_idr_rate`` (conversion to IDR).
      4. One-hot encode every object-dtype column.

    Parameters
    ----------
    csv_path : str or path-like
        Location passed straight to ``pd.read_csv``.
    price_to_idr_rate : int or float, default 16000
        Multiplier applied to the ``price`` column to express it in IDR.
        NOTE(review): the source currency is not stated in the data loading
        code; confirm before changing the rate.

    Returns
    -------
    df_raw : pandas.DataFrame
        Cleaned frame with categorical columns still in their string form.
    df_result : pandas.DataFrame
        Non-object columns of ``df_raw`` followed by the one-hot columns.
    oh_encoder : OneHotEncoder
        The encoder fitted on the categorical columns.
    """
    df_raw = pd.read_csv(csv_path)

    # Drop columns that are not used
    df_raw = df_raw.drop(columns=['car_ID', 'symboling'])

    # Normalise the car name and extract the brand (first space-separated word)
    brand = df_raw['CarName'].str.lower().str.split(' ').str[0]
    df_raw['brand'] = brand.replace({
        'maxda': 'mazda',
        'porcshce': 'porsche',
        'toyouta': 'toyota',
        'vokswagen': 'volkswagen',
        'vw': 'volkswagen'
    })
    df_raw = df_raw.drop(columns=['CarName'])

    # Convert the price to IDR
    df_raw['price'] = df_raw['price'] * price_to_idr_rate

    # Encode the categorical columns
    cat_cols = df_raw.select_dtypes(include='object').columns
    oh_encoder = OneHotEncoder(handle_unknown='ignore')
    oh_encoded = oh_encoder.fit_transform(df_raw[cat_cols])
    df_category_encoded = pd.DataFrame(
        oh_encoded.toarray(),
        columns=oh_encoder.get_feature_names_out(),
        # Reuse the source index so the column-wise concat below aligns row by row.
        index=df_raw.index,
    )

    # Combine numeric and encoded columns
    df_non_category = df_raw.select_dtypes(exclude='object')
    df_result = pd.concat([df_non_category, df_category_encoded], axis=1)

    return df_raw, df_result, oh_encoder

In [None]:
def evaluate_model(model, X_test, y_test):
    """Score a fitted model on a hold-out set.

    Calls ``model.predict(X_test)`` once and compares the result with
    ``y_test``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)`` where rmse is the square root of the mean
        squared error.
    """
    predictions = model.predict(X_test)
    return (
        mean_absolute_error(y_test, predictions),
        np.sqrt(mean_squared_error(y_test, predictions)),
        r2_score(y_test, predictions),
    )

In [None]:
# Raw CSV hosted in the project's GitHub repository.
csv_path = (
    "https://raw.githubusercontent.com/Cyberius8/DeployCamp_CarPricePrediction/"
    "refs/heads/main/data/CarPrice_Assignment.csv"
)
# df_raw keeps the string categories, df is the one-hot encoded frame.
df_raw, df, encoder = load_and_preprocess(csv_path)

In [None]:
from scipy.stats import f_oneway

# One-way ANOVA of price against every categorical (object-dtype) column:
# prices are grouped by category level and the groups are compared.
categories = df_raw.select_dtypes(include='object').columns
cat_dict = {}
for c in categories:
    price_groups = df_raw.groupby(c)["price"].apply(list)
    cat_dict[c] = price_groups

    f_stat, p_val = f_oneway(*price_groups)
    print(
        f"{c} F-statistic: {f_stat}",
        f"{c} p-value: {p_val}",
        "---",
        sep="\n",
    )

In [None]:
# Features: one-hot columns for brand and fuel system plus a fixed set of
# numeric columns.
selected_category_features = [c for c in df.columns if "brand_" in c or "fuelsystem_" in c]
selected_noncategory_features = ['wheelbase', 'carlength', 'carwidth', 'curbweight', 'enginesize', 'boreratio', 'horsepower']
selected_features = selected_category_features + selected_noncategory_features

X = df[selected_features]
y = df['price']

models_name = ["rf", "rfq", "xgb"]

# 5-fold cross-validation of each candidate model; R2, MAE and RMSE are
# collected per fold and averaged.
for m_name in models_name:
    print(f"Model: {m_name}")
    scores_r2 = []
    scores_mae = []
    scores_rmse = []

    if m_name == "rf":
        model = RandomForestRegressor(n_estimators=100, random_state=42)
    elif m_name == "rfq":
        model = RandomForestQuantileRegressor(n_estimators=100, q=[0.05, 0.5, 0.95], random_state=42)
    elif m_name == "xgb":
        model = XGBRegressor(learning_rate=0.05, random_state=42)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
        # KFold.split yields positional indices, so rows are selected with
        # .iloc (label-based .loc is only equivalent on a default RangeIndex).
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)
        if m_name == "rfq":
            # One row per requested quantile: 0.05, 0.5 (scored below), 0.95.
            y_lower, y_pred, y_upper = model.predict(X_test)
        else:
            y_pred = model.predict(X_test)

        r2 = r2_score(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        print(f"Fold {fold}: R2 = {r2:.4f}")
        scores_r2.append(r2)
        scores_mae.append(mae)
        scores_rmse.append(rmse)

    print(f"R2 Mean: {np.mean(scores_r2):.4f}")
    print(f"MAE Mean: {np.mean(scores_mae):.4f}")
    print(f"RMSE Mean: {np.mean(scores_rmse):.4f}")
    print("-------")

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.ticker as ticker

# Fit the quantile forest on the last CV fold inside this cell so that the
# plotted median and the 5%-95% band come from the same model. Relying on
# variables left over from the training loop would mix the last-trained
# model's predictions with the quantile forest's bounds.
plot_kf = KFold(n_splits=5, shuffle=True, random_state=42)
plot_train_idx, plot_test_idx = list(plot_kf.split(X))[-1]
plot_model = RandomForestQuantileRegressor(n_estimators=100, q=[0.05, 0.5, 0.95], random_state=42)
plot_model.fit(X.iloc[plot_train_idx], y.iloc[plot_train_idx])
plot_lower, plot_median, plot_upper = plot_model.predict(X.iloc[plot_test_idx])

# Sort samples by predicted median so the curve is monotonic along the x axis.
sort_idx = np.argsort(plot_median)
y_pred_sorted = plot_median[sort_idx]
y_lower_sorted = plot_lower[sort_idx]
y_upper_sorted = plot_upper[sort_idx]
x = np.arange(len(y_pred_sorted))

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(x, y_pred_sorted, color='blue', label='Prediksi')
ax.fill_between(x, y_lower_sorted, y_upper_sorted,
                color='blue', alpha=0.2, label='Rentang +/-')

# Width of the prediction interval
range_width = y_upper_sorted - y_lower_sorted
ax.plot(x, range_width, color='red', alpha=0.5, linewidth=1, label='Lebar rentang')

# Rupiah formatting on the y axis
formatter = ticker.StrMethodFormatter('Rp {x:,.0f}')
ax.yaxis.set_major_formatter(formatter)

# Labels
ax.set_xlabel('')
ax.set_ylabel('Harga Mobil (Rp)')
ax.set_title('Plot Hasil Prediksi Harga Mobil (Random Forest Quantile)')
ax.legend()
fig.tight_layout()
plt.show()


### MLflow

In [None]:
import mlflow
import mlflow.sklearn
# NOTE(review): the MlflowClient import below is disabled, but a later cell
# calls MlflowClient(); that cell needs the name in scope.
# from mlflow.tracking import MlflowClient


# Manual MLflow setup for DagsHub (currently disabled; a later cell calls
# dagshub.init(..., mlflow=True) instead -- presumably that configures the
# tracking URI, confirm against the dagshub documentation).
# MLFLOW_TRACKING_URI = "https://dagshub.com/Cyberius8/DeployCamp_CarPricePrediction.mlflow"
# mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
# mlflow.set_experiment("Car Price Prediction")

In [None]:
import dagshub
# Initialise the DagsHub integration for this repository with MLflow enabled.
# NOTE(review): this presumably points MLflow tracking at the DagsHub repo and
# may prompt for authentication -- confirm against the dagshub documentation.
dagshub.init(repo_owner='Cyberius8', repo_name='DeployCamp_CarPricePrediction', mlflow=True)

In [None]:
# Verify that MLflow is reading the expected tracking URI by listing the
# experiments visible to the client.
# Imported here because the notebook's other MlflowClient import is commented
# out, which made this cell fail with NameError on a fresh kernel.
from mlflow.tracking import MlflowClient

mlflow_client = MlflowClient()
mlflow_client.search_experiments()

In [None]:
# memastikan experiment berjalan
mlflow.set_experiment("quick-test")

with mlflow.start_run():
    mlflow.log_param("test_param", 123)
    mlflow.log_metric("test_metric", 0.456)

#### Preprocessing

In [None]:
# Reload the dataset for the MLflow section (same source as the earlier load).
csv_path = (
    "https://raw.githubusercontent.com/Cyberius8/DeployCamp_CarPricePrediction/"
    "refs/heads/main/data/CarPrice_Assignment.csv"
)
# df_raw keeps the string categories, df is the one-hot encoded frame.
df_raw, df, encoder = load_and_preprocess(csv_path)

#### Training & Testing

In [None]:
# INITIALIZATION
mlflow.set_experiment("Car Price Prediction")

# Features: one-hot columns for brand and fuel system plus a fixed set of
# numeric columns.
selected_category_features = [c for c in df.columns if "brand_" in c or "fuelsystem_" in c]
selected_noncategory_features = ['wheelbase', 'carlength', 'carwidth', 'curbweight', 'enginesize', 'boreratio', 'horsepower']
selected_features = selected_category_features + selected_noncategory_features

X = df[selected_features]
y = df['price']

models_name = ["RandomForestOneHotEncoding", "QuantileRFOneHotEncoding", "XGBRegressorOneHotEncoding"]

for m_name in models_name:
    print(f"Model: {m_name}")
    scores_r2 = []
    scores_mae = []
    scores_rmse = []

    if m_name == "RandomForestOneHotEncoding":
        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model_params = {"n_estimators": 100}
    elif m_name == "QuantileRFOneHotEncoding":
        model = RandomForestQuantileRegressor(n_estimators=100, q=[0.05, 0.5, 0.95], random_state=42)
        model_params = {"n_estimators": 100, "quantiles": [0.05, 0.5, 0.95]}
    elif m_name == "XGBRegressorOneHotEncoding":
        model = XGBRegressor(learning_rate=0.05, random_state=42)
        model_params = {"learning_rate": 0.05}

    # RUN PER LOOP: one MLflow run per model
    with mlflow.start_run(run_name=m_name) as run:
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
            # KFold.split yields positional indices, so rows are selected with
            # .iloc (label-based .loc is only equivalent on a default RangeIndex).
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            model.fit(X_train, y_train)
            if m_name == "QuantileRFOneHotEncoding":
                # One row per requested quantile: 0.05, 0.5 (scored below), 0.95.
                y_lower, y_pred, y_upper = model.predict(X_test)
            else:
                y_pred = model.predict(X_test)

            r2 = r2_score(y_test, y_pred)
            mae = mean_absolute_error(y_test, y_pred)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))

            print(f"Fold {fold}: R2 = {r2:.4f}")
            scores_r2.append(r2)
            scores_mae.append(mae)
            scores_rmse.append(rmse)

        print(f"R2 Mean: {np.mean(scores_r2):.4f}")
        print(f"MAE Mean: {np.mean(scores_mae):.4f}")
        print(f"RMSE Mean: {np.mean(scores_rmse):.4f}")
        print("-------")

        # LOG PARAMS & CV METRICS
        # Logged before the model upload/registration so the scores are kept
        # even if a later step fails. Each key is paired with its own metric
        # (previously the values were logged under the wrong names).
        mlflow.log_params(model_params)
        mlflow.log_metrics({
            "CV_MAE": np.mean(scores_mae),
            "CV_RMSE": np.mean(scores_rmse),
            "CV_R2": np.mean(scores_r2),
        })

        # LOG MODEL
        # NOTE(review): `model` is the estimator as fitted on the last fold's
        # training split, not on the full dataset -- refit on X, y first if the
        # registered model is meant for serving.
        mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path=m_name
        )

        # REGISTER IN THE MODEL REGISTRY
        model_uri = f"runs:/{run.info.run_id}/{m_name}"
        mlflow.register_model(model_uri, m_name)