In [2]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBRegressor

In [12]:
# Input/output locations, given relative to the notebook's working directory.
OUTPUT_MODEL_PATH = '../output/models/model.joblib'
DATA_PATH = '../data/processed/procssed.csv'

# Read the CSV into the frame used by the rest of the notebook.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Summary statistics of the numeric columns, shown as the cell output.
df.describe()

Unnamed: 0,Engine_Size,Mileage,Negotiable,Car_Age,mileage_par_age,Make_Aston Martin,Make_Audi,Make_BMW,Make_BYD,Make_Bentley,...,Region_Jeddah,Region_Jubail,Region_Khobar,Region_Makkah,Region_Qassim,Region_Rare,Region_Riyadh,Region_Tabouk,Region_Taef,Price
count,8035.0,8035.0,8035.0,8035.0,8035.0,8035.0,8035.0,8035.0,8035.0,8035.0,...,8035.0,8035.0,8035.0,8035.0,8035.0,8035.0,8035.0,8035.0,8035.0,8035.0
mean,-7.781924e-17,3.466493e-16,-1.414895e-17,1.414895e-17,3.537238e-18,0.000373,0.006472,0.016428,0.000498,0.000871,...,0.131176,0.011325,0.01145,0.028251,0.038457,0.045302,0.402862,0.016179,0.01705,70720.712988
std,1.000062,1.000062,1.000062,1.000062,1.000062,0.01932,0.080191,0.127123,0.022308,0.029505,...,0.337614,0.105823,0.106396,0.165701,0.192308,0.207978,0.490504,0.126172,0.129467,54088.861711
min,-1.507413,-1.371595,-0.6771428,-1.372674,-1.77955,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2659.06
25%,-0.8485003,-0.6816147,-0.6771428,-0.6778515,-0.6315726,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,45000.0
50%,-0.189588,-0.2236932,-0.6771428,-0.3304404,-0.08280287,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,58000.0
75%,0.7328893,0.490561,1.476793,0.3643817,0.5189421,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,73750.0
max,3.763886,3.148166,1.476793,8.875952,10.45521,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,340000.0


In [4]:
# "Price" is the regression target; every other column is used as a feature.
y = df["Price"]
X = df.drop(columns="Price")

# Hold out 25% of the rows for the final evaluation; the fixed seed keeps
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [5]:
# Base estimator whose hyper-parameters are searched; seeded for repeatability.
rf = RandomForestRegressor(random_state=42)

# 2 x 2 x 2 = 8 candidate parameter combinations.
param_grid = dict(
    n_estimators=[100, 200],
    max_features=[0.8, 1.0],
    min_samples_leaf=[1, 2],
)

# 3-fold cross-validated search scored with negated MSE, using all CPU cores.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Keep the estimator the search reports as best for evaluation and export.
best_rf_model = grid_search.best_estimator_

Fitting 3 folds for each of 8 candidates, totalling 24 fits


In [10]:
import numpy as np

def evaluate_model(model, X_test, y_test):
    """Score a fitted regressor on held-out data.

    Args:
        model: Fitted estimator exposing ``predict(X)``.
        X_test: Features passed to ``model.predict``.
        y_test: True target values to compare the predictions against.

    Returns:
        tuple: ``(metrics, y_pred)`` where ``metrics`` is a dict with the keys
        ``'MSE'``, ``'rmse'`` and ``'R2'`` and ``y_pred`` is the value returned
        by ``model.predict(X_test)``.
    """
    y_pred = model.predict(X_test)

    # Compute the MSE a single time; the RMSE is just its square root, so
    # there is no need to run mean_squared_error over the data twice.
    mse = mean_squared_error(y_test, y_pred)

    metrics = {
        'MSE': mse,
        'rmse': np.sqrt(mse),
        'R2': r2_score(y_test, y_pred),
    }

    return metrics, y_pred

def print_metrics(metrics):
    """Print the evaluation metrics as a short report.

    Writes a header line followed by one line per metric, each value
    formatted with four decimal places. The printed labels are in Arabic.

    Args:
        metrics: Mapping holding numeric values under the keys ``'MSE'``,
            ``'rmse'`` and ``'R2'``.
    """
    print("نتائج تقييم النموذج:")
    # Fixed key order so the report always lists MSE, then rmse, then R2.
    for key in ('MSE', 'rmse', 'R2'):
        value = metrics[key]
        print(f"الدقة ({key}): {value:.4f}")


In [11]:
# Score the tuned random forest on the held-out split and print the report.
metrics, y_pred = evaluate_model(
    model=best_rf_model,
    X_test=X_test,
    y_test=y_test,
)
print_metrics(metrics)

نتائج تقييم النموذج:
الدقة (MSE): 587327140.7422
الدقة (rmse): 24234.8332
الدقة (R2): 0.8161


In [13]:
# joblib.dump does not create missing folders, so make sure the target
# directory exists before writing (no-op when it is already there).
Path(OUTPUT_MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)

# Persist the tuned model; the returned list of written files is the cell output.
joblib.dump(best_rf_model, OUTPUT_MODEL_PATH)

['../output/models/model.joblib']