In [2]:
# ============================================
# 1. Imports
# ============================================
import pickle

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [3]:
# 2. Load & Basic Cleaning
# ============================================
# Load the raw listings and clean them in one non-mutating chain:
#   1. discard the unused 'torque' column when the file has one,
#   2. remove rows containing missing values, then duplicate rows,
#   3. reduce the car name to its first word (the brand).
df = (
    pd.read_csv("Cardetails.csv")
    .drop(columns=['torque'], errors='ignore')
    .dropna()
    .drop_duplicates()
    .assign(
        name=lambda cars: cars['name'].map(
            lambda full_name: full_name.split(' ')[0].strip()
        )
    )
)

# Function to clean numeric values from strings
def clean_numeric(value, default=0.0):
    """Extract the leading number from a value such as '23.4 kmpl' or '1248 CC'.

    The value is converted to a string, split on any run of whitespace, and
    the first token is parsed as a float. Leading whitespace and non-space
    separators (tabs, newlines) are tolerated, so ' 74 bhp' yields 74.0
    rather than falling through to the default.

    Args:
        value: Raw cell content; anything accepted by ``str()``.
        default: Returned when no leading number can be parsed (blank text or
            a non-numeric first token). Defaults to 0.0. Note that 0.0 cannot
            be told apart from a genuine zero; pass ``float('nan')`` to mark
            unparseable entries explicitly.

    Returns:
        The parsed float, or ``default`` when parsing fails.
    """
    # split() with no separator drops leading/trailing whitespace and treats
    # any whitespace run as one delimiter; it returns [] for blank input.
    tokens = str(value).split()
    if not tokens:
        return default
    try:
        return float(tokens[0])
    except ValueError:
        return default

# Convert each column to plain floats by parsing the leading number of every value
for unit_col in ('mileage', 'engine', 'max_power'):
    df[unit_col] = df[unit_col].apply(clean_numeric)

In [4]:
# 3. Features & Target
# ============================================
# The sale price is the prediction target; every other column is a model input
y = df['selling_price']
X = df.drop(columns=['selling_price'])

# Features handled as categories; every remaining column of X is treated as numeric
categorical_cols = ['name', 'fuel', 'seller_type', 'transmission', 'owner']
numeric_cols = X.columns[~X.columns.isin(categorical_cols)].tolist()


In [5]:
# 4. Preprocessor + Model Pipeline
# ============================================
# One-hot encode the categorical columns (categories not seen during fitting are
# ignored rather than raising) and forward the numeric columns untouched.
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', 'passthrough', numeric_cols),
])

# Bundle preprocessing with a 300-tree random forest so that fit/predict can be
# called on the raw feature frame; the fixed seed makes training repeatable.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=300, random_state=42)),
])


In [6]:
# 5. Train-Test Split
# ============================================
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
# 6. Train Model
# ============================================
# Fit the preprocessing steps and the regressor on the training rows only
model.fit(X=X_train, y=y_train)



In [8]:
# 7. Evaluate Model
# ============================================
# Predict prices for the held-out rows
y_pred = model.predict(X_test)

# Per-row absolute error relative to the true price (undefined if a true price is 0)
pct_error = np.abs((y_test - y_pred) / y_test)

mape = np.mean(pct_error) * 100
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"R² Score: {r2:.4f} ({r2*100:.2f}%)")
print(f"MAE     : {mae:.2f}")
print(f"RMSE    : {rmse:.2f}")
print(f"MAPE    : {mape:.2f}%")


R² Score: 0.8964 (89.64%)
MAE     : 73954.20
RMSE    : 135107.62
MAPE    : 17.34%


In [9]:
# 8. Example Prediction
# ============================================
# Example input (make sure categories match your dataset)
# One hypothetical listing, built column by column. The keys are the feature
# column names the pipeline selects; categorical values should be ones that
# occur in the dataset.
sample_input = pd.DataFrame({
    'name': ['Toyota'],
    'year': [2022],
    'km_driven': [12000],
    'fuel': ['Diesel'],
    'seller_type': ['Dealer'],
    'transmission': ['Manual'],
    'owner': ['First Owner'],
    'mileage': [18.0],
    'engine': [1498.0],
    'max_power': [100.0],
    'seats': [5.0],
})

predicted_price = model.predict(sample_input)
print(f"Predicted Price for sample input: ₹{predicted_price[0]:,.2f}")


Predicted Price for sample input: ₹896,098.28


In [10]:
# Persist the whole fitted pipeline (preprocessing + regressor) to disk.
# It can be restored later with joblib.load("model.joblib"); only load files
# from trusted sources, because joblib files are pickle-based.
joblib.dump(model, "model.joblib")


['model.joblib']