<a href="https://colab.research.google.com/github/Chikati2001/Car-price-prediction/blob/main/car_price_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [2]:
train_df = pd.read_csv("train.csv")
test_df  = pd.read_csv("test.csv")

print("Train shape:", train_df.shape)
print("Test shape :", test_df.shape)


Train shape: (19237, 18)
Test shape : (8245, 18)


In [3]:
def clean_mileage(x):
    return int(str(x).replace(" km", "").replace(",", ""))

def clean_engine(x):
    if "Turbo" in str(x):
        return float(str(x).split()[0])
    return float(x)

def clean_levy(x):
    if x == "-":
        return np.nan
    return float(x)


In [4]:
for df in [train_df, test_df]:
    df["Mileage"] = df["Mileage"].apply(clean_mileage)
    df["Engine volume"] = df["Engine volume"].apply(clean_engine)
    df["Levy"] = df["Levy"].apply(clean_levy)


In [5]:
train_df.drop(columns=["ID"], inplace=True)
test_ids = test_df["ID"]
test_df.drop(columns=["ID"], inplace=True)


In [6]:
num_cols = train_df.select_dtypes(include=["int64","float64"]).columns
cat_cols = train_df.select_dtypes(include=["object"]).columns

for col in num_cols:
    median = train_df[col].median()
    train_df[col].fillna(median, inplace=True)
    test_df[col].fillna(median, inplace=True)

for col in cat_cols:
    mode = train_df[col].mode()[0]
    train_df[col].fillna(mode, inplace=True)
    test_df[col].fillna(mode, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df[col].fillna(median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df[col].fillna(median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always b

In [7]:
X_train = train_df.drop("Price", axis=1)
y_train = train_df["Price"]

In [8]:
numeric_features = X_train.select_dtypes(include=["int64","float64"]).columns
categorical_features = X_train.select_dtypes(include=["object"]).columns

preprocessor = ColumnTransformer(
    transformers=[
        ("num", RobustScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ]
)


In [9]:
lr_model = Pipeline([
    ("preprocess", preprocessor),
    ("model", LinearRegression())
])

lr_model.fit(X_train, y_train)


In [10]:
rf_model = Pipeline([
    ("preprocess", preprocessor),
    ("model", RandomForestRegressor(
        n_estimators=300,
        max_depth=15,
        random_state=42,
        n_jobs=-1
    ))
])

rf_model.fit(X_train, y_train)


In [11]:
# 1. Linear Regression Prediction
test_lr = lr_model.predict(test_df)

# 2. Random Forest Prediction
test_rf = rf_model.predict(test_df)

# 3. Ensemble Prediction
test_ensemble = (test_lr + test_rf) / 2

# 4. Conservative Price (Lower Bound)
test_conservative = test_ensemble * 0.95


In [12]:
final_predictions = pd.DataFrame({
    "ID": test_ids,
    "Linear_Price": test_lr,
    "RandomForest_Price": test_rf,
    "Ensemble_Price": test_ensemble,
    "Conservative_Price": test_conservative
})

final_predictions.head()


Unnamed: 0,ID,Linear_Price,RandomForest_Price,Ensemble_Price,Conservative_Price
0,44020629,49941.942416,17624.205028,33783.073722,32093.920036
1,45784798,25448.428607,16214.433344,20831.430975,19789.859426
2,45800257,2079.752624,7402.562647,4741.157635,4504.099753
3,45797981,31978.594982,5872.593382,18925.594182,17979.314473
4,45814303,-2921.526305,3958.649596,518.561645,492.633563


In [13]:
def predict_car_price(car_features: dict):
    """
    car_features: dictionary with the same keys as training features
    returns: price predictions from different models
    """
    input_df = pd.DataFrame([car_features])

    lr_price = lr_model.predict(input_df)[0]
    rf_price = rf_model.predict(input_df)[0]
    ensemble_price = (lr_price + rf_price) / 2

    return {
        "Linear_Regression_Price": round(lr_price, 2),
        "Random_Forest_Price": round(rf_price, 2),
        "Ensemble_Price": round(ensemble_price, 2)
    }


In [17]:
question = {
    "Levy": 800,
    "Manufacturer": "TOYOTA",
    "Model": "Camry",
    "Prod. year": 2015,
    "Category": "Sedan",
    "Leather interior": "Yes",
    "Fuel type": "Petrol",
    "Engine volume": 2.5,
    "Mileage": 120000,
    "Cylinders": 4,
    "Gear box type": "Automatic",
    "Drive wheels": "Front",
    "Doors": "04-May",
    "Wheel": "Left wheel",
    "Color": "White",
    "Airbags": 10
}

predict_car_price(question)


{'Linear_Regression_Price': np.float64(11272.77),
 'Random_Forest_Price': np.float64(24268.93),
 'Ensemble_Price': np.float64(17770.85)}