#### 🔹 Step 1: Import Libraries and Load Data

In [93]:
import numpy as np
import pandas as pd

# Read the cleaned used-car CSV from the project's data folder and preview the frame.
csv_path = '../data/used_cars_data_cleaned.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0,Brand_Name,Model_Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Maruti,Wagon R LXI CNG,Mumbai,2010,72000.0,CNG,Manual,First,26.60,998.0,58.16,5.0,1.750
1,Hyundai,Creta 1.6 CRDi SX Option,Pune,2015,41000.0,Diesel,Manual,First,19.67,1582.0,126.20,5.0,12.500
2,Honda,Jazz V,Chennai,2011,46000.0,Petrol,Manual,First,18.20,1199.0,88.70,5.0,4.500
3,Maruti,Ertiga VDI,Chennai,2012,87000.0,Diesel,Manual,First,20.77,1248.0,88.76,7.0,6.000
4,Audi,A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670.0,Diesel,Automatic,Second,15.20,1968.0,140.80,5.0,17.740
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7241,Volkswagen,Vento Diesel Trendline,Hyderabad,2011,89411.0,Diesel,Manual,First,20.54,1598.0,103.60,5.0,3.875
7242,Volkswagen,Polo GT TSI,Mumbai,2015,59000.0,Petrol,Automatic,First,17.21,1197.0,103.60,5.0,6.955
7243,Nissan,Micra Diesel XV,Kolkata,2012,28000.0,Diesel,Manual,First,23.08,1461.0,63.10,5.0,4.250
7244,Volkswagen,Polo GT TSI,Pune,2013,52262.0,Petrol,Automatic,Third,17.20,1197.0,103.60,5.0,4.500


#### 🔹 Step 2: Create Target & Feature Split

In [94]:
# Separate the prediction target (`Price`) from the remaining columns.
target_col = 'Price'
y = df[target_col]
X = df.drop(columns=[target_col])

### 🔹 Step 3: Feature Engineering on Numerical Columns

In [95]:
## 3.1 Car Age (used in place of the raw Year column)
# Age is computed against the hard-coded reference year below, not the current date.
CURRENT_YEAR = 2020
X = X.assign(Car_Age=CURRENT_YEAR - X['Year']).drop(columns=['Year'])


In [96]:
## 3.2 Log-transform the features this notebook treats as skewed
# np.log1p (log(1 + x)) is applied; each transformed copy is stored under a `_log` suffix.
for source_col in ('Kilometers_Driven', 'Engine', 'Power'):
    X[f'{source_col}_log'] = np.log1p(X[source_col])

# NOTE(review): `y_log` is computed here, but the train/test split further down passes
# the untransformed `y` — confirm which target the model is meant to learn.
y_log = np.log1p(y)

In [97]:
### 3.3 Drop the raw columns whose log-transformed versions were added in 3.2 (optional)
raw_skewed_cols = ['Kilometers_Driven', 'Engine', 'Power']
X = X.drop(columns=raw_skewed_cols)

### 🔹 Step 4: Ownership Encoding (Ordinal)

In [98]:
# Ordinal ranks for ownership history: 'First' -> 1 ... 'Fourth & Above' -> 4.
owner_labels = ['First', 'Second', 'Third', 'Fourth & Above']
owner_map = {label: rank for rank, label in enumerate(owner_labels, start=1)}

# Series.map yields NaN for any value that is not a key of owner_map.
X['Owner_Type'] = X['Owner_Type'].map(owner_map)


### 🔹 Step 5: Categorical Encoding

In [99]:
# 5.1 One-hot encode the low-cardinality columns.
# drop_first=True omits the first level of each encoded column (k-1 indicators per feature).
low_card_cols = [
    'Fuel_Type',
    'Transmission',
    'Seats',
    'Location',
]
X = pd.get_dummies(data=X, columns=low_card_cols, drop_first=True)

In [100]:
# 5.2 Frequency encoding for the high-cardinality columns:
# each value is replaced by the number of rows in `df` that share it.
# NOTE(review): the counts are taken over the full dataset, i.e. before the train/test
# split performed later in the notebook — confirm this is acceptable for evaluation.
high_card_cols = ['Brand_Name', 'Model_Name']
freq_features = {
    f'{name}_freq': df[name].map(df[name].value_counts())
    for name in high_card_cols
}
X = X.assign(**freq_features)

In [101]:
# Remove the raw brand/model string columns, then preview the feature frame.
text_id_cols = ['Brand_Name', 'Model_Name']
X = X.drop(columns=text_id_cols)
X

Unnamed: 0,Owner_Type,Mileage,Car_Age,Kilometers_Driven_log,Engine_log,Power_log,Fuel_Type_Diesel,Fuel_Type_Electric,Fuel_Type_LPG,Fuel_Type_Petrol,...,Location_Coimbatore,Location_Delhi,Location_Hyderabad,Location_Jaipur,Location_Kochi,Location_Kolkata,Location_Mumbai,Location_Pune,Brand_Name_freq,Model_Name_freq
0,1,26.60,10,11.184435,6.906755,4.080246,False,False,False,False,...,False,False,False,False,False,False,True,False,1444,18
1,1,19.67,5,10.621352,7.367077,4.845761,True,False,False,False,...,False,False,False,False,False,False,False,True,1339,15
2,1,18.20,9,10.736418,7.090077,4.496471,False,False,False,True,...,False,False,False,False,False,False,False,False,741,4
3,1,20.77,8,11.373675,7.130099,4.497139,True,False,False,False,...,False,False,False,False,False,False,False,False,1444,28
4,2,15.20,7,10.613271,7.585281,4.954418,True,False,False,False,...,True,False,False,False,False,False,False,False,285,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7241,1,20.54,9,11.401010,7.377134,4.650144,True,False,False,False,...,False,False,True,False,False,False,False,False,373,8
7242,1,17.21,5,10.985310,7.088409,4.650144,False,False,False,True,...,False,False,False,False,False,False,True,False,373,11
7243,1,23.08,8,10.239996,7.287561,4.160444,True,False,False,False,...,False,False,False,False,False,True,False,False,117,13
7244,3,17.20,7,10.864044,7.088409,4.650144,False,False,False,True,...,False,False,False,False,False,False,False,True,373,11


### 🔹 Step 6: Feature Interaction (Optional but Powerful)

In [102]:
# Interaction feature: ratio of the two log-transformed columns.
# NOTE(review): despite the name, this divides log1p(Power) by log1p(Engine); it is not
# raw Power divided by raw Engine — confirm the intended definition.
X['Power_per_CC'] = X['Power_log'].div(X['Engine_log'])

### 🔹 Step 7: Handle Missing Values (If Any)

In [103]:
# Impute numeric gaps with each column's own median.
# Only int64/float64 columns are selected; columns of any other dtype are left untouched.
num_cols = X.select_dtypes(include=['int64', 'float64']).columns
column_medians = X[num_cols].median()
X[num_cols] = X[num_cols].fillna(column_medians)



In [104]:
# Fill categorical missing values with each column's most frequent value (mode).
cat_cols = X.select_dtypes(include=['object', 'category']).columns
for col in cat_cols:
    modes = X[col].mode()
    # mode() ignores NaN by default, so an all-NaN column yields an empty result and
    # indexing it with [0] would raise. Such a column has no mode to fill with; skip it.
    if modes.empty:
        continue
    X[col] = X[col].fillna(modes.iloc[0])


### 🔹 Step 8: Feature Scaling (For Linear / Distance Models)

In [105]:
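## Scaling is left disabled: the only model fitted below is a RandomForestRegressor (tree-based).
## Uncomment the lines below if a linear or distance-based model is added.
## from sklearn.preprocessing import StandardScaler

## scaler = StandardScaler()
## X_scaled = scaler.fit_transform(X)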


### 🔹 Step 9: Train-Test Split

In [106]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the raw `y` is split here, not the `y_log` computed in step 3.2.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [None]:
## 🔹 Step 10: Tree-Based Models (No Scaling Needed)
## 10.1 Random Forest

from sklearn.ensemble import RandomForestRegressor

# Forest settings: 200 trees, depth capped at 15, fixed seed, n_jobs=-1 for parallel fitting.
rf_settings = {
    'n_estimators': 200,
    'max_depth': 15,
    'random_state': 42,
    'n_jobs': -1,
}
rf = RandomForestRegressor(**rf_settings)

rf.fit(X_train, y_train)


NameError: name 'evaluate_model' is not defined

In [108]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

def evaluate_model(y_true, y_pred, model_name):
    """Print RMSE, MAE and R2 for a set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        model_name: Label printed on the first line of the report.

    Returns:
        None. The report is written to standard output, followed by a blank line.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    scores = {
        "RMSE": np.sqrt(squared_error),
        "MAE ": mean_absolute_error(y_true, y_pred),
        "R2  ": r2_score(y_true, y_pred),
    }

    report_lines = [f"{model_name}"]
    report_lines.extend(f"{label}: {value:.3f}" for label, value in scores.items())
    # The trailing "\n" plus print's own newline reproduces the blank line after the report.
    print("\n".join(report_lines) + "\n")

# Report test-set metrics for the fitted random forest.
evaluate_model(y_true=y_test, y_pred=rf.predict(X_test), model_name="Random Forest")


Random Forest
RMSE: 4.474
MAE : 1.640
R2  : 0.850



In [110]:
# Keep the random forest's test-set predictions; the plotting cell below reads `y_pred`.
y_pred = rf.predict(X_test)

In [None]:
import matplotlib.pyplot as plt

# Predicted vs. actual test-set prices; points on the dashed red diagonal are exact predictions.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(y_test, y_pred, alpha=0.5)

price_range = [y_test.min(), y_test.max()]
ax.plot(price_range, price_range, 'r--')

ax.set_xlabel("Actual Price")
ax.set_ylabel("Predicted Price")
ax.set_title("Actual vs Predicted Price")
plt.show()
