In [3]:
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv("dataset.csv")

# Print a quick structural overview: dimensions, per-column null counts,
# column dtypes, and the first rows. Each entry is (label, value) and is
# printed in order with the label first.
overview = (
    ("Shape:", df.shape),
    ("Missing values:\n", df.isnull().sum()),
    ("Data types:\n", df.dtypes),
    ("Sample data:\n", df.head()),
)
for label, section in overview:
    print(label, section)


Shape: (1002, 17)
Missing values:
 name                0
description        56
make                0
model               0
year                0
price              23
engine              2
cylinders         105
fuel                7
mileage            34
transmission        2
trim                1
body                3
doors               7
exterior_color      5
interior_color     38
drivetrain          0
dtype: int64
Data types:
 name               object
description        object
make               object
model              object
year                int64
price             float64
engine             object
cylinders         float64
fuel               object
mileage           float64
transmission       object
trim               object
body               object
doors             float64
exterior_color     object
interior_color     object
drivetrain         object
dtype: object
Sample data:
                               name  \
0     2024 Jeep Wagoneer Series II   
1  2024 Jeep Grand 

In [4]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Drop the columns that are not used as features, then drop every row that
# still has a missing value (simple, but it discards data; imputation is the
# alternative). `df` is rebound to the result instead of being mutated with
# inplace=True. This cell expects `df` to still contain 'name' and
# 'description'; running it a second time without reloading the data raises
# KeyError because those columns are already gone.
df = df.drop(columns=['name', 'description']).dropna()

# Encode categorical (object-dtype) columns as integer codes.
# A separate LabelEncoder is fitted and kept for each column so the
# category <-> code mapping stays recoverable afterwards, e.g.
# label_encoders['make'].classes_ or label_encoders['make'].inverse_transform(...).
# Re-fitting a single shared encoder would keep only the last column's mapping.
categorical_cols = df.select_dtypes(include='object').columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Standardise the selected numeric columns (zero mean, unit variance).
# NOTE(review): the scaler is fitted on all rows; if a held-out split is made
# afterwards, its statistics include the held-out rows. Consider fitting the
# scaler on the training portion only (e.g. inside a Pipeline).
scaler = StandardScaler()
numeric_cols = ['year', 'mileage']
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Features and target: every column except "price" is a predictor.
X = df.drop(columns="price")
y = df["price"]

# Train-test split: hold out 20% of the rows; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model. The forest is seeded as well: without random_state the
# bootstrap samples and feature subsets change on every run, so the reported
# metrics would not be reproducible even though the split is.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate on the held-out rows.
y_pred = model.predict(X_test)

# RMSE is computed as sqrt(MSE). The `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6
# (passing it raises TypeError there); taking the root works on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print("RMSE:", rmse)
print("R^2 Score:", r2_score(y_test, y_pred))
