In [1]:
import pandas as pd

# Sample data: five records written row-wise, then pivoted into the
# column -> values mapping that the DataFrame is built from.
sample_columns = ['Brand', 'Model', 'Year', 'Kilometers_Driven', 'Mileage', 'Price']
sample_rows = [
    ('BrandA', 'Model1', 2015, 20000, 50, 50000),
    ('BrandB', 'Model2', 2016, 30000, 45, 60000),
    ('BrandC', 'Model3', 2017, 40000, 55, 70000),
    ('BrandA', 'Model1', 2018, 15000, 60, 55000),
    ('BrandD', 'Model4', 2019, 10000, 65, 65000),
]
# zip(*sample_rows) transposes the rows into one tuple per column.
data = {
    column: list(values)
    for column, values in zip(sample_columns, zip(*sample_rows))
}
df = pd.DataFrame(data)

In [2]:
# Show the sample frame as this cell's rich output.
df

Unnamed: 0,Brand,Model,Year,Kilometers_Driven,Mileage,Price
0,BrandA,Model1,2015,20000,50,50000
1,BrandB,Model2,2016,30000,45,60000
2,BrandC,Model3,2017,40000,55,70000
3,BrandA,Model1,2018,15000,60,55000
4,BrandD,Model4,2019,10000,65,65000


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

# Target is the 'Price' column; every remaining column is a feature.
y = df['Price']
X = df.drop(columns='Price')

# Column groups, each routed to its own transformer below.
numerical_features = ['Year', 'Kilometers_Driven', 'Mileage']
categorical_features = ['Brand', 'Model']

# Standardise the numeric columns; one-hot encode the categorical ones.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# instead of raising at transform time.
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# (name, transformer, columns) triples consumed by ColumnTransformer.
transformers = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=transformers)

# Hold out 20% of the rows for evaluation; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [5]:
# Chain preprocessing and the regressor into a single estimator so one
# fit/predict call runs both stages.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
]
model = Pipeline(steps=pipeline_steps)

# Fit on the training split. Kept as the last expression so the fitted
# pipeline is rendered as the cell output.
model.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['Year', 'Kilometers_Driven',
                                                   'Mileage']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Brand', 'Model'])])),
                ('regressor', LinearRegression())])

In [6]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Predict on the held-out test set
y_pred = model.predict(X_test)

# Evaluate the model.
# RMSE is derived from the MSE computed just above instead of calling
# mean_squared_error(..., squared=False): the `squared` keyword was
# deprecated in scikit-learn 1.4 and is scheduled for removal in 1.6, so the
# old call warns or fails on current releases. The value is unchanged.
# NOTE: with 5 sample rows and test_size=0.2 the test set is a single row,
# so MAE and RMSE coincide and say little about generalisation.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")

Mean Absolute Error: 718.2926134745649
Mean Squared Error: 515944.27857212065
Root Mean Squared Error: 718.2926134745649


In [10]:
 # Example new data
new_data = pd.DataFrame([
    {
        'Brand': 'BrandB',
        'Model': 'Model2',
        'Year': 1980,  # far outside the 2015-2019 years in the sample data
        'Kilometers_Driven': 5000,
        'Mileage': 70,
    }
])

# Run the single-row frame through the fitted pipeline and report the first
# (and only) prediction.
predicted_price = model.predict(new_data)
print(f"Predicted Price: {predicted_price[0]}")

    Brand   Model  Year  Kilometers_Driven  Mileage
0  BrandB  Model2  1980               5000       70
Predicted Price: 12356.910985819304


In [41]:
# Inspect the (name, transformer, columns) triples passed to ColumnTransformer.
print(transformers)

[('num', StandardScaler(), ['Year', 'Kilometers_Driven', 'Mileage']), ('cat', OneHotEncoder(handle_unknown='ignore'), ['Brand', 'Model'])]


In [8]:
# Shape of the training features as (rows, columns).
print(X_train.shape)

(4, 5)


In [9]:
# Shape of the training target.
print(y_train.shape)

(4,)


In [11]:
# Display the single-row frame that was passed to model.predict.
new_data

Unnamed: 0,Brand,Model,Year,Kilometers_Driven,Mileage
0,BrandB,Model2,1980,5000,70
