In [25]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from google.colab import files
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer


data = df


print("Missing values in the dataset:")
print(data.isnull().sum())


X = data[['name', 'company', 'year', 'kms_driven', 'fuel_type']]
y = data['Price']


X['year'] = pd.to_numeric(X['year'], errors='coerce')
X['kms_driven'] = pd.to_numeric(X['kms_driven'], errors='coerce')


y = y.str.replace(',', '')
y = pd.to_numeric(y, errors='coerce')


y = y.fillna(y.mean())


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, ['year', 'kms_driven']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['name', 'company', 'fuel_type'])
    ])

model = Pipeline(steps=[('preprocessor', preprocessor),
                         ('regressor', LinearRegression())])


model.fit(X_train, y_train)


y_pred = model.predict(X_test)


mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('Mean Squared Error:', mse)
print('R-squared:', r2)

example = pd.DataFrame({
    'name': ['Ford EcoSport'],
    'company': ['Ford'],
    'year': [2017],
    'kms_driven': [39000],
    'fuel_type': ['Petrol']
})

predicted_price = model.predict(example)
print('Predicted Price for car:', predicted_price[0])

Missing values in the dataset:
name           0
company        0
year           0
Price          0
kms_driven    52
fuel_type     55
dtype: int64
Mean Squared Error: 38097261006.51037
R-squared: 0.6657668953574922
Predicted Price for car: 490000.63535580435


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['year'] = pd.to_numeric(X['year'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['kms_driven'] = pd.to_numeric(X['kms_driven'], errors='coerce')


In [23]:
from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('Mean Squared Error:', mse)
print('R-squared:', r2)


Mean Squared Error: 38097261006.51037
R-squared: 0.6657668953574922
