<a href="https://colab.research.google.com/github/Ankushsb62/OASIS-INFOBYTE-DATA-SCIENCE/blob/PROJECT-3/Car_Price_Prediction_with_Machine_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
# Load the dataset from 'car data.csv' (path is relative to the working directory)
df = pd.read_csv('car data.csv')
print(df.head())
# DataFrame.info() writes its summary straight to stdout and returns None,
# so wrapping it in print() would emit a stray "None" after the summary.
df.info()

  Car_Name  Year  Selling_Price  Present_Price  Driven_kms Fuel_Type  \
0     ritz  2014           3.35           5.59       27000    Petrol   
1      sx4  2013           4.75           9.54       43000    Diesel   
2     ciaz  2017           7.25           9.85        6900    Petrol   
3  wagon r  2011           2.85           4.15        5200    Petrol   
4    swift  2014           4.60           6.87       42450    Diesel   

  Selling_type Transmission  Owner  
0       Dealer       Manual      0  
1       Dealer       Manual      0  
2       Dealer       Manual      0  
3       Dealer       Manual      0  
4       Dealer       Manual      0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  3

In [4]:
# Discard every row that has at least one missing value
df = df.dropna(axis=0, how='any')

In [5]:
# Add an 'Age' column: the fixed reference year minus each row's 'Year'
current_year = 2024
df['Age'] = df['Year'].rsub(current_year)

In [6]:
# Column to predict, followed by the predictor columns used as model inputs
target = 'Selling_Price'
features = [
    'Age',
    'Present_Price',
    'Driven_kms',
    'Fuel_Type',
    'Selling_type',
    'Transmission',
    'Owner',
]

In [7]:
# Separate the predictor columns (X) from the target column (y)
X, y = df[features], df[target]

In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Column groups for preprocessing: each group gets its own transformer
categorical_features = [
    'Fuel_Type',
    'Selling_type',
    'Transmission',
]
numeric_features = [
    'Age',
    'Present_Price',
    'Driven_kms',
    'Owner',
]

In [11]:
# Numeric columns: fill missing entries with the median, then standardize
numeric_transformer = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

In [12]:
# Categorical columns: fill missing entries with the literal 'missing', then
# one-hot encode; categories unseen during fit are ignored at transform time
categorical_transformer = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [13]:
# Route each column group through its own transformer
preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

In [14]:
# Chain preprocessing and a seeded 100-tree random forest into one estimator
model = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(n_estimators=100, random_state=42)),
    ]
)

In [15]:
# Train the whole pipeline (preprocessing + regressor) on the training split
model.fit(X=X_train, y=y_train)

In [16]:
# Predict the target for the held-out rows
y_pred = model.predict(X=X_test)

In [17]:
# Score the held-out predictions: mean squared error and R-squared
mse, r2 = (
    mean_squared_error(y_true=y_test, y_pred=y_pred),
    r2_score(y_true=y_test, y_pred=y_pred),
)

In [18]:
# Report both metrics, one per line
print(
    f"Mean Squared Error: {mse}",
    f"R-squared Score: {r2}",
    sep="\n",
)

Mean Squared Error: 0.8674470426229501
R-squared Score: 0.9623431720667264


In [19]:
# Pull the fitted forest's importances and the matching post-encoding column
# names; pipeline steps are looked up by name via indexing
feature_names = model['preprocessor'].get_feature_names_out()
feature_importance = model['regressor'].feature_importances_

In [20]:
# Tabulate name/importance pairs, most important first
importance_df = (
    pd.DataFrame({'feature': feature_names, 'importance': feature_importance})
    .sort_values(by='importance', ascending=False)
)

In [21]:
# Show the heading followed by the first ten rows of the sorted table
print(
    "Top 10 most important features:",
    importance_df.head(10),
    sep="\n",
)

Top 10 most important features:
                         feature  importance
1             num__Present_Price    0.880822
0                       num__Age    0.055407
2                num__Driven_kms    0.035411
10      cat__Transmission_Manual    0.011862
9    cat__Transmission_Automatic    0.005382
5          cat__Fuel_Type_Diesel    0.003920
6          cat__Fuel_Type_Petrol    0.002982
8   cat__Selling_type_Individual    0.002933
7       cat__Selling_type_Dealer    0.001082
3                     num__Owner    0.000186


In [23]:
# A single example car, expressed as one record with the same columns as the
# training features
new_car = pd.DataFrame(
    [
        {
            'Age': 3,
            'Present_Price': 10.5,
            'Driven_kms': 20000,
            'Fuel_Type': 'Petrol',
            'Selling_type': 'Dealer',
            'Transmission': 'Manual',
            'Owner': 0,
        }
    ]
)

In [24]:
# Run the example car through the fitted pipeline and show the single
# predicted value rounded to two decimals
predicted_price = model.predict(X=new_car)
print(f"Predicted selling price: {predicted_price[0]:.2f}")

Predicted selling price: 8.29
