# **Car Price Prediction**


## Importing Libraries


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pickle

In [2]:
def load_data(file_path):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_path : str, path-like or file-like
        Source of the CSV data; forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    return pd.read_csv(file_path)

def preprocess_data(dataset, feature_columns=(1, 3, 4, 6), target_column=2):
    """Extract features and target from the dataset and label-encode two features.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Raw data; columns are addressed by integer position.
    feature_columns : sequence of int, default (1, 3, 4, 6)
        Positions of the columns used as features, in the order they should
        appear in ``X``. The third and fourth entries must point at the
        categorical columns, because those are the two that get encoded.
    target_column : int, default 2
        Position of the column to predict.

    Returns
    -------
    X : numpy.ndarray
        Feature matrix whose columns 2 and 3 hold integer codes.
    y : numpy.ndarray
        Target values.
    lb : LabelEncoder
        Encoder fitted on feature column 2.
    lb1 : LabelEncoder
        Encoder fitted on feature column 3.

    Raises
    ------
    ValueError
        If fewer than four feature columns are supplied.
    """
    feature_columns = list(feature_columns)
    if len(feature_columns) < 4:
        raise ValueError(
            "feature_columns needs at least 4 entries; positions 2 and 3 are label-encoded"
        )

    # copy=True guarantees a writable array that does not alias the DataFrame,
    # so the in-place encoding below can neither fail on a read-only view nor
    # leak back into `dataset`.
    X = dataset.iloc[:, feature_columns].to_numpy(copy=True)
    y = dataset.iloc[:, target_column].to_numpy()

    # One encoder per categorical feature; both are returned so the same
    # label -> code mapping can be applied to new rows at prediction time.
    lb = LabelEncoder()
    lb1 = LabelEncoder()

    X[:, 2] = lb.fit_transform(X[:, 2])
    X[:, 3] = lb1.fit_transform(X[:, 3])

    return X, y, lb, lb1

def split_data(X, y, test_size=0.05, random_state=0):
    """Split features and target into training and testing subsets.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Target values aligned with the rows of ``X``.
    test_size : float or int, default 0.05
        Forwarded to ``train_test_split``; the default holds out 5% of the
        rows for testing and trains on the remaining 95%.
    random_state : int or None, default 0
        Seed forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]`` as produced by
        ``train_test_split``.
    """
    return train_test_split(X, y, test_size=test_size, random_state=random_state)

def train_model(X_train, y_train, param_grid=None, cv=5):
    """Fit a RandomForestRegressor, choosing hyperparameters by grid search.

    Parameters
    ----------
    X_train : array-like
        Training feature matrix.
    y_train : array-like
        Training target values.
    param_grid : dict or None, default None
        Hyperparameter grid handed to ``GridSearchCV``. When ``None`` the
        search covers ``n_estimators`` in [100, 200, 300] and ``max_depth``
        in [None, 10, 20].
    cv : int, default 5
        Cross-validation setting handed to ``GridSearchCV``.

    Returns
    -------
    RandomForestRegressor
        The ``best_estimator_`` found by the grid search.
    """
    if param_grid is None:
        # Built per call so callers never share (or mutate) a default grid
        param_grid = {
            'n_estimators': [100, 200, 300],
            'max_depth': [None, 10, 20],
        }

    # Fixed seed keeps the fitted forest reproducible between runs
    model = RandomForestRegressor(random_state=0)

    grid_search = GridSearchCV(model, param_grid, cv=cv)
    grid_search.fit(X_train, y_train)

    return grid_search.best_estimator_

def evaluate_model(model, X_test, y_test):
    """Score a fitted model on held-out data.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    X_test : array-like
        Feature matrix to predict on.
    y_test : array-like
        True target values for ``X_test``.

    Returns
    -------
    tuple
        ``(mse, mae, r2)``: mean squared error, mean absolute error and
        R² score of the predictions against ``y_test``, in that order.
    """
    predicted = model.predict(X_test)

    # Every scorer takes (y_true, y_pred); the tuple order fixes the return order
    scorers = (mean_squared_error, mean_absolute_error, r2_score)
    return tuple(score(y_test, predicted) for score in scorers)

def serialize_model(model, filename):
    """Pickle an object to disk.

    Parameters
    ----------
    model : object
        Any picklable object (the notebook uses this for the regressor and
        for the label encoders).
    filename : str or path-like
        Destination file; it is opened in binary write mode, so an existing
        file is overwritten.
    """
    with open(filename, 'wb') as sink:
        pickle.Pickler(sink).dump(model)

def predict_new_data(model, new_data, lb, lb1):
    """Predict the target for a single new observation.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    new_data : sequence
        One row of raw feature values. Positions 2 and 3 hold the
        categorical labels that ``lb`` and ``lb1`` were fitted on. The
        sequence is not modified, so it can be reused across calls.
    lb : object
        Fitted encoder (exposing ``transform``) for position 2.
    lb1 : object
        Fitted encoder (exposing ``transform``) for position 3.

    Returns
    -------
    array-like or str
        The result of ``model.predict`` for the single encoded row. If any
        step raises, the exception message is returned as a string instead,
        so callers should check the type of the result.
    """
    try:
        # Encode a copy: overwriting the caller's list would replace its raw
        # labels with codes and make a second call with the same list fail.
        features = list(new_data)
        features[2] = lb.transform([features[2]])[0]
        features[3] = lb1.transform([features[3]])[0]

        # predict expects a 2-D input, hence the single-row wrapper
        return model.predict([features])
    except Exception as e:
        # Existing contract: report the failure as text rather than raising
        return str(e)



In [3]:
# Main workflow: load -> preprocess -> split -> train -> evaluate.
# Every name assigned here is notebook-global state; later cells read
# `model`, `lb` and `lb1`.
# 'car_data.csv' is a relative path, resolved against the kernel's working directory.
dataset = load_data('car_data.csv')
# lb / lb1 are the fitted encoders for the two categorical features
X, y, lb, lb1 = preprocess_data(dataset)
X_train, X_test, y_train, y_test = split_data(X, y)

# Training runs a hyperparameter search, so this is the slow step of the cell
model = train_model(X_train, y_train)
mse, mae, r2 = evaluate_model(model, X_test, y_test)
# r2 is scaled by 100 so it prints as a percentage; mse and mae are printed unscaled
print(f'MSE: {mse}, MAE: {mae}, R² Score: {r2*100}%')

MSE: 56365266995.58914, MAE: 144893.3964224964, R² Score: 86.10881587234663%


In [4]:
# Serialize the model and label encoders for future use.
# Three separate pickle files are written with relative paths (resolved
# against the kernel's working directory). The encoders are saved alongside
# the regressor because new rows must be encoded with the same fitted
# encoders before they can be passed to the model.
serialize_model(model, 'regressor.pkl')
serialize_model(lb, 'lb.pkl')
serialize_model(lb1, 'lb1.pkl')

In [5]:
# Example: Predicting for new data
# A single raw row; positions 2 and 3 carry the categorical labels that
# lb and lb1 encode.
# NOTE(review): the first two values are presumably model year and kilometres
# driven — confirm against the feature columns selected in preprocessing.
new_data = [2017, 7000, "Petrol", "Manual"]
prediction = predict_new_data(model, new_data, lb, lb1)
# predict_new_data returns an error message string on failure, so a string
# here means the prediction failed rather than produced a price
print("Prediction for new data:", prediction)

Prediction for new data: [587559.61072822]
