# Car Prices Prediction using Linear Regression

**Step 1: Import necessary Libraries**

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import joblib

**Step 2: Load the Dataset**

In [2]:
# Load the dataset
file_path = 'data/Nigerian_Car_Prices.csv'
car_prices = pd.read_csv(file_path)

# Display the first few rows of the dataset
car_prices.head()



Unnamed: 0.1,Unnamed: 0,Make,Year of manufacture,Condition,Mileage,Engine Size,Fuel,Transmission,Price,Build
0,0,Toyota,2007.0,Nigerian Used,166418.0,2400.0,Petrol,Automatic,3120000,
1,1,Lexus,,,138024.0,,,Automatic,5834000,
2,2,Mercedes-Benz,2008.0,Nigerian Used,376807.0,3000.0,Petrol,Automatic,3640000,
3,3,Lexus,,,213362.0,,,Automatic,3594000,
4,4,Mercedes-Benz,,,106199.0,,,Automatic,8410000,


**Step 3: Handle Missing Values**

In [3]:
# Check for missing values
car_prices.isnull().sum()

# Handle Missing Values
car_prices.fillna({
    'Year of manufacture': car_prices['Year of manufacture'].mode()[0],
    'Condition': 'Unknown',
    'Mileage': car_prices['Mileage'].mean(),
    'Engine Size': car_prices['Engine Size'].mean(),
    'Fuel': 'Unknown',
    'Build': 'Unknown'
}, inplace=True)

# Clean the 'Price' column
car_prices['Price'] = car_prices['Price'].str.replace(',', '').astype(float)

# Check again for missing values
car_prices.isnull().sum()


Unnamed: 0              0
Make                    0
Year of manufacture     0
Condition               0
Mileage                 0
Engine Size             0
Fuel                    0
Transmission           20
Price                   0
Build                   0
dtype: int64

**Step 4: Encode Categorical Variables**

In [7]:
# One-hot encode categorical variables
car_prices_encoded = pd.get_dummies(car_prices, columns=['Make', 'Condition', 'Fuel', 'Transmission', 'Build'], drop_first=True)

# Save the one-hot encoder columns
one_hot_columns = car_prices_encoded.columns.tolist()
one_hot_columns.remove('Price')

# Display the first few rows of the encoded dataset
car_prices_encoded.head()


Unnamed: 0.1,Unnamed: 0,Year of manufacture,Mileage,Engine Size,Price,Make_Audi,Make_BMW,Make_Bentley,Make_Buick,Make_Cadillac,...,Condition_Nigerian Used,Condition_Unknown,Fuel_Electric,Fuel_Hybrid,Fuel_Petrol,Fuel_Unknown,Transmission_Automatic,Transmission_CVT,Transmission_Manual,Build_Unknown
0,0,2007.0,166418.0,2400.0,3120000.0,False,False,False,False,False,...,True,False,False,False,True,False,True,False,False,True
1,1,2007.0,138024.0,3274.976562,5834000.0,False,False,False,False,False,...,False,True,False,False,False,True,True,False,False,True
2,2,2008.0,376807.0,3000.0,3640000.0,False,False,False,False,False,...,True,False,False,False,True,False,True,False,False,True
3,3,2007.0,213362.0,3274.976562,3594000.0,False,False,False,False,False,...,False,True,False,False,False,True,True,False,False,True
4,4,2007.0,106199.0,3274.976562,8410000.0,False,False,False,False,False,...,False,True,False,False,False,True,True,False,False,True


**Step 5: Split the Data**

In [8]:
# Given 'Price' is the target variable and others are features
X = car_prices_encoded.drop(columns='Price')
y = car_prices_encoded['Price']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



**Step 6: Train the Model**

In [9]:
# Initialize the model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("Root Mean Square Error:", rmse)

# Save the trained model and the columns
joblib.dump((model, one_hot_columns), "Nigerian_Car_Prices_model.joblib")

Root Mean Square Error: 3503647.542453334


['Nigerian_Car_Prices_model.joblib']