## **CAR PRICES PREDICTION**

In [99]:
import warnings
# Suppress every warning category for the remainder of the kernel session.
# NOTE(review): this blanket filter also hides library deprecation warnings, so
# deprecated API usage further down will not be reported -- consider narrowing it.
warnings.filterwarnings('ignore')

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder

In [100]:
# Read the raw listings from the CSV that sits next to this notebook
# (relative path, so the kernel's working directory must contain the file).
cars = pd.read_csv(filepath_or_buffer='carprices.csv')

# Bare last expression -> rich table display of the loaded frame.
cars

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs)
0,BMW X5,69000,18000,6
1,BMW X5,35000,34000,3
2,BMW X5,57000,26100,5
3,BMW X5,22500,40000,2
4,BMW X5,46000,31500,4
5,Audi A5,59000,29400,5
6,Audi A5,52000,32000,5
7,Audi A5,72000,19300,6
8,Audi A5,91000,12000,8
9,Mercedez Benz C class,67000,22000,6


This dataset lists cars by model, together with their mileage, selling price, and age. <br> The goal is to build a model that predicts the selling price.

In [101]:
# Print the frame's structure: index range, column names, non-null counts,
# dtypes and memory usage.
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Car Model      13 non-null     object
 1   Mileage        13 non-null     int64 
 2   Sell Price($)  13 non-null     int64 
 3   Age(yrs)       13 non-null     int64 
dtypes: int64(3), object(1)
memory usage: 544.0+ bytes


In [102]:
# Data-quality check: count nulls per column and fully duplicated rows.
null_counts = cars.isna().sum()
n_duplicates = cars.duplicated().sum()

# The empty strings joined with sep='\n' reproduce the blank lines around the
# per-column table.
print('Missing Values:', '', null_counts, '', sep='\n')
print('Number of duplicates found:', n_duplicates)

Missing Values:

Car Model        0
Mileage          0
Sell Price($)    0
Age(yrs)         0
dtype: int64

Number of duplicates found: 0


The dataset has no missing values and no duplicate rows.

In [103]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; the text column 'Car Model' is not included in the result.
cars.describe()

Unnamed: 0,Mileage,Sell Price($),Age(yrs)
count,13.0,13.0,13.0
mean,60884.615385,26023.076923,5.307692
std,19185.665055,8003.661021,1.652504
min,22500.0,12000.0,2.0
25%,52000.0,20000.0,5.0
50%,59000.0,26100.0,5.0
75%,72000.0,32000.0,6.0
max,91000.0,40000.0,8.0


#### **Encoding using pandas get_dummies**

In [104]:
# One indicator column per distinct value of 'Car Model'.
dummies = pd.get_dummies(cars.loc[:, 'Car Model'])

# Place the indicator columns to the right of the original ones; rows are
# matched on the shared index.
data_merged = pd.concat(objs=[cars, dummies], axis=1)
data_merged

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs),Audi A5,BMW X5,Mercedez Benz C class
0,BMW X5,69000,18000,6,0,1,0
1,BMW X5,35000,34000,3,0,1,0
2,BMW X5,57000,26100,5,0,1,0
3,BMW X5,22500,40000,2,0,1,0
4,BMW X5,46000,31500,4,0,1,0
5,Audi A5,59000,29400,5,1,0,0
6,Audi A5,52000,32000,5,1,0,0
7,Audi A5,72000,19300,6,1,0,0
8,Audi A5,91000,12000,8,1,0,0
9,Mercedez Benz C class,67000,22000,6,0,0,1


In [105]:
# The text column is now redundant: the indicator columns carry the same
# information. DataFrame.drop returns a NEW frame and leaves `data_merged`
# untouched, so the result is bound to its own name; otherwise the dropped
# version exists only in this cell's output and is lost to later cells.
data_encoded = data_merged.drop(columns=['Car Model'])
data_encoded

Unnamed: 0,Mileage,Sell Price($),Age(yrs),Audi A5,BMW X5,Mercedez Benz C class
0,69000,18000,6,0,1,0
1,35000,34000,3,0,1,0
2,57000,26100,5,0,1,0
3,22500,40000,2,0,1,0
4,46000,31500,4,0,1,0
5,59000,29400,5,1,0,0
6,52000,32000,5,1,0,0
7,72000,19300,6,1,0,0
8,91000,12000,8,1,0,0
9,67000,22000,6,0,0,1


#### **Encoding using sklearn OneHotEncoder**

In [106]:
# Take a real copy: plain assignment (`data_copy = cars`) would only create a
# second name for the same DataFrame, so any later in-place change to
# `data_copy` would silently modify `cars` as well.
data_copy = cars.copy()
data_copy

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs)
0,BMW X5,69000,18000,6
1,BMW X5,35000,34000,3
2,BMW X5,57000,26100,5
3,BMW X5,22500,40000,2
4,BMW X5,46000,31500,4
5,Audi A5,59000,29400,5
6,Audi A5,52000,32000,5
7,Audi A5,72000,19300,6
8,Audi A5,91000,12000,8
9,Mercedez Benz C class,67000,22000,6


In [107]:
cat_columns = ["Car Model"]

# Keep the encoder on its default (sparse) output and densify explicitly with
# .toarray() below. The `sparse=` keyword was renamed to `sparse_output=` in
# scikit-learn 1.2 and later removed, so avoiding the keyword keeps this cell
# runnable on both older and newer releases.
ohe = OneHotEncoder()

# Fit once: learns the distinct categories of each listed column.
ohe.fit(data_copy[cat_columns])

# Names of the generated columns, in the form "<column>_<category>".
# get_feature_names_out replaces the removed get_feature_names and matches the
# call used by the drop='first' encoder later in this notebook.
new_columns = ohe.get_feature_names_out(input_features=cat_columns)

# Transform with the already-fitted encoder (no second fit needed) and wrap the
# dense 0.0/1.0 matrix in a frame that shares the source index, so the concat
# below lines rows up correctly.
columns = pd.DataFrame(ohe.transform(data_copy[cat_columns]).toarray(),
                       columns=new_columns,
                       index=data_copy.index)

# Replace the categorical columns with their encoded counterparts.
new_data = pd.concat([data_copy.drop(cat_columns, axis=1), columns], axis=1)

new_data

Unnamed: 0,Mileage,Sell Price($),Age(yrs),Car Model_Audi A5,Car Model_BMW X5,Car Model_Mercedez Benz C class
0,69000,18000,6,0.0,1.0,0.0
1,35000,34000,3,0.0,1.0,0.0
2,57000,26100,5,0.0,1.0,0.0
3,22500,40000,2,0.0,1.0,0.0
4,46000,31500,4,0.0,1.0,0.0
5,59000,29400,5,1.0,0.0,0.0
6,52000,32000,5,1.0,0.0,0.0
7,72000,19300,6,1.0,0.0,0.0
8,91000,12000,8,1.0,0.0,0.0
9,67000,22000,6,0.0,0.0,1.0


In [109]:
df = cars

# drop='first' omits the indicator for the first category, which avoids
# perfectly collinear dummy columns in a linear model. The encoder is left on
# its default sparse output and densified with .toarray(): the `sparse=` keyword
# was renamed to `sparse_output=` in scikit-learn 1.2 and later removed.
one_hot_encoder = OneHotEncoder(drop='first')

# Encode the "Car Model" column as a dense array of 0.0/1.0 indicators.
car_model_encoded = one_hot_encoder.fit_transform(df[['Car Model']]).toarray()

# Wrap the indicators in a frame that reuses df's index. pd.concat(axis=1)
# aligns on index labels, so without index=df.index a non-default index on `df`
# would produce misaligned, NaN-padded rows.
car_model_frame = pd.DataFrame(
    car_model_encoded,
    columns=one_hot_encoder.get_feature_names_out(['Car Model']),
    index=df.index,
)

# Append the indicators to the original frame (the text column is kept).
encoded_df = pd.concat([df, car_model_frame], axis=1)

encoded_df

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs),Car Model_BMW X5,Car Model_Mercedez Benz C class
0,BMW X5,69000,18000,6,1.0,0.0
1,BMW X5,35000,34000,3,1.0,0.0
2,BMW X5,57000,26100,5,1.0,0.0
3,BMW X5,22500,40000,2,1.0,0.0
4,BMW X5,46000,31500,4,1.0,0.0
5,Audi A5,59000,29400,5,0.0,0.0
6,Audi A5,52000,32000,5,0.0,0.0
7,Audi A5,72000,19300,6,0.0,0.0
8,Audi A5,91000,12000,8,0.0,0.0
9,Mercedez Benz C class,67000,22000,6,0.0,1.0
