In [62]:
# Importing the required packages and libraries
# we will need numpy and pandas later
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

cars_train_df = pd.read_csv('train.csv')
cars_test_df = pd.read_csv('test.csv')

# Checking the first few rows to confirm it loaded correctly
print(cars_train_df.head())

   id          brand              model  model_year  milage      fuel_type  \
0   0           MINI      Cooper S Base        2007  213000       Gasoline   
1   1        Lincoln              LS V8        2002  143250       Gasoline   
2   2      Chevrolet  Silverado 2500 LT        2002  136731  E85 Flex Fuel   
3   3        Genesis   G90 5.0 Ultimate        2017   19500       Gasoline   
4   4  Mercedes-Benz        Metris Base        2021    7388       Gasoline   

                                              engine  \
0       172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel   
1       252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel   
2  320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...   
3       420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel   
4       208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel   

                     transmission ext_col int_col  \
0                             A/T  Yellow    Gray   
1                             A/T  Silver   Beige   
2                             A/T  

In [64]:
# Creating the Feature Matrix for iris dataset:

# create a python list of feature names that would like to pick from the dataset:
# Start by selecting the relevant columns for prediction
feature_cols = [
    'brand',       # Categorical
    'model',       # Categorical
    'model_year',  # Numeric
    'milage',      # Numeric
    'fuel_type',   # Categorical
    'engine',      # categorical
    'transmission', # Categorical
    'ext_col',     # Categorical (possibly useful for aesthetics affecting price)
    'int_col',     # Categorical
    'accident',    # Categorical
    'clean_title'  # Binary/Categorical
]

# use the above list to select the features from the original DataFrame
X = cars_train_df[feature_cols]  

# print the first 5 rows
X.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes
1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes
2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes
3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes
4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes


In [66]:
# checking the size of Feature Matix X:

print(X.shape)

(188533, 11)


In [68]:
# select a Series of labels (the last column) from the DataFrame
y = cars_train_df['price']

# checking the label vector by printing every 10 values
y[::10]

0          4200
10         7950
20        41998
30        39499
40        12000
          ...  
188490    60867
188500     4300
188510    16499
188520    39998
188530    86900
Name: price, Length: 18854, dtype: int64

In [80]:
# we need to convert to binary for this most basic version
# using one hot encoding
# Check the shape of the new feature matrix
X_encoded = pd.get_dummies(X, drop_first=True)

print(X_encoded.shape)

(188533, 3601)


In [74]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Standardizing the features using StandardScaler (optional, but often helps with convergence)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [76]:
my_linreg = LinearRegression()

my_linreg.fit(X_train_scaled, y_train)

# Making predictions on the test data
y_predict_lr = my_linreg.predict(X_test_scaled)

print(y_predict_lr)

[58228.72403641 69668.81707681 56693.17747579 ... 28216.02590084
 15204.56384298 22065.90570781]


In [78]:
# Calculate the Mean Squared Error (MSE) to evaluate the model's performance
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, y_predict_lr)
print("Mean Squared Error:", mse)

# Calculate the Root Mean Squared Error (RMSE) for easier interpretation
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

Mean Squared Error: 1.1628486743771385e+33
Root Mean Squared Error: 3.410056706826352e+16
