In [None]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime — confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning libraries
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.svm import SVR

In [None]:
# Load the car-price CSV into a DataFrame.
# The path is absolute and points into the Google Drive mount under /content/drive.
df = pd.read_csv("/content/drive/MyDrive/car_prices.csv")

# Preview the first five rows as plain text
print(df.head())

   year   make                model        trim   body transmission  \
0  2015    Kia              Sorento          LX    SUV    automatic   
1  2015    Kia              Sorento          LX    SUV    automatic   
2  2014    BMW             3 Series  328i SULEV  Sedan    automatic   
3  2015  Volvo                  S60          T5  Sedan    automatic   
4  2014    BMW  6 Series Gran Coupe        650i  Sedan    automatic   

                 vin state  condition  odometer  color interior  \
0  5xyktca69fg566472    ca        5.0   16639.0  white    black   
1  5xyktca69fg561319    ca        5.0    9393.0  white    beige   
2  wba3c1c51ek116351    ca       45.0    1331.0   gray    black   
3  yv1612tb4f1310987    ca       41.0   14282.0  white    black   
4  wba6b2c57ed129731    ca       43.0    2641.0   gray    black   

                                   seller      mmr  sellingprice  \
0                 kia motors america  inc  20500.0       21500.0   
1                 kia motors ameri

In [None]:
# Report the dataset dimensions (number of rows and columns)
print(f"Dataset contains {df.shape[0]} rows and {df.shape[1]} columns.")

# Show each column's dtype and non-null count.
# df.info() prints its report itself, so it is not wrapped in print().
df.info()

Dataset contains 558837 rows and 16 columns.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 558837 entries, 0 to 558836
Data columns (total 16 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   year          558837 non-null  int64  
 1   make          548536 non-null  object 
 2   model         548438 non-null  object 
 3   trim          548186 non-null  object 
 4   body          545642 non-null  object 
 5   transmission  493485 non-null  object 
 6   vin           558833 non-null  object 
 7   state         558837 non-null  object 
 8   condition     547017 non-null  float64
 9   odometer      558743 non-null  float64
 10  color         558088 non-null  object 
 11  interior      558088 non-null  object 
 12  seller        558837 non-null  object 
 13  mmr           558799 non-null  float64
 14  sellingprice  558825 non-null  float64
 15  saledate      558825 non-null  object 
dtypes: float64(4), int64(1), object(11)
memory usag

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max);
# with default arguments describe() reports on the numeric columns only.
print(df.describe())

                year      condition       odometer            mmr  \
count  558837.000000  547017.000000  558743.000000  558799.000000   
mean     2010.038927      30.672365   68320.017767   13769.377495   
std         3.966864      13.402832   53398.542821    9679.967174   
min      1982.000000       1.000000       1.000000      25.000000   
25%      2007.000000      23.000000   28371.000000    7100.000000   
50%      2012.000000      35.000000   52254.000000   12250.000000   
75%      2013.000000      42.000000   99109.000000   18300.000000   
max      2015.000000      49.000000  999999.000000  182000.000000   

        sellingprice  
count  558825.000000  
mean    13611.358810  
std      9749.501628  
min         1.000000  
25%      6900.000000  
50%     12100.000000  
75%     18200.000000  
max    230000.000000  


In [None]:
# Count the missing (null) entries in each column
print(df.isnull().sum())

year                0
make            10301
model           10399
trim            10651
body            13195
transmission    65352
vin                 4
state               0
condition       11820
odometer           94
color             749
interior          749
seller              0
mmr                38
sellingprice       12
saledate           12
dtype: int64


In [None]:
# Discard every row that has at least one missing value, modifying df in place
df.dropna(axis=0, how="any", inplace=True)

# Re-count nulls per column to confirm none remain
print(df.isna().sum())

year            0
make            0
model           0
trim            0
body            0
transmission    0
vin             0
state           0
condition       0
odometer        0
color           0
interior        0
seller          0
mmr             0
sellingprice    0
saledate        0
dtype: int64


In [None]:
# Remove columns that are not used as model inputs. Names that are absent
# from the frame are skipped, so re-running this cell does not raise.
# NOTE(review): 'sellingprice' is removed while 'mmr' (renamed below) is used as
# the regression target in later cells — confirm that this is the intended target.
columns_to_drop = ['vin', 'state', 'sellingprice', 'saledate']
df = df.drop(columns=columns_to_drop, errors='ignore')

# Give the 'mmr' column the more descriptive name 'estimated_value'
if 'mmr' not in df.columns:
    print("'mmr' column not found in the dataset.")
else:
    df = df.rename(columns={'mmr': 'estimated_value'})

# Preview the frame to check that the changes were applied
print(df.head())

   year   make                model        trim   body transmission  \
0  2015    Kia              Sorento          LX    SUV    automatic   
1  2015    Kia              Sorento          LX    SUV    automatic   
2  2014    BMW             3 Series  328i SULEV  Sedan    automatic   
3  2015  Volvo                  S60          T5  Sedan    automatic   
4  2014    BMW  6 Series Gran Coupe        650i  Sedan    automatic   

   condition  odometer  color interior  \
0        5.0   16639.0  white    black   
1        5.0    9393.0  white    beige   
2       45.0    1331.0   gray    black   
3       41.0   14282.0  white    black   
4       43.0    2641.0   gray    black   

                                   seller  estimated_value  
0                 kia motors america  inc          20500.0  
1                 kia motors america  inc          20800.0  
2  financial services remarketing (lease)          31900.0  
3                 volvo na rep/world omni          27500.0  
4  financial se

In [None]:
# Encoding Categorical Variables
# Every object (string) column is replaced by integer codes via label encoding.
# A separate LabelEncoder is fitted for each column and stored in
# `label_encoders`, keyed by column name, so the codes can later be mapped
# back with label_encoders[col].inverse_transform(...) or applied to new data.
# (Reusing one encoder for all columns keeps only the classes of the last
# column it was fitted on.)
label_encoders = {}

for col in df.select_dtypes(include=['object']).columns:
    # `label_encoder` ends up bound to the encoder of the last column processed
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    label_encoders[col] = label_encoder

# After the loop no object columns remain, so get_dummies has nothing left to
# one-hot encode here; df_encoded is kept for compatibility and should hold
# the same columns as df.
# NOTE(review): later cells build X and y from df, not from df_encoded.
df_encoded = pd.get_dummies(df, drop_first=True, sparse=True)

# Display the first few rows of the label-encoded frame
print(df.head())

   year  make  model  trim  body  transmission  condition  odometer  color  \
0  2015    24    637   818    35             0        5.0   16639.0     17   
1  2015    24    637   818    35             0        5.0    9393.0     17   
2  2014     3      8   253    36             0       45.0    1331.0      7   
3  2015    51    575  1212    36             0       41.0   14282.0     17   
4  2014     3     33   335    36             0       43.0    2641.0      7   

   interior  seller  estimated_value  
0         1    5943          20500.0  
1         0    5943          20800.0  
2         1    4090          31900.0  
3         1   11541          27500.0  
4         1    4090          66000.0  


In [None]:
# Re-inspect dtypes and non-null counts after cleaning and encoding
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 472325 entries, 0 to 558836
Data columns (total 12 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   year             472325 non-null  int64  
 1   make             472325 non-null  int64  
 2   model            472325 non-null  int64  
 3   trim             472325 non-null  int64  
 4   body             472325 non-null  int64  
 5   transmission     472325 non-null  int64  
 6   condition        472325 non-null  float64
 7   odometer         472325 non-null  float64
 8   color            472325 non-null  int64  
 9   interior         472325 non-null  int64  
 10  seller           472325 non-null  int64  
 11  estimated_value  472325 non-null  float64
dtypes: float64(3), int64(9)
memory usage: 46.8 MB


In [None]:
# Split the frame into the regression target (y) and the predictors (X)
y = df['estimated_value']
X = df.drop(columns=['estimated_value'])

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# Report how many samples ended up in each partition
print(f"Training set: {X_train.shape[0]} samples")
print(f"Testing set: {X_test.shape[0]} samples")

Training set: 377860 samples
Testing set: 94465 samples


In [None]:
# Feature Scaling
# Learn the scaling statistics from the training features only,
# so nothing about the test set influences the transformation
scaler = StandardScaler().fit(X_train)

# Standardize the training features with the fitted scaler
X_train_scaled = scaler.transform(X_train)

# Standardize the test features with the same training-derived statistics
X_test_scaled = scaler.transform(X_test)

In [None]:
# Build a support vector regressor with an RBF kernel
svr_model = SVR(
    C=1,
    epsilon=0.1,
    kernel='rbf',
)
# Fit the regressor on the standardized training features.
# NOTE(review): the training split holds several hundred thousand rows; scikit-learn
# documents kernel SVR fit time as growing more than quadratically with the
# number of samples, so expect this cell to run for a long time.
# NOTE(review): y_train is passed in its original units while the features are
# standardized — verify that C and epsilon suit the target's scale.
svr_model.fit(X_train_scaled, y_train)

In [None]:
# Predict the target for the standardized test features
y_pred = svr_model.predict(X_test_scaled)

# Print the predictions; NumPy abbreviates long arrays with "...",
# so only the first and last few values are shown
print("Predicted car prices:", y_pred)

Predicted car prices: [ 6421.08872928 15036.24267761 15644.53519686 ... 12685.96729953
 14210.99545885 10851.56700086]


In [None]:
# Mean squared error between the held-out targets and the model's predictions
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error: {mse:.2f}")

Mean Squared Error: 63664462.16


In [None]:
# Root mean squared error: the square root of the MSE,
# which puts the error back in the same units as the target
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error: {rmse:.2f}")

Root Mean Squared Error: 7979.00


In [None]:
# Coefficient of determination (R^2) on the test set.
# R^2 is a goodness-of-fit score (higher is better) rather than an error measure;
# the printed label is left unchanged.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f"R-Squared Error: {r2:.2f}")

R-Squared Error: 0.29
