In [103]:
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [104]:
url = "https://raw.githubusercontent.com/Bhadiyadra-Sarthak/EECS-3401-final/main/car_web_scraped_dataset.csv"
cars = pd.read_csv(url, sep=',')

#Removing all the data before year 2021
cars = cars[(cars.year >= 2021)]

#backup of the current dataset
cars_backup = cars

In [105]:
cars

Unnamed: 0,name,year,miles,color,condition,price
0,Kia Forte,2022,"41,406 miles","Gray exterior, Black interior","No accidents reported, 1 Owner","$15,988"
1,Chevrolet Silverado 1500,2021,"15,138 miles","White exterior, Black interior","1 accident reported, 1 Owner","$38,008"
2,Toyota RAV4,2022,"32,879 miles","Silver exterior, Unknown interior","No accidents reported, 1 Owner","$24,988"
13,Mercedes-Benz GLC,2021,"38,760 miles","Black exterior, Black interior","No accidents reported, 1 Owner","$29,998"
15,Honda Civic,2021,"22,850 miles","White exterior, Black interior","No accidents reported, 1 Owner","$21,998"
...,...,...,...,...,...,...
2831,Ford Escape,2022,"26,521 miles","Gray exterior, Black interior","No accidents reported, 1 Owner","$28,468"
2832,Volkswagen Tiguan,2021,"29,540 miles","Black exterior, Gray interior","No accidents reported, 1 Owner","$19,500"
2837,Mercedes-Benz GLC,2022,"27,894 miles","Gray exterior, Gray interior","No accidents reported, 2 Owners","$29,999"
2838,Honda CR-V,2021,"50,220 miles","Gray exterior, Unknown interior","No accidents reported, 1 Owner","$22,992"


In [106]:
cars.head()

Unnamed: 0,name,year,miles,color,condition,price
0,Kia Forte,2022,"41,406 miles","Gray exterior, Black interior","No accidents reported, 1 Owner","$15,988"
1,Chevrolet Silverado 1500,2021,"15,138 miles","White exterior, Black interior","1 accident reported, 1 Owner","$38,008"
2,Toyota RAV4,2022,"32,879 miles","Silver exterior, Unknown interior","No accidents reported, 1 Owner","$24,988"
13,Mercedes-Benz GLC,2021,"38,760 miles","Black exterior, Black interior","No accidents reported, 1 Owner","$29,998"
15,Honda Civic,2021,"22,850 miles","White exterior, Black interior","No accidents reported, 1 Owner","$21,998"


In [107]:
cars.describe()

Unnamed: 0,year
count,1164.0
mean,2021.647766
std,0.802824
min,2021.0
25%,2021.0
50%,2021.0
75%,2022.0
max,2024.0


In [108]:
cars.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1164 entries, 0 to 2839
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   name       1164 non-null   object
 1   year       1164 non-null   int64 
 2   miles      1164 non-null   object
 3   color      1164 non-null   object
 4   condition  1164 non-null   object
 5   price      1164 non-null   object
dtypes: int64(1), object(5)
memory usage: 63.7+ KB


In [109]:
# We will change the object type to int64 for our numerical data such as miles and prices
cars['miles'] = cars['miles'].str.strip('miles').str.replace(',','')
cars['miles'] = cars['miles'].astype('int64')

In [110]:
cars["miles"]

0       41406
1       15138
2       32879
13      38760
15      22850
        ...  
2831    26521
2832    29540
2837    27894
2838    50220
2839    26510
Name: miles, Length: 1164, dtype: int64

In [111]:
cars['price'] = cars['price'].str.replace('$', '').str.replace(',','').str.strip()
cars['price'] = cars['price'].astype('int64')
cars['price']

0       15988
1       38008
2       24988
13      29998
15      21998
        ...  
2831    28468
2832    19500
2837    29999
2838    22992
2839    24135
Name: price, Length: 1164, dtype: int64

In [112]:
cars.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1164 entries, 0 to 2839
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   name       1164 non-null   object
 1   year       1164 non-null   int64 
 2   miles      1164 non-null   int64 
 3   color      1164 non-null   object
 4   condition  1164 non-null   object
 5   price      1164 non-null   int64 
dtypes: int64(3), object(3)
memory usage: 63.7+ KB


In [113]:
cars["name"].value_counts()

name
Toyota Corolla           75
Toyota RAV4              48
Subaru Outback           38
Kia Forte                35
Mercedes-Benz C-Class    29
                         ..
Audi A4                   1
Porsche Cayenne           1
GMC Yukon                 1
Maserati Levante          1
BMW 2 Series              1
Name: count, Length: 169, dtype: int64

In [114]:
# Clearing color column
cars['exterior'] = cars['color'].str.extract(r'(\D+),')[0].astype(object)
cars['interior'] = cars['color'].str.extract(r',(\D+)')[0].astype(object)
cars['exterior'] = cars['exterior'].str.strip('exterior').str.replace(' ','')
cars['interior'] = cars['interior'].str.strip('interior').str.replace(' ','')
cars = cars.drop(columns=['color'])
cars.head()

Unnamed: 0,name,year,miles,condition,price,exterior,interior
0,Kia Forte,2022,41406,"No accidents reported, 1 Owner",15988,Gray,Black
1,Chevrolet Silverado 1500,2021,15138,"1 accident reported, 1 Owner",38008,White,Black
2,Toyota RAV4,2022,32879,"No accidents reported, 1 Owner",24988,Silver,Unknown
13,Mercedes-Benz GLC,2021,38760,"No accidents reported, 1 Owner",29998,Black,Black
15,Honda Civic,2021,22850,"No accidents reported, 1 Owner",21998,White,Black


In [115]:
# Clearnig Condition column
cars['accidents'] = cars['condition'].str.extract(r'(\d+)\s*accident').replace(np.nan,0).astype(int)
cars['previous_owners'] = cars['condition'].str.extract(r'reported,\s*(\d+)').astype(int)
cars = cars.drop(columns=['condition'])

In [117]:
# Finished dataset
cars
cars.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1164 entries, 0 to 2839
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   name             1164 non-null   object
 1   year             1164 non-null   int64 
 2   miles            1164 non-null   int64 
 3   price            1164 non-null   int64 
 4   exterior         1164 non-null   object
 5   interior         1164 non-null   object
 6   accidents        1164 non-null   int32 
 7   previous_owners  1164 non-null   int32 
dtypes: int32(2), int64(3), object(3)
memory usage: 72.8+ KB
