In [1]:
#Importing necessary Libraries
import pandas as pd

In [2]:
#Loading the vehicle listings from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='vehicles_us.csv')
#Summarising the columns: non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB


13 columns, 51525 rows.
Mandatory columns (no missing values) seem to be (Price, Model, Condition, Fuel (economy?), Transmission, Type, Date Posted, Days Listed) 8/13
Columns with missing data are (Model Year, Cylinders, Odometer, Paint Color, Is 4WD) 5/13

Model Year will be changed to an integer.
Fuel being an object will be investigated further.
Is 4WD should be a boolean.
Price and Cylinders will be investigated for a better data type.
Date Posted will be converted to a datetime data type.

In [3]:
#Getting a sample of data
#A fixed random_state makes the displayed rows identical on every re-run of the notebook
df.sample(20, random_state=42)

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
26707,18990,2000.0,ford f-350 sd,good,8.0,diesel,118774.0,automatic,truck,,1.0,2018-09-20,48
41623,17990,,chevrolet traverse,good,6.0,gas,52688.0,automatic,SUV,white,1.0,2019-04-13,11
43882,16595,2012.0,bmw x5,excellent,8.0,gas,73705.0,automatic,SUV,white,1.0,2019-01-27,11
44149,22500,2016.0,chevrolet silverado,like new,6.0,gas,13000.0,automatic,pickup,white,,2018-11-06,8
13632,19950,2015.0,ram 1500,like new,8.0,gas,84882.0,automatic,truck,black,1.0,2018-10-27,21
20844,6995,2010.0,honda odyssey,good,6.0,gas,,automatic,mini-van,black,,2019-03-26,10
19594,15500,,chevrolet silverado 1500,excellent,8.0,gas,150000.0,automatic,pickup,grey,1.0,2019-02-10,28
15334,3200,2003.0,toyota camry,good,4.0,gas,,automatic,sedan,black,,2018-12-04,28
37668,1500,2005.0,chrysler town & country,fair,4.0,gas,196000.0,automatic,mini-van,blue,,2018-07-10,12
23223,7900,2015.0,subaru outback,good,4.0,gas,,automatic,wagon,green,1.0,2018-11-26,107


In [4]:
#Reporting how many values are missing in each column, followed by a blank line
print('Missing values:', df.isna().sum(), sep='\n', end='\n\n')

#Listing the distinct values found in the 'is_4wd' column
print('Unique values in "is_4WD":', df['is_4wd'].unique(), sep='\n')

Missing values:
price               0
model_year       3619
model               0
condition           0
cylinders        5260
fuel                0
odometer         7892
transmission        0
type                0
paint_color      9267
is_4wd          25953
date_posted         0
days_listed         0
dtype: int64

Unique values in "is_4WD":
[ 1. nan]


### Fixing Data

In [5]:
#Changing 'date_posted' to datetime data type
df['date_posted'] = pd.to_datetime(df['date_posted'])

#Changing 'model_year' to int data type; missing years are replaced by the sentinel -1
#NOTE(review): -1 will distort any statistic or plot of 'model_year' unless those rows are
#filtered out first. The nullable 'Int64' dtype would keep them as <NA> instead -- TODO confirm
#that nothing later in the notebook relies on the -1 marker before switching.
df['model_year'] = df['model_year'].fillna(-1).astype('int64')

#Filling missing values in 'is_4wd' with 0 and changing to int data type
df['is_4wd'] = df['is_4wd'].fillna(0).astype('int64')

#A fixed random_state makes the displayed rows identical on every re-run of the notebook
df.sample(20, random_state=42)

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
10867,17400,2012,toyota tundra,excellent,8.0,gas,86551.0,automatic,pickup,white,1,2018-08-16,15
43265,19500,1993,ford mustang,excellent,8.0,gas,116000.0,manual,coupe,blue,0,2018-06-04,56
12181,7550,2013,nissan maxima,good,6.0,gas,175000.0,automatic,sedan,grey,0,2018-12-20,61
47959,4800,-1,ford ranger,excellent,4.0,gas,,automatic,truck,white,0,2018-12-04,47
22036,5500,2015,nissan sentra,good,4.0,gas,116000.0,automatic,sedan,black,0,2019-03-03,10
2343,2000,1997,toyota 4runner,good,6.0,gas,95000.0,manual,SUV,red,1,2019-04-03,37
37915,23495,2014,toyota tacoma,excellent,6.0,gas,100802.0,automatic,truck,grey,1,2018-09-07,32
39500,2990,1997,ford ranger,excellent,4.0,gas,204432.0,automatic,truck,blue,0,2018-05-03,44
12960,1800,2003,honda accord,excellent,4.0,gas,125000.0,automatic,sedan,grey,0,2018-08-30,4
39749,14000,2006,toyota tacoma,good,6.0,gas,240000.0,automatic,truck,silver,1,2018-05-03,177


### Enrich Data


In [6]:
#Adding a 'manufacturer' column holding the first word of 'model'
#The vectorised .str accessor replaces the row-wise apply(lambda ...); an empty or missing
#model yields NaN here instead of raising an exception
df['manufacturer'] = df['model'].str.split(n=1).str[0]

#A fixed random_state makes the displayed rows identical on every re-run of the notebook
df.sample(20, random_state=42)

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed,manufacturer
35336,11900,2003,toyota tacoma,excellent,6.0,gas,183000.0,automatic,pickup,,1,2018-11-10,17,toyota
41122,17490,2016,ford econoline,good,,gas,122974.0,automatic,truck,,0,2018-06-19,24,ford
25315,33990,2018,ford f-150,good,8.0,diesel,23603.0,automatic,truck,silver,1,2018-06-11,22,ford
8158,7498,2013,ford focus,excellent,4.0,gas,78128.0,automatic,hatchback,grey,0,2018-09-30,35,ford
13490,14980,2003,chevrolet silverado 2500hd,like new,8.0,diesel,212000.0,automatic,truck,,1,2019-01-06,73,chevrolet
21347,4999,2010,nissan rogue,good,4.0,gas,168000.0,automatic,SUV,black,0,2019-01-16,14,nissan
49088,2500,2006,nissan sentra,good,4.0,gas,176000.0,automatic,sedan,grey,0,2018-11-24,70,nissan
51316,4999,2010,hyundai elantra,good,4.0,gas,106935.0,automatic,sedan,black,0,2019-02-12,41,hyundai
13379,9800,2014,ford fusion,excellent,4.0,gas,,automatic,sedan,,0,2018-05-05,27,ford
45210,3995,2011,nissan versa,excellent,4.0,gas,155000.0,automatic,hatchback,,0,2018-10-27,44,nissan
