In [1]:
import itertools
import pandas as pd
import numpy as np
import scipy.stats
import re
import math
import matplotlib as mlp
import matplotlib.pyplot as plt
import matplotlib.backends.backend_agg
import matplotlib.figure
import seaborn as sns
import datetime
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer
from sklearn.metrics import roc_curve, roc_auc_score

In [21]:
# Read the raw listings and discard the exported index column in one chained step.
cars_dataset = (
    pd.read_csv('C:/Users/.../cars-features-dataset.csv')
    .drop(columns='Unnamed: 0')
)
print(cars_dataset.shape)
cars_dataset.head()

(16920, 18)


Unnamed: 0,Make,Model,Year,Mileage,Transmission,Engine,Exterior Color,Interior Color,MPG,Fuel Type,Drive Type,Location (City),Location (State),Style,Condition (Accidents),Options Level,Bed Length,Price
0,Ford,Focus,2014.0,35795,Automatic,2.0L Inline-4 Gas,,Charcoal Black,26 cty / 36 hwy,Gas,FWD,Tacoma,WA,SE Hatchback,2 reported accidents,Standard,,7999
1,Ford,F-150,2015.0,128146,Automatic,2.7L V-6 Gas Turbocharged,Unknown,Medium Earth Gray,19 cty / 26 hwy,Gas,RWD,Waxahachie,TX,XL SuperCrew 6.5\' Box RWD,0 reported accidents,,Standard Bed,15991
2,Mercedes-Benz,E-Class,2004.0,135000,Automatic,5.0L V-8 Gas,Black,Beige,16 cty / 22 hwy,Gas,AWD,Covina,CA,E 500 4MATIC Wagon,0 reported accidents,Standard,,5950
3,Hyundai,Elantra,2011.0,135670,Automatic,1.8L Inline-4 Gas,,Gray,28 cty / 38 hwy,Gas,FWD,Miami,FL,,0 reported accidents,Standard,,3999
4,Toyota,Prius,2012.0,152543,Automatic,1.8L Inline-4 Hybrid,White,Biege,51 cty / 48 hwy,Hybrid,FWD,Byhalia,MS,Five,1 reported accidents,Standard,,6500


## Exploratory Data Analysis

In [26]:
# Count the missing values in every column (sum runs down the rows by default).
cars_dataset.isna().sum()

Make                       896
Model                      923
Year                       822
Mileage                    349
Transmission               386
Engine                    3545
Exterior Color            5840
Interior Color            1285
MPG                        367
Fuel Type                  336
Drive Type                 284
Location (City)            713
Location (State)           748
Style                     1747
Condition (Accidents)      781
Options Level             2241
Bed Length               15641
Price                      916
dtype: int64

In [22]:
# Rank the columns from most to fewest missing values.
cols_by_nulls = (
    cars_dataset
    .isna()
    .sum()
    .sort_values(ascending=False)
)
cols_by_nulls

Bed Length               15641
Exterior Color            5840
Engine                    3545
Options Level             2241
Style                     1747
Interior Color            1285
Model                      923
Price                      916
Make                       896
Year                       822
Condition (Accidents)      781
Location (State)           748
Location (City)            713
Transmission               386
MPG                        367
Mileage                    349
Fuel Type                  336
Drive Type                 284
dtype: int64

The large number of null values comes from owners or dealers leaving specifications blank when creating each car listing.
Some of these rows or columns will be dropped, but most missing values will be kept and labeled as a separate category.

In [23]:
# Keep only the listings that specify both a make and a model.
cars_dataset = cars_dataset.dropna(subset=['Make', 'Model'])
cars_dataset.shape

(15983, 18)

Make and Model are essential identifying information for each listing, so the rows where either Make or Model is missing were dropped.

In [24]:
# Summarise how sparsely 'Bed Length' is populated before deciding its fate.
bed_length = cars_dataset["Bed Length"]
print("'Bed Length' column has", bed_length.isna().sum(), "null values.")
print("The other ones (", bed_length.count(), ") are:", sep="")
bed_length.value_counts()

'Bed Length' column has 14752 null values.
The other ones (1231) are:


Short Bed       576
Standard Bed    536
Long Bed        119
Name: Bed Length, dtype: int64

As seen above, 'Bed Length' is null for almost all rows (14,752 of 15,983), so the column will be discarded.

In [25]:
# Discard the 'Bed Length' column.
# Reassigning (instead of inplace=True) together with errors='ignore' makes this
# cell safe to re-run: a second execution is a no-op rather than a KeyError
# raised because the column has already been removed.
cars_dataset = cars_dataset.drop(columns='Bed Length', errors='ignore')
cars_dataset.columns

Index(['Make', 'Model', 'Year', 'Mileage', 'Transmission', 'Engine',
       'Exterior Color', 'Interior Color', 'MPG', 'Fuel Type', 'Drive Type',
       'Location (City)', 'Location (State)', 'Style', 'Condition (Accidents)',
       'Options Level', 'Price'],
      dtype='object')

In [26]:
# 'Bed Length' is no longer part of cars_dataset, so take it out of the ranking too.
# Dropping by label (rather than slicing off the first entry) keeps this cell
# idempotent: re-running it does not silently discard the next column.
cols_by_nulls = cols_by_nulls.drop(labels='Bed Length', errors='ignore')
# Look the count up by label so the number always belongs to the column named in
# the message; positional [0] on a string-labelled Series is deprecated in pandas.
# NOTE(review): cols_by_nulls was computed before the rows without Make/Model were
# dropped, so this figure describes the unfiltered data.
print("Exterior Color has", cols_by_nulls['Exterior Color'], "null values")

Exterior Color has 5840 null values


In [36]:
# Frequency table of the non-null exterior colours, computed once and reused:
# its total is the number of non-null entries and its length the number of distinct values.
exterior_color_counts = cars_dataset['Exterior Color'].value_counts()
print(exterior_color_counts.sum(), "not null values and values count:", len(exterior_color_counts))
exterior_color_counts

10747 not null values and values count: 569


Black                  775
Oxford White           426
Gray                   398
Summit White           393
White                  371
                      ... 
Ultimate Black           1
Sonora Gold Pearl        1
Black Stone              1
Deep Black Metallic      1
Chromium                 1
Name: Exterior Color, Length: 569, dtype: int64

In [8]:
# Show the dtype pandas inferred for each column.
# NOTE(review): the saved output still lists 'Bed Length' and carries a lower
# execution count than the cells above, so it appears to predate the column drop —
# re-run on a fresh kernel to confirm. In that output Mileage and Price are `object`,
# so they presumably need numeric conversion before modelling — verify.
cars_dataset.dtypes

Make                      object
Model                     object
Year                     float64
Mileage                   object
Transmission              object
Engine                    object
Exterior Color            object
Interior Color            object
MPG                       object
Fuel Type                 object
Drive Type                object
Location (City)           object
Location (State)          object
Style                     object
Condition (Accidents)     object
Options Level             object
Bed Length                object
Price                     object
dtype: object