In [1]:
# Import Necessary Libraries.
import pandas as pd
import numpy as np

In [2]:
# Read The DataSet.
df = pd.read_csv('vehicles.csv')

In [3]:
# Show All Columns.
pd.set_option('display.max_columns', None)

In [4]:
# Removing Unnecessary Column From The DataSet.  
df=df.drop(['id','region', 'region_url', 'VIN','url','image_url','description','county'], axis=1)

In [8]:
# Removing Missing Value.
df=df.dropna()

In [9]:
# Showing The DataFrame Shape.
df.shape

(79016, 18)

In [10]:
# Showing The Info of The DataFrame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 79016 entries, 31 to 426836
Data columns (total 18 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         79016 non-null  int64  
 1   year          79016 non-null  float64
 2   manufacturer  79016 non-null  object 
 3   model         79016 non-null  object 
 4   condition     79016 non-null  object 
 5   cylinders     79016 non-null  object 
 6   fuel          79016 non-null  object 
 7   odometer      79016 non-null  float64
 8   title_status  79016 non-null  object 
 9   transmission  79016 non-null  object 
 10  drive         79016 non-null  object 
 11  size          79016 non-null  object 
 12  type          79016 non-null  object 
 13  paint_color   79016 non-null  object 
 14  state         79016 non-null  object 
 15  lat           79016 non-null  float64
 16  long          79016 non-null  float64
 17  posting_date  79016 non-null  object 
dtypes: float64(4), int64(1),

In [11]:
# Showing The Describe of The DataSet
df.describe()

Unnamed: 0,price,year,odometer,lat,long
count,79016.0,79016.0,79016.0,79016.0,79016.0
mean,79589.72,2008.474575,124454.1,38.434362,-92.646525
std,13891460.0,9.995538,243161.5,5.653717,17.60246
min,0.0,1900.0,0.0,-81.838232,-159.7199
25%,4950.0,2006.0,73000.0,34.9453,-104.8994
50%,9000.0,2011.0,114000.0,39.3361,-86.65828
75%,17500.0,2014.0,155414.0,42.278811,-80.081908
max,3736929000.0,2022.0,10000000.0,82.252826,139.6917


In [12]:
# Showing If There Any Duplicated Row
sum(df.duplicated())

179

In [48]:
# Delete The Duplicated Row in The DataSet.
df.drop_duplicates(inplace=True)

In [49]:
# Showing The Column-Name in The DataFrame
df.columns

Index(['price', 'year', 'manufacturer', 'model', 'condition', 'cylinders',
       'fuel', 'odometer', 'title_status', 'transmission', 'drive', 'size',
       'type', 'paint_color', 'state', 'lat', 'long', 'posting_date'],
      dtype='object')

In [50]:
# Import Necessary Libraries For Using LabelEncoder.
from sklearn import preprocessing
# LabelEncoder Can Be Used to Normalize Labels.
le = preprocessing.LabelEncoder()

In [51]:
# Fit Label Encoder and Return Encoded Labels.
df[['size','manufacturer', 'model', 'condition','cylinders', 'fuel', 'title_status', 'transmission','drive', 'type', 'paint_color', 'state',
       'posting_date']] = df[['size','manufacturer', 'model', 'condition','cylinders', 'fuel', 'title_status', 'transmission','drive','type', 'paint_color', 'state',
       'posting_date']].apply(le.fit_transform)

In [52]:
# Transform Features By Scaling Each Feature to a Given Range.
df["odometer"] = np.sqrt(preprocessing.minmax_scale(df["odometer"]))

In [53]:
# Ensure That There are No Null Values Left in The DataFrame.
df.isnull().sum(axis = 0)

price           0
year            0
manufacturer    0
model           0
condition       0
cylinders       0
fuel            0
odometer        0
title_status    0
transmission    0
drive           0
size            0
type            0
paint_color     0
state           0
lat             0
long            0
posting_date    0
dtype: int64

In [54]:
# Showing The Info of The DataFrame and Ensure That There are no Object DType Left in The DataFrame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 78837 entries, 31 to 426836
Data columns (total 18 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         78837 non-null  int64  
 1   year          78837 non-null  float64
 2   manufacturer  78837 non-null  int64  
 3   model         78837 non-null  int64  
 4   condition     78837 non-null  int64  
 5   cylinders     78837 non-null  int64  
 6   fuel          78837 non-null  int64  
 7   odometer      78837 non-null  float64
 8   title_status  78837 non-null  int64  
 9   transmission  78837 non-null  int64  
 10  drive         78837 non-null  int64  
 11  size          78837 non-null  int64  
 12  type          78837 non-null  int64  
 13  paint_color   78837 non-null  int64  
 14  state         78837 non-null  int64  
 15  lat           78837 non-null  float64
 16  long          78837 non-null  float64
 17  posting_date  78837 non-null  int64  
dtypes: float64(4), int64(14)

In [55]:
# Separate Features and Outcome
X = df.drop('type',axis=1).values
y = df.type.values

In [56]:
# Stratified K-Folds Cross-Validator.
# Provides Train/Test Indices To Split Data in Train/Test Sets.
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=2)
skf.get_n_splits(X, y)

for train_index, test_index in skf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

TRAIN: [26986 27614 27995 ... 78834 78835 78836] TEST: [    0     1     2 ... 44896 45735 46349]
TRAIN: [    0     1     2 ... 44896 45735 46349] TEST: [26986 27614 27995 ... 78834 78835 78836]


In [57]:
# Showing The Shape of Train/Test Sets
print (X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(39419, 17) (39419,) (39418, 17) (39418,)


In [58]:
import warnings
warnings.filterwarnings('always') 
# Import The Necessary Libraries For LogisticRegression.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Creating an Instance of The LogisticRegression Class
log_reg = LogisticRegression(solver='lbfgs', max_iter=10000)

# Fitting The Model on The Training Data
log_reg.fit(X_train, y_train)

# Predicting The Target Variable For The Test Data
y_pred = log_reg.predict(X_test)

# Calculating The Performance Metrics of The Model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
recall = recall_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
f1_score2 = f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))

# Printing The Performance Metrics
print('Logistic Regression Model Performance:')
print('Accuracy:', accuracy)
print ('Accuracy-Score :{:.2f}'.format(accuracy_score(y_test, y_pred)*100))
print('Precision:', precision)
print ('Precision-Score :{:.2f}'.format(precision_score(y_test, y_pred,average='weighted', labels=np.unique(y_pred))*100))
print('Recall:', recall)
print ('Recall-Score :{:.2f}'.format(recall_score(y_test, y_pred,average='weighted', labels=np.unique(y_pred))*100))
print('F1-Score:', f1_score2)
print ('F1-Score :{:.2f}'.format(f1_score(y_test, y_pred,average='weighted', labels=np.unique(y_pred))*100))

Logistic Regression Model Performance:
Accuracy: 0.30734689735653764
Accuracy-Score :30.73
Precision: 0.4243332572378631
Precision-Score :42.43
Recall: 0.39094517409403334
Recall-Score :39.09
F1-Score: 0.27602348273663097
F1-Score :27.60


In [59]:
import warnings
warnings.filterwarnings('always') 
# Import The Necessary Libraries For DecisionTreeClassifier.
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Creating an Instance of The DecisionTreeClassifier Class
dtc = DecisionTreeClassifier()

# Fitting The Model on The Training Data
dtc.fit(X_train, y_train)

# Predicting The Target Variable For The Test Data
y_pred = dtc.predict(X_test)

# Calculating The Performance Metrics of The Model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
recall = recall_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
f1_score2 = f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))

# Printing The Performance Metrics
print('Decision Tree Classifier:')
print('Accuracy:', accuracy)
print ('Accuracy-Score :{:.2f}'.format(accuracy_score(y_test, y_pred)*100))
print('Precision:', precision)
print ('Precision-Score :{:.2f}'.format(precision_score(y_test, y_pred,average='weighted', labels=np.unique(y_pred))*100))
print('Recall:', recall)
print ('Recall-Score :{:.2f}'.format(recall_score(y_test, y_pred,average='weighted', labels=np.unique(y_pred))*100))
print('F1-Score:', f1_score2)
print ('F1-Score :{:.2f}'.format(f1_score(y_test, y_pred,average='weighted', labels=np.unique(y_pred))*100))


Decision Tree Classifier:
Accuracy: 0.5863564868841646
Accuracy-Score :58.64
Precision: 0.6402839277942317
Precision-Score :64.03
Recall: 0.5863564868841646
Recall-Score :58.64
F1-Score: 0.5990416931440571
F1-Score :59.90


In [63]:
import warnings
warnings.filterwarnings('always') 
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Creating an Instance of The GaussianNB Class
nb = GaussianNB()

# fitting the model on the training data
nb.fit(X_train, y_train)

# predicting the target variable for the test data
y_pred = nb.predict(X_test)

# Calculating The Performance Metrics of The Model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
recall = recall_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
f1_score2 = f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))

# Printing The Performance Metrics
print('Gaussian NB Class:')
print('Accuracy:', accuracy)
print ('Accuracy-Score :{:.2f}'.format(accuracy_score(y_test, y_pred)*100))
print('Precision:', precision)
print ('Precision-Score :{:.2f}'.format(precision_score(y_test, y_pred,average='weighted', labels=np.unique(y_pred))*100))
print('Recall:', recall)
print ('Recall-Score :{:.2f}'.format(recall_score(y_test, y_pred,average='weighted', labels=np.unique(y_pred))*100))
print('F1-Score:', f1_score2)
print ('F1-Score :{:.2f}'.format(f1_score(y_test, y_pred,average='weighted', labels=np.unique(y_pred))*100))

Gaussian NB Class:
Accuracy: 0.2945862296412806
Accuracy-Score :29.46
Precision: 0.2564367373920003
Precision-Score :25.64
Recall: 0.37471360805447096
Recall-Score :37.47
F1-Score: 0.22145014455784193
F1-Score :22.15
