In [1]:
# Import Necessary Libraries.
import pandas as pd
import numpy as np

In [2]:
# Read The DataSet.
df = pd.read_csv('vehicles.csv')

In [3]:
# Show All Columns.
pd.set_option('display.max_columns', None)

In [4]:
# Removing Unnecessary Column From The DataSet.  
df=df.drop(['region', 'region_url', 'VIN','url','image_url','description','county'], axis=1)

In [5]:
# Removing Missing Value.
df=df.dropna()

In [6]:
# Showing The DataFrame Shape.
df.shape

(79016, 19)

In [7]:
# Showing The Info of The DataFrame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 79016 entries, 31 to 426836
Data columns (total 19 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            79016 non-null  int64  
 1   price         79016 non-null  int64  
 2   year          79016 non-null  float64
 3   manufacturer  79016 non-null  object 
 4   model         79016 non-null  object 
 5   condition     79016 non-null  object 
 6   cylinders     79016 non-null  object 
 7   fuel          79016 non-null  object 
 8   odometer      79016 non-null  float64
 9   title_status  79016 non-null  object 
 10  transmission  79016 non-null  object 
 11  drive         79016 non-null  object 
 12  size          79016 non-null  object 
 13  type          79016 non-null  object 
 14  paint_color   79016 non-null  object 
 15  state         79016 non-null  object 
 16  lat           79016 non-null  float64
 17  long          79016 non-null  float64
 18  posting_date  79016 non-

In [8]:
# Showing The Column-Name in The DataFrame
df.columns

Index(['id', 'price', 'year', 'manufacturer', 'model', 'condition',
       'cylinders', 'fuel', 'odometer', 'title_status', 'transmission',
       'drive', 'size', 'type', 'paint_color', 'state', 'lat', 'long',
       'posting_date'],
      dtype='object')

In [9]:
# Import Necessary Libraries For Using LabelEncoder.
from sklearn import preprocessing
# LabelEncoder Can Be Used to Normalize Labels.
le = preprocessing.LabelEncoder()

In [10]:
# Fit Label Encoder and Return Encoded Labels.
df[['size','manufacturer', 'model', 'condition','cylinders', 'fuel', 'title_status', 'transmission','drive', 'type', 'paint_color', 'state',
       'posting_date']] = df[['size','manufacturer', 'model', 'condition','cylinders', 'fuel', 'title_status', 'transmission','drive','type', 'paint_color', 'state',
       'posting_date']].apply(le.fit_transform)

In [11]:
# Transform Features By Scaling Each Feature to a Given Range.
df["odometer"] = np.sqrt(preprocessing.minmax_scale(df["odometer"]))

In [12]:
# Ensure That There are No Null Values Left in The DataFrame.
df.isnull().sum(axis = 0)

id              0
price           0
year            0
manufacturer    0
model           0
condition       0
cylinders       0
fuel            0
odometer        0
title_status    0
transmission    0
drive           0
size            0
type            0
paint_color     0
state           0
lat             0
long            0
posting_date    0
dtype: int64

In [13]:
# Showing The Info of The DataFrame and Ensure That There are no Object DType Left in The DataFrame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 79016 entries, 31 to 426836
Data columns (total 19 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            79016 non-null  int64  
 1   price         79016 non-null  int64  
 2   year          79016 non-null  float64
 3   manufacturer  79016 non-null  int32  
 4   model         79016 non-null  int32  
 5   condition     79016 non-null  int32  
 6   cylinders     79016 non-null  int32  
 7   fuel          79016 non-null  int32  
 8   odometer      79016 non-null  float64
 9   title_status  79016 non-null  int32  
 10  transmission  79016 non-null  int32  
 11  drive         79016 non-null  int32  
 12  size          79016 non-null  int32  
 13  type          79016 non-null  int32  
 14  paint_color   79016 non-null  int32  
 15  state         79016 non-null  int32  
 16  lat           79016 non-null  float64
 17  long          79016 non-null  float64
 18  posting_date  79016 non-

In [14]:
# Separate Features and Outcome
X = df.drop('type',axis=1).values
y = df.type.values

In [15]:
# Stratified K-Folds Cross-Validator.
# Provides Train/Test Indices To Split Data in Train/Test Sets.
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=2)
skf.get_n_splits(X, y)

for train_index, test_index in skf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

TRAIN: [27649 28033 28223 ... 79013 79014 79015] TEST: [    0     1     2 ... 44998 45841 46455]
TRAIN: [    0     1     2 ... 44998 45841 46455] TEST: [27649 28033 28223 ... 79013 79014 79015]


In [16]:
# Showing The Shape of Train/Test Sets
print (X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(39508, 18) (39508,) (39508, 18) (39508,)


In [17]:
import warnings
warnings.filterwarnings('always') 
# Import The Necessary Libraries For LogisticRegression.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Creating an Instance of The LogisticRegression Class
log_reg = LogisticRegression()

# Fitting The Model on The Training Data
log_reg.fit(X_train, y_train)

# Predicting The Target Variable For The Test Data
y_pred = log_reg.predict(X_test)

# Calculating The Performance Metrics of The Model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
recall = recall_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
f1_score2 = f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))

# Printing The Performance Metrics
print('Logistic Regression Model Performance:')
print('Accuracy:', accuracy)
print ('Accuracy-Score :{:.2f}'.format(accuracy_score(y_test, y_pred)*100))
print('Precision:', precision)
print ('Precision-Score :{:.2f}'.format(precision_score(y_test, y_pred,average='weighted', labels=np.unique(y_pred))*100))
print('Recall:', recall)
print ('Recall-Score :{:.2f}'.format(recall_score(y_test, y_pred,average='weighted', labels=np.unique(y_pred))*100))
print('F1-Score:', f1_score2)
print ('F1-Score :{:.2f}'.format(f1_score(y_test, y_pred,average='weighted', labels=np.unique(y_pred))*100))

Logistic Regression Model Performance:
Accuracy: 0.2755644426445277
Accuracy-Score :27.56
Precision: 0.2755644426445277
Precision-Score :27.56
Recall: 1.0
Recall-Score :100.00
F1-Score: 0.4320666732810795
F1-Score :43.21


In [18]:
import warnings
warnings.filterwarnings('always') 
# Import The Necessary Libraries For DecisionTreeClassifier.
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Creating an Instance of The DecisionTreeClassifier Class
dtc = DecisionTreeClassifier()

# Fitting The Model on The Training Data
dtc.fit(X_train, y_train)

# Predicting The Target Variable For The Test Data
y_pred = dtc.predict(X_test)

# Calculating The Performance Metrics of The Model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
recall = recall_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
f1_score2 = f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))

# Printing The Performance Metrics
print('Decision Tree Classifier:')
print('Accuracy:', accuracy)
print ('Accuracy-Score :{:.2f}'.format(accuracy_score(y_test, y_pred)*100))
print('Precision:', precision)
print ('Precision-Score :{:.2f}'.format(precision_score(y_test, y_pred,average='weighted', labels=np.unique(y_pred))*100))
print('Recall:', recall)
print ('Recall-Score :{:.2f}'.format(recall_score(y_test, y_pred,average='weighted', labels=np.unique(y_pred))*100))
print('F1-Score:', f1_score2)
print ('F1-Score :{:.2f}'.format(f1_score(y_test, y_pred,average='weighted', labels=np.unique(y_pred))*100))


Decision Tree Classifier:
Accuracy: 0.588159360129594
Accuracy-Score :58.82
Precision: 0.6491820024245857
Precision-Score :64.92
Recall: 0.588159360129594
Recall-Score :58.82
F1-Score: 0.6051089264394816
F1-Score :60.51


In [19]:
import warnings
warnings.filterwarnings('always') 
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Creating an Instance of The GaussianNB Class
nb = GaussianNB()

# fitting the model on the training data
nb.fit(X_train, y_train)

# predicting the target variable for the test data
y_pred = nb.predict(X_test)

# Calculating The Performance Metrics of The Model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
recall = recall_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
f1_score2 = f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))

# Printing The Performance Metrics
print('Gaussian NB Class:')
print('Accuracy:', accuracy)
print ('Accuracy-Score :{:.2f}'.format(accuracy_score(y_test, y_pred)*100))
print('Precision:', precision)
print ('Precision-Score :{:.2f}'.format(precision_score(y_test, y_pred,average='weighted', labels=np.unique(y_pred))*100))
print('Recall:', recall)
print ('Recall-Score :{:.2f}'.format(recall_score(y_test, y_pred,average='weighted', labels=np.unique(y_pred))*100))
print('F1-Score:', f1_score2)
print ('F1-Score :{:.2f}'.format(f1_score(y_test, y_pred,average='weighted', labels=np.unique(y_pred))*100))

Gaussian NB Class:
Accuracy: 0.2878910600384732
Accuracy-Score :28.79
Precision: 0.2482357797979103
Precision-Score :24.82
Recall: 0.36620625261598894
Recall-Score :36.62
F1-Score: 0.21706815016076209
F1-Score :21.71


: 