In [1]:
# Import standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Reading the CSV
col_header = ['price','maintenance','no_doors','persons','lug_boot_size','safety','class_value']
df = pd.read_csv('Original_data\car.data', header=None, names=col_header)
df.head()

Unnamed: 0,price,maintenance,no_doors,persons,lug_boot_size,safety,class_value
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [3]:
# Checking shape
df.shape

(1728, 7)

In [4]:
# Checking for missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   price          1728 non-null   object
 1   maintenance    1728 non-null   object
 2   no_doors       1728 non-null   object
 3   persons        1728 non-null   object
 4   lug_boot_size  1728 non-null   object
 5   safety         1728 non-null   object
 6   class_value    1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [5]:
# Checking for missing values
df.isnull().sum()

price            0
maintenance      0
no_doors         0
persons          0
lug_boot_size    0
safety           0
class_value      0
dtype: int64

In [6]:
# Checking the distribution of data
print(df.price.value_counts())
print(df.maintenance.value_counts())
print(df.no_doors.value_counts())
print(df.persons.value_counts())
print(df.lug_boot_size.value_counts())
print(df.safety.value_counts())
print(df.class_value.value_counts())

high     432
med      432
vhigh    432
low      432
Name: price, dtype: int64
high     432
med      432
vhigh    432
low      432
Name: maintenance, dtype: int64
4        432
3        432
2        432
5more    432
Name: no_doors, dtype: int64
more    576
4       576
2       576
Name: persons, dtype: int64
big      576
small    576
med      576
Name: lug_boot_size, dtype: int64
high    576
med     576
low     576
Name: safety, dtype: int64
unacc    1210
acc       384
good       69
vgood      65
Name: class_value, dtype: int64


In [7]:
# From the question, person is not a feature for the prediction, dropping the column
df.drop('persons', inplace=True, axis=1)
df.head()

Unnamed: 0,price,maintenance,no_doors,lug_boot_size,safety,class_value
0,vhigh,vhigh,2,small,low,unacc
1,vhigh,vhigh,2,small,med,unacc
2,vhigh,vhigh,2,small,high,unacc
3,vhigh,vhigh,2,med,low,unacc
4,vhigh,vhigh,2,med,med,unacc


In [8]:
# Encoding the price data
df.replace({'price':{'vhigh':4, 'high':3, 'med':2, 'low':1}}, inplace=True)
df.head()

Unnamed: 0,price,maintenance,no_doors,lug_boot_size,safety,class_value
0,4,vhigh,2,small,low,unacc
1,4,vhigh,2,small,med,unacc
2,4,vhigh,2,small,high,unacc
3,4,vhigh,2,med,low,unacc
4,4,vhigh,2,med,med,unacc


In [9]:
# Import SKlearn
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

In [10]:
# Splitting data into train and test set 8:2
X = df.drop(['price'], axis = 1)
Y = df.price
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state=2)

enc = OrdinalEncoder()
enc.fit(X_train)

X_train = enc.transform(X_train)
X_test = enc.transform(X_test)

# Reference for what the numbers in the transformed array represent
enc.categories_

[array(['high', 'low', 'med', 'vhigh'], dtype=object),
 array(['2', '3', '4', '5more'], dtype=object),
 array(['big', 'med', 'small'], dtype=object),
 array(['high', 'low', 'med'], dtype=object),
 array(['acc', 'good', 'unacc', 'vgood'], dtype=object)]

In [11]:
print(X_train)

[[0. 2. 1. 0. 2.]
 [2. 1. 1. 1. 2.]
 [2. 2. 0. 0. 0.]
 ...
 [3. 2. 0. 2. 2.]
 [3. 3. 1. 0. 2.]
 [1. 0. 1. 2. 2.]]


In [12]:
print(Y)

0       4
1       4
2       4
3       4
4       4
       ..
1723    1
1724    1
1725    1
1726    1
1727    1
Name: price, Length: 1728, dtype: int64


# Section 5 question

In [13]:
X_test_qn = [['high','4','big','high','good']]
X_test_qn_enc = enc.transform(X_test_qn)
X_test_qn_enc

array([[0., 2., 0., 0., 1.]])

# Log Reg Model

In [14]:
# Log Reg Model
log_reg_model = LogisticRegression()
log_reg_model.fit(X_train,Y_train)
y_pred = log_reg_model.predict(X_test)
accuracy_score(Y_test, y_pred)

0.20520231213872833

In [15]:
log_reg_model.predict(X_test_qn_enc)

array([3], dtype=int64)

# Random Forest

In [16]:
# Random Forest Model
rf = RandomForestClassifier(random_state=64)
rf.fit(X_train, Y_train)
y_pred = rf.predict(X_test)
accuracy_score(Y_test, y_pred)

0.08092485549132948

In [17]:
rf.predict(X_test_qn_enc)

array([2], dtype=int64)

# Support Vector

In [18]:
# SV Model
from sklearn.svm import LinearSVC
lsvc = LinearSVC(random_state=42)
lsvc.fit(X_train, Y_train)
y_pred = lsvc.predict(X_test)
accuracy_score(Y_test, y_pred)

0.21676300578034682

In [19]:
lsvc.predict(X_test_qn_enc)

array([3], dtype=int64)

# Results don't look too good, let's try some Feature Selection

In [20]:
# Reading the CSV
col_header = ['price','maintenance','no_doors','persons','lug_boot_size','safety','class_value']
df_fs = pd.read_csv('Original_data\car.data', header=None, names=col_header)
df_fs.head()

Unnamed: 0,price,maintenance,no_doors,persons,lug_boot_size,safety,class_value
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [21]:
# Encoding the price data
df_fs.replace({'price':{'vhigh':4, 'high':3, 'med':2, 'low':1}}, inplace=True)

# Encoding the maintenance data
df_fs.replace({'maintenance':{'vhigh':4, 'high':3, 'med':2, 'low':1}}, inplace=True)

# Encoding the no_doors data
df_fs.replace({'no_doors':{'5more':5}}, inplace=True)

# Encoding the lug_boot_size data
df_fs.replace({'lug_boot_size':{'big':3, 'med':2, 'small':1}}, inplace=True)

# Encoding the safety data
df_fs.replace({'safety':{'high':3, 'med':2, 'low':1}}, inplace=True)

# Encoding the class_value data
df_fs.replace({'class_value':{'vgood':4, 'good':3, 'acc':2, 'unacc':1}}, inplace=True)

df_fs.head()

Unnamed: 0,price,maintenance,no_doors,persons,lug_boot_size,safety,class_value
0,4,4,2,2,1,1,1
1,4,4,2,2,1,2,1
2,4,4,2,2,1,3,1
3,4,4,2,2,2,1,1
4,4,4,2,2,2,2,1


In [22]:
# Looks like class_value is the better feature for prediction
df_fs.corr()

Unnamed: 0,price,maintenance,lug_boot_size,safety,class_value
price,1.0,0.0,0.0,0.0,-0.28275
maintenance,0.0,1.0,0.0,0.0,-0.232422
lug_boot_size,0.0,0.0,1.0,0.0,0.157932
safety,0.0,0.0,0.0,1.0,0.439337
class_value,-0.28275,-0.232422,0.157932,0.439337,1.0


In [23]:
# Splitting data into train and test set 8:2
X_fs = df_fs.drop(['price', 'maintenance', 'lug_boot_size', 'persons', 'no_doors'], axis=1)
Y = df_fs.price
X_train_fs, X_test_fs, Y_train, Y_test = train_test_split(X_fs, Y, test_size = 0.2, random_state=2)

In [24]:
X_train_fs

Unnamed: 0,safety,class_value
599,3,1
678,1,1
728,3,2
1043,3,2
787,2,1
...,...,...
1558,2,2
1608,1,1
493,2,1
527,3,1


# New test question parameters with selected features

In [25]:
enc.fit(X_train_fs)
X_test_qn_fs = [[3,3]]

# Log Reg Model w FS

In [26]:
# Log Reg Model
log_reg_model = LogisticRegression()
log_reg_model.fit(X_train_fs,Y_train)
y_pred = log_reg_model.predict(X_test_fs)
accuracy_score(Y_test, y_pred)

0.3236994219653179

In [27]:
log_reg_model.predict(X_test_qn_fs)

array([1], dtype=int64)

# Random Forest w FS

In [28]:
# Random Forest Model
rf = RandomForestClassifier(random_state=64)
rf.fit(X_train_fs, Y_train)
y_pred = rf.predict(X_test_fs)
accuracy_score(Y_test, y_pred)

0.315028901734104

In [29]:
rf.predict(X_test_qn_fs)

array([1], dtype=int64)

# Support Vector w FS

In [30]:
# SV Model
from sklearn.svm import LinearSVC
lsvc = LinearSVC(random_state=42)
lsvc.fit(X_train_fs, Y_train)
y_pred = lsvc.predict(X_test_fs)
accuracy_score(Y_test, y_pred)

0.3092485549132948

In [31]:
lsvc.predict(X_test_qn_fs)

array([1], dtype=int64)

# Final Summary: To answer the question...
### By just selecting the class_value and safety feature, accuracy boosted up to 30%-32% for the classifier models

### Each model also predicted LOW price for the parameters set for the question.