In [27]:
import pandas as pd
import io
import requests
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [28]:
# Download the UCI Car Evaluation data and parse it into a DataFrame.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as CSV rows.
response = requests.get(url, timeout=30)
response.raise_for_status()
file = response.content
# Column names are supplied explicitly via `names=`.
data = pd.read_csv(io.StringIO(file.decode('utf-8')), names =["buying","maint","doors","persons","lug_boot","safety", "Class"]  )

The column names passed to `names=` above are taken from the dataset description.

In [29]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,Class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [30]:
# Print the frame summary: index range, per-column non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
buying      1728 non-null object
maint       1728 non-null object
doors       1728 non-null object
persons     1728 non-null object
lug_boot    1728 non-null object
safety      1728 non-null object
Class       1728 non-null object
dtypes: object(7)
memory usage: 94.6+ KB


In [31]:
# List the distinct levels of `buying` in order of first appearance.
pd.unique(data['buying'])

array(['vhigh', 'high', 'med', 'low'], dtype=object)

In [32]:
# One-hot encode `buying`; drop_first=True omits the indicator for the first level.
buying = pd.get_dummies(
    data=data['buying'],
    prefix='buying',
    drop_first=True,
)
# Append the indicator columns to the right of the existing frame.
data = pd.concat(objs=[data, buying], axis='columns')

In [33]:
# List the distinct levels of `maint` in order of first appearance.
pd.unique(data['maint'])

array(['vhigh', 'high', 'med', 'low'], dtype=object)

In [34]:
# One-hot encode `maint`; drop_first=True omits the indicator for the first level.
maint = pd.get_dummies(
    data=data['maint'],
    prefix='maint',
    drop_first=True,
)
# Append the indicator columns to the right of the existing frame.
data = pd.concat(objs=[data, maint], axis='columns')

In [35]:
# List the distinct levels of `doors` in order of first appearance.
pd.unique(data['doors'])

array(['2', '3', '4', '5more'], dtype=object)

In [36]:
# One-hot encode `doors`; drop_first=True omits the indicator for the first level.
doors = pd.get_dummies(
    data=data['doors'],
    prefix='doors',
    drop_first=True,
)
# Append the indicator columns to the right of the existing frame.
data = pd.concat(objs=[data, doors], axis='columns')

In [37]:
# List the distinct levels of `persons` in order of first appearance.
pd.unique(data['persons'])

array(['2', '4', 'more'], dtype=object)

In [38]:
# One-hot encode `persons`; drop_first=True omits the indicator for the first level.
persons = pd.get_dummies(
    data=data['persons'],
    prefix='persons',
    drop_first=True,
)
# Append the indicator columns to the right of the existing frame.
data = pd.concat(objs=[data, persons], axis='columns')

In [39]:
# List the distinct levels of `lug_boot` in order of first appearance.
pd.unique(data['lug_boot'])

array(['small', 'med', 'big'], dtype=object)

In [40]:
# One-hot encode `lug_boot`; drop_first=True omits the indicator for the first level.
lug_boot = pd.get_dummies(
    data=data['lug_boot'],
    prefix='lug_boot',
    drop_first=True,
)
# Append the indicator columns to the right of the existing frame.
data = pd.concat(objs=[data, lug_boot], axis='columns')

In [41]:
# List the distinct levels of `safety` in order of first appearance.
pd.unique(data['safety'])

array(['low', 'med', 'high'], dtype=object)

In [22]:
# One-hot encode `safety`; drop_first=True omits the indicator for the first level.
# NOTE(review): this cell's saved execution count is out of sequence with its
# neighbours and the saved frame previews further down show no safety_* columns,
# so it appears not to have run in the recorded session. Restart the kernel and
# run all cells so the outputs reflect this encoding.
safety = pd.get_dummies(
    data=data['safety'],
    prefix='safety',
    drop_first=True,
)
# Append the indicator columns to the right of the existing frame.
data = pd.concat(objs=[data, safety], axis='columns')

Label encoding the output class

In [42]:
# Replace the string target labels in `Class` with integer codes.
# `le` keeps the fitted encoder so the codes can be mapped back to labels later.
le = LabelEncoder()
encoded_class = le.fit_transform(data['Class'])
data['Class'] = encoded_class

In [44]:
# Show the distinct integer codes now stored in `Class`.
pd.unique(data['Class'])

array([2, 0, 3, 1], dtype=int64)

Deleting columns for which the dummy columns were created

In [45]:
# Drop the raw categorical columns now that their one-hot indicators exist.
# Guard first: if an encoding cell was skipped, dropping its source column would
# silently remove that feature from the model inputs altogether.
encoded_sources = ["buying","maint","doors","persons","lug_boot","safety"]
not_encoded = [
    source for source in encoded_sources
    if not any(str(column).startswith(source + "_") for column in data.columns)
]
assert not not_encoded, (
    f"No dummy columns found for {not_encoded}; run the encoding cells before dropping"
)
data = data.drop(columns=encoded_sources)

In [46]:
# Preview the first five rows of the fully encoded frame.
data.head(n=5)

Unnamed: 0,Class,buying_low,buying_med,buying_vhigh,maint_low,maint_med,maint_vhigh,doors_3,doors_4,doors_5more,persons_4,persons_more,lug_boot_med,lug_boot_small
0,2,0,0,1,0,0,1,0,0,0,0,0,0,1
1,2,0,0,1,0,0,1,0,0,0,0,0,0,1
2,2,0,0,1,0,0,1,0,0,0,0,0,0,1
3,2,0,0,1,0,0,1,0,0,0,0,0,1,0
4,2,0,0,1,0,0,1,0,0,0,0,0,1,0


__The entire dataset has been converted into numerical values__

Now dividing the data into attributes and class (X and y)

In [47]:
# Feature matrix: every column except the encoded target.
X = data.drop(columns=['Class'])
X.head(n=5)

Unnamed: 0,buying_low,buying_med,buying_vhigh,maint_low,maint_med,maint_vhigh,doors_3,doors_4,doors_5more,persons_4,persons_more,lug_boot_med,lug_boot_small
0,0,0,1,0,0,1,0,0,0,0,0,0,1
1,0,0,1,0,0,1,0,0,0,0,0,0,1
2,0,0,1,0,0,1,0,0,0,0,0,0,1
3,0,0,1,0,0,1,0,0,0,0,0,1,0
4,0,0,1,0,0,1,0,0,0,0,0,1,0


In [48]:
# Target vector: the label-encoded `Class` column.
y = data.loc[:, 'Class']
y.head(n=5)

0    2
1    2
2    2
3    2
4    2
Name: Class, dtype: int32

Now splitting the dataset into training and test sets

In [49]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

Penalty options: `l1` = lasso, `l2` = ridge