In [60]:
import pandas as pd
import io
import requests
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [3]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data"
file = requests.get(url).content
data = pd.read_csv(io.StringIO(file.decode('utf-8')), names =["buying","maint","doors","persons","lug_boot","safety", "Class"]  )

Putting the appropriate column names from the description

In [4]:
data.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,Class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
buying      1728 non-null object
maint       1728 non-null object
doors       1728 non-null object
persons     1728 non-null object
lug_boot    1728 non-null object
safety      1728 non-null object
Class       1728 non-null object
dtypes: object(7)
memory usage: 94.6+ KB


In [6]:
data['buying'].unique()

array(['vhigh', 'high', 'med', 'low'], dtype=object)

In [7]:
buying = pd.get_dummies(data['buying'], prefix = 'buying', drop_first = True)
data = pd.concat([data,buying], axis =1)

In [8]:
data['maint'].unique()

array(['vhigh', 'high', 'med', 'low'], dtype=object)

In [9]:
maint = pd.get_dummies(data['maint'], prefix = 'maint', drop_first = True)
data = pd.concat([data,maint], axis =1)

In [10]:
data['doors'].unique()

array(['2', '3', '4', '5more'], dtype=object)

In [11]:
doors = pd.get_dummies(data['doors'], prefix = 'doors', drop_first = True)
data = pd.concat([data,doors], axis =1)

In [12]:
data['persons'].unique()

array(['2', '4', 'more'], dtype=object)

In [13]:
persons = pd.get_dummies(data['persons'], prefix = 'persons', drop_first = True)
data = pd.concat([data,persons], axis =1)

In [14]:
data['lug_boot'].unique()

array(['small', 'med', 'big'], dtype=object)

In [15]:
lug_boot = pd.get_dummies(data['lug_boot'], prefix = 'lug_boot', drop_first = True)
data = pd.concat([data,lug_boot], axis =1)

In [16]:
data['safety'].unique()

array(['low', 'med', 'high'], dtype=object)

In [17]:
safety = pd.get_dummies(data['safety'], prefix = 'safety', drop_first = True)
data = pd.concat([data,safety], axis =1)

Label encoding the output class

In [18]:
le = LabelEncoder()
data['Class'] = le.fit_transform(data['Class'])

In [19]:
data['Class'].unique()

array([2, 0, 3, 1], dtype=int64)

Deleting columns for which the dummy columns were created

In [20]:
data = data.drop(["buying","maint","doors","persons","lug_boot","safety"], axis = 1)

In [21]:
data.head()

Unnamed: 0,Class,buying_low,buying_med,buying_vhigh,maint_low,maint_med,maint_vhigh,doors_3,doors_4,doors_5more,persons_4,persons_more,lug_boot_med,lug_boot_small,safety_low,safety_med
0,2,0,0,1,0,0,1,0,0,0,0,0,0,1,1,0
1,2,0,0,1,0,0,1,0,0,0,0,0,0,1,0,1
2,2,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0
3,2,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0
4,2,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1


__The entire dataset has been converted into numerical values__

Now dividing the data into attributes and class ( X and y)

In [22]:
X = data.drop(['Class'], axis = 1)
X.head()

Unnamed: 0,buying_low,buying_med,buying_vhigh,maint_low,maint_med,maint_vhigh,doors_3,doors_4,doors_5more,persons_4,persons_more,lug_boot_med,lug_boot_small,safety_low,safety_med
0,0,0,1,0,0,1,0,0,0,0,0,0,1,1,0
1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,1
2,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0
3,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0
4,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1


In [23]:
y = data['Class']
y.head()

0    2
1    2
2    2
3    2
4    2
Name: Class, dtype: int32

Now splitting the dataset into training and test dataset

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 0, test_size = 0.3)

Penalty - l1 = lasso, l2 = ridge

In [48]:
c_values = list(np.arange(1,100))

In [49]:
param_grid = [
    {
        'C' : c_values,
        'penalty' : ['l1'],
        'solver' : ['liblinear'],
        'multi_class' : ['ovr'],
        'random_state' : [42]
    },
    {
        'C' : c_values,
        'penalty' : ['l2'],
        'solver' : ['liblinear', 'newton-cg', 'lbfgs'],
        'multi_class' : ['ovr'],
        'random_state' : [42]
    }
]

In [50]:
grid = GridSearchCV(LogisticRegression(max_iter = 200),param_grid,cv = 10)

In [51]:
grid.fit(X,y)



GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=200, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid=[{'C': [1, 2, 3, 4, 5, 6, 7...
                          'multi_class': ['ovr'], 'penalty': ['l1'],
                          'random_state': [42], 'solver': ['liblinear']},
                         {'C': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                                15, 16, 17, 18, 19, 20, 21, 22, 2

In [52]:
print (grid.best_params_)

{'C': 55, 'multi_class': 'ovr', 'penalty': 'l1', 'random_state': 42, 'solver': 'liblinear'}


In [53]:
print (grid.best_score_)

0.8385416666666666


{'C': 55, 'multi_class': 'ovr', 'penalty': 'l1', 'random_state': 42, 'solver': 'liblinear'}

In [55]:
reg = LogisticRegression(C=55, multi_class= 'ovr', penalty = 'l1', random_state=42, solver='liblinear', max_iter = 200)

In [57]:
reg.fit(X_train, y_train)

LogisticRegression(C=55, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=200,
                   multi_class='ovr', n_jobs=None, penalty='l1',
                   random_state=42, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [58]:
y_pred = reg.predict(X_test)

In [61]:
cm = confusion_matrix(y_test, y_pred)

In [62]:
print("Confusion matrix: ", cm)
print("Accuracy of the model: " ,accuracy_score(y_test, y_pred))

Confusion matrix:  [[ 89   8  18   0]
 [ 17   6   0   2]
 [ 15   0 348   0]
 [  2   0   0  14]]
Accuracy of the model:  0.8805394990366089
