In [1]:
import sklearn
import pandas as pd 
import numpy as np 
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold

In [23]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
car_evaluation = fetch_ucirepo(id=19) 
  
# data (as pandas dataframes) 
X = car_evaluation.data.features 
y = car_evaluation.data.targets 
  
# metadata 
print(car_evaluation.metadata) 
  
# variable information 
print(car_evaluation.variables) 


{'uci_id': 19, 'name': 'Car Evaluation', 'repository_url': 'https://archive.ics.uci.edu/dataset/19/car+evaluation', 'data_url': 'https://archive.ics.uci.edu/static/public/19/data.csv', 'abstract': 'Derived from simple hierarchical decision model, this database may be useful for testing constructive induction and structure discovery methods.', 'area': 'Other', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 1728, 'num_features': 6, 'feature_types': ['Categorical'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1988, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5JP48', 'creators': ['Marko Bohanec'], 'intro_paper': {'ID': 249, 'type': 'NATIVE', 'title': 'Knowledge acquisition and explanation for multi-attribute decision making', 'authors': 'M. Bohanec, V. Rajkovič', 'venue': '8th Intl Workshop on Expert Systems and their Applications, 

# DataSet car_evaluation

In [24]:
import sklearn
import pandas as pd 
import numpy as np 
import warnings
warnings.filterwarnings('ignore')

In [25]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1728 non-null   object
 1   maint     1728 non-null   object
 2   doors     1728 non-null   object
 3   persons   1728 non-null   object
 4   lug_boot  1728 non-null   object
 5   safety    1728 non-null   object
dtypes: object(6)
memory usage: 81.1+ KB


In [26]:
y['class'].value_counts()

class
unacc    1210
acc       384
good       69
vgood      65
Name: count, dtype: int64

In [27]:
y['class'] = np.where(y['class']=='unacc',0,1)
y['class'].value_counts()

class
0    1210
1     518
Name: count, dtype: int64

In [28]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=24,test_size=0.3,stratify=y['class'])
# in `train_test_split` use `statify='classname'` to make near about y_test, y_train to y

In [30]:
ohe = OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore').set_output(transform='pandas')
X_trn_ohe = ohe.fit_transform(X_train)
X_tst_ohe = ohe.transform(X_test)

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

In [32]:
log = LogisticRegression()
X_trn_log = log.fit(X_trn_ohe,y_train)
y_pred = log.predict(X_tst_ohe)
print(accuracy_score(y_test['class'],y_pred))     # Bcuz of using 'statify' the accuray score will increase 94% to 96%
print(confusion_matrix(y_test['class'],y_pred))

0.9633911368015414
[[355   8]
 [ 11 145]]


In [33]:
# To find parcentage of [y] : unacc, acc, good, verygood
y['class'].value_counts(normalize=True)*100

class
0    70.023148
1    29.976852
Name: proportion, dtype: float64

In [34]:
# To find parcentage of [ y_train, y_test]
print("y_train\n",y_train['class'].value_counts(normalize=True)*100)
print('------------------------------------')
print("y_test",y_test['class'].value_counts(normalize=True)*100)

y_train
 class
0    70.057899
1    29.942101
Name: proportion, dtype: float64
------------------------------------
y_test class
0    69.942197
1    30.057803
Name: proportion, dtype: float64


- Sample should represent the correctly
- in `train_test_split` use `statify='classname'` to make near about y_test, y_train to y

In [35]:
# percentage of train & test

In [36]:
# train
1210*0.7
1210*70.02/100
1210-847

363

In [37]:
# test
518*0.7
518*29.94/100
518-155

363

# Parameters:
**penalty{‘l1’, ‘l2’, ‘elasticnet’, None}, default=’l2’ :**
- Specify the norm of the penalty:
    1. None: no penalty is added;
    2. 'l2': add a L2 penalty term and it is the default choice;
    3. 'l1': add a L1 penalty term;
    4. 'elasticnet': both L1 and L2 penalty terms are added.

- **Multi-Class:**
    1. Multinominal : (also know an softmax)
    2. OVR (One v/s Rest of All)

In [40]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [41]:
# multi_class='multinomial'

ohe = OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore')
log = LogisticRegression(multi_class='multinomial')    # write multi_class='multinomial' bcuz if it not written it takes OVE (one v/s Rest of All) 
pipe = Pipeline([('OHE',ohe),('LOG',log)])
# they are parameters of Logistic Regression class so used 
params = {
    'LOG__solver':['lbfgs','liblinear','newton-cg','newton-cholesky','sag','saga']
}
gcv = GridSearchCV(pipe, param_grid=params)
gcv.fit(X,y)
print(gcv.best_params_)
print(gcv.best_score_)

{'LOG__solver': 'lbfgs'}
0.8303409566892854


In [42]:
# multi_class='ovr'
ohe = OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore')
log = LogisticRegression(multi_class='ovr')    # write multi_class='multinomial' bcuz if it not written it takes OVE (one v/s Rest of All) 
pipe = Pipeline([('OHE',ohe),('LOG',log)])
# they are parameters of Logistic Regression class so used 
params = {
    'LOG__solver':['lbfgs','liblinear','newton-cg','newton-cholesky','sag','saga']
}
gcv = GridSearchCV(pipe, param_grid=params)
gcv.fit(X,y)
print(gcv.best_params_)
print(gcv.best_score_)

{'LOG__solver': 'newton-cg'}
0.829761246544358


In [45]:
# Automatically Select best 
ohe = OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore')
log = LogisticRegression(random_state=24)    # write multi_class='multinomial' bcuz if it not written it takes OVE (one v/s Rest of All) 
kfold = StratifiedKFold(n_splits=5, random_state=24, shuffle=True)
pipe = Pipeline([('OHE',ohe),('LOG',log)])
# they are parameters of Logistic Regression class so used 
params = {
    'LOG__solver':['lbfgs','liblinear','newton-cg','newton-cholesky','sag','saga'],
    'LOG__multi_class':['ovr','multinominal'],
    'LOG__C':np.linspace(0.001,10,20)
}
gcv = GridSearchCV(pipe, param_grid=params,cv=kfold)
gcv.fit(X,y)
print(gcv.best_params_)
print(gcv.best_score_)

{'LOG__C': np.float64(2.6323157894736844), 'LOG__multi_class': 'ovr', 'LOG__solver': 'lbfgs'}
0.9502370779927956


In [46]:
pd_cv = pd.DataFrame(gcv.cv_results_)
print(pd_cv.shape)

(240, 16)
