In [130]:
from ucimlrepo import fetch_ucirepo

# Download the UCI "Car Evaluation" dataset (repository id 19).
car_evaluation = fetch_ucirepo(id=19)

# Feature and target tables (pandas DataFrames).
X, y = car_evaluation.data.features, car_evaluation.data.targets

# Show the dataset-level metadata, then the per-variable description.
print(car_evaluation.metadata)
print(car_evaluation.variables)


{'uci_id': 19, 'name': 'Car Evaluation', 'repository_url': 'https://archive.ics.uci.edu/dataset/19/car+evaluation', 'data_url': 'https://archive.ics.uci.edu/static/public/19/data.csv', 'abstract': 'Derived from simple hierarchical decision model, this database may be useful for testing constructive induction and structure discovery methods.', 'area': 'Other', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 1728, 'num_features': 6, 'feature_types': ['Categorical'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1988, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5JP48', 'creators': ['Marko Bohanec'], 'intro_paper': {'ID': 249, 'type': 'NATIVE', 'title': 'Knowledge acquisition and explanation for multi-attribute decision making', 'authors': 'M. Bohanec, V. Rajkovič', 'venue': '8th Intl Workshop on Expert Systems and their Applications, 

In [131]:
from ucimlrepo import fetch_ucirepo

# Download the UCI "Glass Identification" dataset (repository id 42).
glass_identification = fetch_ucirepo(id=42)

# Feature and target tables (pandas DataFrames); these rebind X and y.
X, y = glass_identification.data.features, glass_identification.data.targets

# Show the dataset-level metadata, then the per-variable description.
print(glass_identification.metadata)
print(glass_identification.variables)


{'uci_id': 42, 'name': 'Glass Identification', 'repository_url': 'https://archive.ics.uci.edu/dataset/42/glass+identification', 'data_url': 'https://archive.ics.uci.edu/static/public/42/data.csv', 'abstract': 'From USA Forensic Science Service; 6 types of glass; defined in terms of their oxide content (i.e. Na, Fe, K, etc)', 'area': 'Physics and Chemistry', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 214, 'num_features': 9, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['Type_of_glass'], 'index_col': ['Id_number'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1987, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5WW2P', 'creators': ['B. German'], 'intro_paper': None, 'additional_info': {'summary': 'Vina conducted a comparison test of her rule-based system, BEAGLE, the nearest-neighbor algorithm, and discriminant analysis.  BEAGLE is a product available through VRS Consulting, In

In [132]:
import sklearn
import pandas as pd 
import numpy as np 
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold


In [133]:
# Column names, non-null counts and dtypes of the feature frame.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214 entries, 0 to 213
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   RI      214 non-null    float64
 1   Na      214 non-null    float64
 2   Mg      214 non-null    float64
 3   Al      214 non-null    float64
 4   Si      214 non-null    float64
 5   K       214 non-null    float64
 6   Ca      214 non-null    float64
 7   Ba      214 non-null    float64
 8   Fe      214 non-null    float64
dtypes: float64(9)
memory usage: 15.2 KB


In [134]:
# Number of samples per glass type, to see how imbalanced the classes are.
y['Type_of_glass'].value_counts()

Type_of_glass
2    76
1    70
7    29
3    17
5    13
6     9
Name: count, dtype: int64

In [135]:
from sklearn.model_selection import train_test_split

In [136]:
# Hold out 30 % of the rows for testing. Passing the label column to
# `stratify` keeps the class proportions of y_train and y_test close to
# those of the full y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=24,
    stratify=y['Type_of_glass'],
)

In [137]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

In [138]:
# Baseline: LogisticRegression with default hyper-parameters, trained on the
# stratified training split and scored on the held-out test split.
log = LogisticRegression()
# Pass the target as a 1-D Series: fitting on the single-column DataFrame
# makes scikit-learn emit a column-vector warning (silenced by the global
# warnings filter in this notebook).
# NOTE: fit() returns the estimator itself, so X_trn_log is only another
# reference to `log`, not transformed training data.
X_trn_log = log.fit(X_train, y_train['Type_of_glass'])
y_pred = log.predict(X_test)
# Stratification only keeps the class mix of the splits comparable; it does
# not by itself make the model accurate (the recorded run scored about 0.58).
print(accuracy_score(y_test['Type_of_glass'], y_pred))
print(confusion_matrix(y_test['Type_of_glass'], y_pred))

0.5846153846153846
[[16  4  1  0  0  0]
 [10 13  0  0  0  0]
 [ 3  2  0  0  0  0]
 [ 0  1  0  2  0  1]
 [ 0  2  0  0  0  1]
 [ 0  1  0  0  1  7]]


In [139]:
# Share of each glass type in the full target, expressed in percent.
y['Type_of_glass'].value_counts(normalize=True).mul(100)

Type_of_glass
2    35.514019
1    32.710280
7    13.551402
3     7.943925
5     6.074766
6     4.205607
Name: proportion, dtype: float64

In [140]:
# Percentage of each class in y_train and in y_test, to confirm that the
# stratified split kept both close to the full-data proportions.
print("y_train\n", y_train['Type_of_glass'].value_counts(normalize=True).mul(100))
print('------------------------------------')
print("y_test", y_test['Type_of_glass'].value_counts(normalize=True).mul(100))

y_train
 Type_of_glass
2    35.570470
1    32.885906
7    13.422819
3     8.053691
5     6.040268
6     4.026846
Name: proportion, dtype: float64
------------------------------------
y_test Type_of_glass
2    35.384615
1    32.307692
7    13.846154
3     7.692308
5     6.153846
6     4.615385
Name: proportion, dtype: float64


In [141]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [142]:
# Pick the best LogisticRegression hyper-parameters automatically with an
# exhaustive grid search, scored by micro-averaged F1 over 5 stratified folds.
log = LogisticRegression(random_state=24)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)
# Keys are LogisticRegression constructor arguments; every combination of the
# listed values is evaluated.
params = {
    'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
    # Was misspelled 'multinominal'. An invalid option makes every fit for
    # that setting fail, and GridSearchCV (error_score=np.nan by default)
    # records NaN instead of raising, so half of the grid was silently lost.
    # Solver/multi_class pairs the estimator does not support are still
    # scored NaN for the same reason.
    # NOTE(review): recent scikit-learn releases deprecate `multi_class`;
    # confirm against the installed version before relying on it.
    'multi_class': ['ovr', 'multinomial'],
    'C': np.linspace(0.001, 10, 20),
}
gcv = GridSearchCV(log, param_grid=params, cv=kfold, scoring='f1_micro')
# 1-D target Series, as expected by the estimator.
gcv.fit(X, y['Type_of_glass'])
print(gcv.best_params_)
print(gcv.best_score_)

{'C': np.float64(5.263631578947369), 'multi_class': 'ovr', 'solver': 'newton-cholesky'}
0.6499446290143964


In [143]:
# Tabulate the grid-search results: one row per parameter combination tried.
pd_cv = pd.DataFrame(data=gcv.cv_results_)
print(pd_cv.shape)

(240, 16)


# -#####################################################-

In [144]:
# Got the best model along with its parameters

- We are predicting on the unlabelled data: this is known as `Inferencing`

In [145]:
# Rebuild the winning model from the grid-search result instead of copying
# the printed values by hand, so it stays in sync if the search is re-run.
# gcv.best_params_ holds the 'C', 'multi_class' and 'solver' that won.
lr_best = LogisticRegression(random_state=24, **gcv.best_params_)
# Train on all labelled data; the target is passed as a 1-D Series rather
# than a single-column DataFrame to avoid the column-vector warning.
lr_best.fit(X, y['Type_of_glass'])

# Unlabelled Data

In [146]:
# Load the unlabelled samples to score. The file must have the same columns,
# in the same order, as the training features X.
tst = pd.read_csv('tst_Glass.csv')

In [147]:
# Column names of the unlabelled file, to compare against X.columns.
tst.columns

Index(['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe'], dtype='object')

In [148]:
# Column names the model was trained on, to compare against tst.columns.
X.columns

Index(['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe'], dtype='object')

In [149]:
# Predicted class probabilities for the unlabelled samples:
# one row per sample, one column per class.
y_pred_prob = lr_best.predict_proba(X=tst)
print(y_pred_prob.shape)

(6, 6)


In [150]:
# Label the probability columns with the classes the model actually learned.
# predict_proba orders its columns like lr_best.classes_, so taking the names
# from there (as strings, matching the previous hard-coded labels) prevents
# the header from drifting out of sync with the model.
pd_probs = pd.DataFrame(y_pred_prob, columns=lr_best.classes_.astype(str))
pd_probs

Unnamed: 0,1,2,3,5,6,7
0,3e-06,0.8331122,0.047985,0.001158,1.036627e-08,0.117743
1,0.390486,0.002295609,0.01055,2.9e-05,0.449379,0.14726
2,0.728912,4.783441e-07,9.5e-05,0.010101,1.755138e-07,0.260891
3,0.31975,1.041751e-06,3e-06,0.229408,7.29703e-13,0.450838
4,0.943702,0.01397128,0.009163,0.006288,2.52082e-11,0.026876
5,6.3e-05,0.002518133,0.001932,0.017878,3.574885e-09,0.977609


In [151]:
# Predicted glass type (class label) for each unlabelled sample.
prediction = lr_best.predict(tst)
prediction

array([2, 6, 1, 7, 1, 7])

# Simple Way

In [152]:
# GridSearchCV already keeps the winning estimator, refitted on all the data
# passed to gcv.fit (refit=True is the default), so no manual refit is needed.
best_model = gcv.best_estimator_

In [153]:
# Class labels first, then the per-class probabilities, for the same
# unlabelled samples, using the estimator taken from the grid search.
print(best_model.predict(tst), best_model.predict_proba(tst), sep='\n')

[2 6 1 7 1 7]
[[2.70501663e-06 8.33112206e-01 4.79846285e-02 1.15776942e-03
  1.03662734e-08 1.17742680e-01]
 [3.90486231e-01 2.29560900e-03 1.05496233e-02 2.90814564e-05
  4.49379007e-01 1.47260448e-01]
 [7.28912479e-01 4.78344057e-07 9.47628751e-05 1.01007386e-02
  1.75513834e-07 2.60891366e-01]
 [3.19749514e-01 1.04175141e-06 2.86788778e-06 2.29408495e-01
  7.29702980e-13 4.50838081e-01]
 [9.43701788e-01 1.39712824e-02 9.16336853e-03 6.28781228e-03
  2.52082009e-11 2.68757485e-02]
 [6.26326454e-05 2.51813302e-03 1.93183565e-03 1.78784531e-02
  3.57488540e-09 9.77608942e-01]]
