In [2]:
import sklearn
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import warnings

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.model_selection import StratifiedKFold

# Silence every Python warning for the rest of the session. This keeps the
# output short, but it also hides scikit-learn's convergence, failed-fit and
# data-conversion warnings, so problems in later cells will not be reported.
warnings.filterwarnings('ignore')

In [43]:
from ucimlrepo import fetch_ucirepo

# Download dataset id 42 (glass identification) from the UCI ML repository.
glass_identification = fetch_ucirepo(id=42)

# Split the download into the feature table and the target table
# (both are pandas DataFrames; the target column is "Type_of_glass").
X, y = glass_identification.data.features, glass_identification.data.targets
  

In [44]:

# Hold out 30% of the rows, stratified on the glass type, with a fixed seed.
# NOTE(review): the cells visible below fit on the full X / y, not on this split.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=24,
    stratify=y["Type_of_glass"],
)


In [51]:


# 5-fold stratified cross-validation, shuffled with a fixed seed so the folds
# are reproducible.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)

lr = LogisticRegression(random_state=24)
pipe = Pipeline([("LR", lr)])

solvers = ["liblinear", "lbfgs", "newton-cg", "newton-cholesky", "sag", "saga"]
c_values = np.linspace(0.0001, 10, 20)

# The liblinear solver cannot fit a multinomial model: scikit-learn raises a
# ValueError for that pairing, GridSearchCV scores the candidate as NaN, and the
# resulting warning is hidden by the global warnings filter. Searching it only
# wastes fits, so the grid is split into two dicts that list valid pairings.
params = [
    {
        "LR__solver": solvers,
        "LR__multi_class": ["ovr"],
        "LR__C": c_values,
    },
    {
        "LR__solver": [s for s in solvers if s != "liblinear"],
        "LR__multi_class": ["multinomial"],
        "LR__C": c_values,
    },
]

gcv = GridSearchCV(pipe, param_grid=params, cv=kfold)
gcv.fit(X, y["Type_of_glass"])
print(gcv.best_params_)
print(gcv.best_score_)




{'LR__C': 5.263205263157895, 'LR__multi_class': 'ovr', 'LR__solver': 'newton-cg'}
0.6499446290143964


In [61]:
# Rebuild a standalone model using the hyperparameters printed by the grid
# search (values copied by hand from gcv.best_params_).
lr_best = LogisticRegression(
    random_state=24,
    C=5.263205263157895,
    multi_class="ovr",
    solver="newton-cg",
)

# Pass the target as a 1-D Series, as the grid search does. Passing the
# one-column DataFrame `y` makes scikit-learn flatten it and emit a
# DataConversionWarning, which the global warnings filter would hide.
lr_best.fit(X, y["Type_of_glass"])

In [53]:
# Load the rows to predict on from a CSV in the working directory.
tst = pd.read_csv("tst_Glass.csv")

In [55]:
# Show the column names of the unlabelled data.
tst.columns

Index(['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe'], dtype='object')

In [57]:
# Show the training feature names; the model expects the same columns at predict time.
X.columns

Index(['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe'], dtype='object')

In [None]:
# We are now going to predict on unlabelled data.
# Inferencing => predicting on unlabelled data
# -> Inferencing runs on the user's device.
# -> Inferencing should take minimal time and use as few computational resources as possible.


In [79]:
# Start inferencing: one row per unlabelled sample, one probability per class.
y_pred_proba = lr_best.predict_proba(tst)

# Label the columns from the fitted model's own class list. predict_proba
# orders its columns like `classes_`, so the headers cannot drift out of sync
# with the data the way a hand-typed label list could.
pd.DataFrame(y_pred_proba, columns=lr_best.classes_)

Unnamed: 0,1,2,3,5,6,7
0,3e-06,0.840255,0.025073,0.000983,1.205383e-05,0.133674
1,0.427895,0.004844,0.016435,3.6e-05,0.3891674,0.161622
2,0.724143,9e-06,0.000263,0.015907,1.164498e-10,0.259677
3,0.308442,1e-05,6e-06,0.258139,8.655231e-15,0.433404
4,0.941478,0.016759,0.008552,0.006424,1.045093e-10,0.026787
5,6.3e-05,0.006345,0.00238,0.020517,2.012212e-09,0.970694


In [83]:
# Hard class labels for the same unlabelled rows (one label per row).
predictions = lr_best.predict(tst)
predictions

array([2, 1, 1, 7, 1, 7], dtype=int64)

In [85]:
# predict_proba returns the predicted probability of each class.
# Regression-type models do not have a predict_proba method.

In [87]:
# _____________________________________________
# A simpler way to do the steps above.
# _____________________________________________

In [93]:
# GridSearchCV refits the best parameter combination on all the data given to
# gcv.fit (refit defaults to True), so the tuned pipeline can be used directly
# without rebuilding the model by hand.
best_model = gcv.best_estimator_
best_model.predict(tst)


array([2, 1, 1, 7, 1, 7], dtype=int64)