# Logistic Regression Model

In [1]:
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
data = pd.read_csv('datasets/magic04.data', 
                   names=['fLength', 'fWidth', 'fSize', 'fConc', 'fConc1', 'fAsym', 'fM3Long', 'fM3Trans', 'fAlpha', 'fDist', 'class'])

data.head()

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,g
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,g
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,g
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,g
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,g


In [3]:
data['class'] = (data['class'] == 'g').astype(int)
data['class'].unique()

array([1, 0])

In [4]:
X = data.drop('class', axis=1)
y = data['class']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = X_scaled
X.shape

(19020, 10)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

##### Hyperparameter tuning

In [6]:
lg_model = LogisticRegression()

param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 0.5, 1.0, 2.0, 5.0],
    'solver': ['liblinear', 'lbfgs', 'saga']
}

grid_search = GridSearchCV(lg_model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

25 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "A:\Applications\Anaconda\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "A:\Applications\Anaconda\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "A:\Applications\Anaconda\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

 0.79054909        nan 0.790

In [7]:
best_params = grid_search.best_params_
best_logreg = grid_search.best_estimator_
print(best_params, '\n', best_logreg)

accuracy = best_logreg.score(X_test, y_test)
print('Accuracy - ', accuracy)

{'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'} 
 LogisticRegression(C=0.1, solver='liblinear')
Accuracy -  0.7891692954784437


##### Train/Eval Function

In [8]:
def train_eval(X, y, iter_num:int):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=iter_num)
    
    lg_model = LogisticRegression(C=0.1, penalty='l2', solver='liblinear')
    lg_model.fit(X_train, y_train)
    
    y_pred = lg_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    
    return accuracy

In [9]:
acc = -1

for i in range(0, 25_000):
    best_acc = train_eval(X, y, i)
    
    if best_acc > acc:
        acc = best_acc
        print(f'Accurarcy {round(acc, 5)} Iter {i}')
        
    if i%1000 == 0:
        print(f'----------------------------------------------------------------------------------------> Iter {i}')

Accurarcy 0.78917 Iter 0
----------------------------------------------------------------------------------------> Iter 0
Accurarcy 0.7897 Iter 2
Accurarcy 0.79338 Iter 3
Accurarcy 0.79679 Iter 6
Accurarcy 0.79784 Iter 18
Accurarcy 0.79995 Iter 19
Accurarcy 0.80494 Iter 35
Accurarcy 0.80783 Iter 53
Accurarcy 0.81178 Iter 101
----------------------------------------------------------------------------------------> Iter 1000
----------------------------------------------------------------------------------------> Iter 2000
----------------------------------------------------------------------------------------> Iter 3000
----------------------------------------------------------------------------------------> Iter 4000
----------------------------------------------------------------------------------------> Iter 5000
----------------------------------------------------------------------------------------> Iter 6000
-------------------------------------------------------------------------