In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


In [3]:
## creating dataset 
from sklearn.datasets import make_classification

In [None]:
## creating dataset {creating dataset in this way doesn't require a standardization}
X, Y = make_classification(n_samples=1000,n_features=10,n_classes=2,random_state=42)

In [5]:
## train test split
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,train_size=0.30,random_state=42)

In [6]:
## model training
from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression()
logistic.fit(X_train,Y_train)
y_pred = logistic.predict(X_test)

In [7]:
# performance metrices
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
print(accuracy_score(Y_test,y_pred))
print(confusion_matrix(Y_test,y_pred))
print(classification_report(Y_test,y_pred))

0.8571428571428571
[[293  50]
 [ 50 307]]
              precision    recall  f1-score   support

           0       0.85      0.85      0.85       343
           1       0.86      0.86      0.86       357

    accuracy                           0.86       700
   macro avg       0.86      0.86      0.86       700
weighted avg       0.86      0.86      0.86       700



#### Hyperparameter tunning And Cross-Validation


In [9]:
model = LogisticRegression()
penalty = ['l1','l2','elasticnet']
c_values = [100,10,1.0,0.1,0.01]
solver = ['newton-cg','lbfgs','liblinear','sag','saga']

In [11]:
params = dict(penalty=penalty,C=c_values,solver=solver)

In [12]:
## stratifiedKfold
from sklearn.model_selection import StratifiedGroupKFold
cv = StratifiedGroupKFold()

In [18]:
## Grid Search CV
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(estimator=model,param_grid=params,scoring='accuracy',cv=cv,n_jobs=-1)

In [21]:
print(grid)

GridSearchCV(cv=StratifiedGroupKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': [100, 10, 1.0, 0.1, 0.01],
                         'penalty': ['l1', 'l2', 'elasticnet'],
                         'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag',
                                    'saga']},
             scoring='accuracy')


In [26]:
print(X_train.shape)
print(Y_train.shape)

(300, 10)
(300,)


In [27]:
grid.fit(X_train,Y_train)

TypeError: iteration over a 0-d array

In [None]:
grid.best_params_
grid.best_score_

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [None]:
## same as befor
grid.predict(X_test) ## and performance matrix

#### Hyperparameter tunning with a randomizeCV

In [29]:
from sklearn.model_selection import RandomizedSearchCV
model = LogisticRegression()
randomCV = RandomizedSearchCV(estimator=model,param_distributions=params,scoring='accuracy',cv=5)

In [30]:
randomCV.fit(X_train,Y_train)

25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\StartingML\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\StartingML\venv\lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\StartingML\venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1194, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\StartingML\venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 67, in _check_solver
    raise ValueEr

In [31]:
randomCV.best_score_

np.float64(0.8666666666666666)

In [32]:
randomCV.best_params_

{'solver': 'newton-cg', 'penalty': 'l2', 'C': 10}

In [33]:
y_pred = randomCV.predict(X_test)

In [34]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
print(accuracy_score(Y_test,y_pred))
print(confusion_matrix(Y_test,y_pred))
print(classification_report(Y_test,y_pred))

0.8571428571428571
[[293  50]
 [ 50 307]]
              precision    recall  f1-score   support

           0       0.85      0.85      0.85       343
           1       0.86      0.86      0.86       357

    accuracy                           0.86       700
   macro avg       0.86      0.86      0.86       700
weighted avg       0.86      0.86      0.86       700

