# <font color = 'orange'> Logistic Regression With Hyperparameter Tuning

---

### <font color = 'Blue'> Create a dataset using make classification

In [1]:
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# define dataset
X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, n_redundant=5, n_classes=2, random_state=1)

---

### <font color = 'Blue'> 1. Train Test Split

In [2]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

---

### <font color = 'Blue'> 2.1 Model Training With Logistic Regression And GridSearch Cross Validation.  
      
* Here, Our aim is to get the best trained logistic regression model by selecting best parameters of logistic regression using hyperparameter tuning.
* Hyperparameter tuning is used to find the best value of the logistic regression parameters.

### Hyperparameter tuning

In [3]:
from sklearn.model_selection import GridSearchCV
from warnings import filterwarnings

filterwarnings('ignore')

In [4]:
classifier = LogisticRegression()

In [5]:
# For learning let's will consider 2 parameter such as penalty and C parameter of logistic regression and understand the concept and play with it
# when we have a fixed values then we will use tuple 
parameters = {'penalty':('l1','l2','elasticnet'),'C': [1,10,20,30]}

In [6]:
# here internally we can apply KFold cross validation with 'cv' parameter
# here GridSearchCV helps us to select best parameter of logistic regression classifier

clf = GridSearchCV(classifier,param_grid = parameters,cv = 5)

In [7]:
# splitting of training data to train and validation
# and model training
clf.fit(x_train,y_train)

GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'C': [1, 10, 20, 30],
                         'penalty': ('l1', 'l2', 'elasticnet')})

In [8]:
clf.best_params_

{'C': 1, 'penalty': 'l2'}

In [9]:
clf.best_score_

0.8087500000000001

#### <font color = '#AA00FF'> Observation :
* Here,we can see that GridSearchCV gave us the best values of c and penalty parameter. Tells that those values of parameters were giving best accuracy for the logistic regression model i.e 80%.
* Next by using this parameter we will train our model.

### Logistic regression model training after selecting best parameters using hyperparameter tuning.

In [10]:
classifier = LogisticRegression(C = 1,penalty = 'l2')

In [11]:
# training
classifier.fit(x_train,y_train)

LogisticRegression(C=1)

In [12]:
# prediction
y_pred = classifier.predict(x_test)

y_pred

array([0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1,
       0, 1])

In [13]:
# tells with how much probability the value is selected as 0 in above first value
# in below first list probability of getting 0 is 71% so 0 is selected above in prediction
# Here, threshold 
classifier.predict_proba(x_test)
# prediction probaility

array([[0.71649202, 0.28350798],
       [0.19508969, 0.80491031],
       [0.12418141, 0.87581859],
       [0.05045906, 0.94954094],
       [0.88775659, 0.11224341],
       [0.75067497, 0.24932503],
       [0.97980488, 0.02019512],
       [0.3921745 , 0.6078255 ],
       [0.59920135, 0.40079865],
       [0.39295203, 0.60704797],
       [0.20428696, 0.79571304],
       [0.80257879, 0.19742121],
       [0.86422932, 0.13577068],
       [0.92665682, 0.07334318],
       [0.00131743, 0.99868257],
       [0.04171096, 0.95828904],
       [0.56288536, 0.43711464],
       [0.89322764, 0.10677236],
       [0.29278211, 0.70721789],
       [0.00870994, 0.99129006],
       [0.71879454, 0.28120546],
       [0.5108267 , 0.4891733 ],
       [0.76230298, 0.23769702],
       [0.73170811, 0.26829189],
       [0.10155737, 0.89844263],
       [0.04046512, 0.95953488],
       [0.57926768, 0.42073232],
       [0.00526468, 0.99473532],
       [0.03101648, 0.96898352],
       [0.96093035, 0.03906965],
       [0.

In [14]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

print(confusion_matrix(y_test,y_pred))
print()
print(accuracy_score(y_test,y_pred))
print()
print(classification_report(y_test,y_pred))

[[78 13]
 [29 80]]

0.79

              precision    recall  f1-score   support

           0       0.73      0.86      0.79        91
           1       0.86      0.73      0.79       109

    accuracy                           0.79       200
   macro avg       0.79      0.80      0.79       200
weighted avg       0.80      0.79      0.79       200



#### <font color = '#AA00FF'> Observation :
* we got 79% accuracy after selecting best parameters.

---

### <font color = 'Blue'> 2.2 Model Training With Logistic Regression And RandomizedSearch Cross Validation.  

In [15]:
from sklearn.model_selection import RandomizedSearchCV

In [16]:
# we are find best parameters for LogisticRegression not for specific problem
random_clf = RandomizedSearchCV(LogisticRegression(),param_distributions = parameters,cv = 5, n_iter = 20)

In [17]:
# model training
random_clf.fit(x_train,y_train)

RandomizedSearchCV(cv=5, estimator=LogisticRegression(), n_iter=20,
                   param_distributions={'C': [1, 10, 20, 30],
                                        'penalty': ('l1', 'l2', 'elasticnet')})

In [18]:
# let's check the best parameter and trian the model
random_clf.best_params_

{'penalty': 'l2', 'C': 1}

In [19]:
# model creation
classifier = LogisticRegression(penalty = 'l2',C = 1)

In [20]:
# model training
classifier.fit(x_train,y_train)

LogisticRegression(C=1)

In [21]:
# prediction
y_perd = classifier.predict(x_test)

In [22]:
y_pred

array([0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1,
       0, 1])

In [23]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

print(confusion_matrix(y_test,y_pred))
print()
print(accuracy_score(y_test,y_pred))
print()
print(classification_report(y_test,y_pred))

[[78 13]
 [29 80]]

0.79

              precision    recall  f1-score   support

           0       0.73      0.86      0.79        91
           1       0.86      0.73      0.79       109

    accuracy                           0.79       200
   macro avg       0.79      0.80      0.79       200
weighted avg       0.80      0.79      0.79       200



#### <font color = '#AA00FF'> Observation :
* we got 79% accuracy after selecting best parameters using RandomizedSearchCV.

---

### <font color = 'Blue'> Internal assignment: 
    Use iris dataset and train the model with GridSearchCV. 

In [24]:
import pandas as pd 
import numpy as np

In [25]:
df = pd.read_csv('iris.csv')

df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [26]:
# binary classification
df_copy = df[df['target']!= 2]

df_copy

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
95,5.7,3.0,4.2,1.2,1
96,5.7,2.9,4.2,1.3,1
97,6.2,2.9,4.3,1.3,1
98,5.1,2.5,3.0,1.1,1


### Hyperparameter tuning

In [27]:
# dividing data into dependent and independent feature

x = df_copy.iloc[:,:-1]
y = df_copy.iloc[:,-1]

In [28]:
# train test split of independent and dependent data
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state = 42)

In [29]:
# hyperparater tuning
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

parameters = {'penalty':('l1','l2','elasticnet'),'C': [1,10,20,30],'solver':('lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga')}
grid_search_cv = GridSearchCV(LogisticRegression(),param_grid = parameters ,cv = 5)

In [30]:
grid_search_cv.fit(x_train,y_train)

GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'C': [1, 10, 20, 30],
                         'penalty': ('l1', 'l2', 'elasticnet'),
                         'solver': ('lbfgs', 'liblinear', 'newton-cg',
                                    'newton-cholesky', 'sag', 'saga')})

In [31]:
grid_search_cv.best_params_

{'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}

### Logistic Regression with best parameters

In [32]:
# model creation
classifier = LogisticRegression()

In [33]:
# model training
classifier.fit(x_train,y_train)

LogisticRegression()

In [34]:
# prediction
y_pred = classifier.predict(x_test)

y_pred

array([1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0],
      dtype=int64)

In [35]:
# calculation of accuracy , confusion matrix and classification report 
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

print(confusion_matrix(y_test,y_pred))
print()
print(accuracy_score(y_test,y_pred))
print()
print(classification_report(y_test,y_pred))

[[12  0]
 [ 0  8]]

1.0

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         8

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20



#### <font color = '#AA00FF'> Observation :
* we got 100% accuracy which is very good after selecting best parameters using GridSearchCV.

---