In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [19]:
df = sns.load_dataset('iris') #iris dataset is available in seaborn library
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [20]:
df.species.unique() # This shows it is a multiclass classification since there are 3 categories.

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [21]:
df.isnull().sum()  #to find missing values

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [22]:
# Solving binary classification thus dropping category 'setosa'.
df = df[df['species']!= 'setosa']
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
50,7.0,3.2,4.7,1.4,versicolor
51,6.4,3.2,4.5,1.5,versicolor
52,6.9,3.1,4.9,1.5,versicolor
53,5.5,2.3,4.0,1.3,versicolor
54,6.5,2.8,4.6,1.5,versicolor


In [23]:
df['species'] = df['species'].map({'versicolor':0, 'virginica':1}) # assigning labels 0 & 1

In [24]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
50,7.0,3.2,4.7,1.4,0
51,6.4,3.2,4.5,1.5,0
52,6.9,3.1,4.9,1.5,0
53,5.5,2.3,4.0,1.3,0
54,6.5,2.8,4.6,1.5,0


In [26]:
# Split dataset into independent and dependent features
x = df.iloc[:,:-1]
y =  df.iloc[:,-1] 


In [27]:
x

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
50,7.0,3.2,4.7,1.4
51,6.4,3.2,4.5,1.5
52,6.9,3.1,4.9,1.5
53,5.5,2.3,4.0,1.3
54,6.5,2.8,4.6,1.5
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [28]:
y

50     0
51     0
52     0
53     0
54     0
      ..
145    1
146    1
147    1
148    1
149    1
Name: species, Length: 100, dtype: int64

In [29]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.25, random_state = 42)

In [30]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression() #Build the model

In [31]:
#Hyperparameter tuning using grid search- With this technique, we simply build a model for each possible combination of all of the hyperparameter values provided, evaluating each model, and selecting the architecture which produces the best results.
from sklearn.model_selection import GridSearchCV
parameter = {'penalty':['l1','l2','elasticnet'], 'C':[1,2,3,4,5,6,10,20,30,40,50], 'max_iter':[100,200,300]}

In [34]:
classifier_regressor = GridSearchCV(classifier, param_grid=parameter, scoring ='accuracy',cv=5) # scoring is always 'accuracy' for classification problems
classifier_regressor   #model                                         

GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'C': [1, 2, 3, 4, 5, 6, 10, 20, 30, 40, 50],
                         'max_iter': [100, 200, 300],
                         'penalty': ['l1', 'l2', 'elasticnet']},
             scoring='accuracy')

In [35]:
classifier_regressor.fit(x_train, y_train)

Traceback (most recent call last):
  File "C:\Users\Ansela\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Ansela\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Ansela\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\Ansela\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Ansela\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, 

GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'C': [1, 2, 3, 4, 5, 6, 10, 20, 30, 40, 50],
                         'max_iter': [100, 200, 300],
                         'penalty': ['l1', 'l2', 'elasticnet']},
             scoring='accuracy')

In [36]:
# This indicates GridSearchCV is trained

In [38]:
# To check which parameter is trained
print(classifier_regressor.best_params_) #gives the best combination

{'C': 1, 'max_iter': 100, 'penalty': 'l2'}


In [39]:
print(classifier_regressor.best_score_) #gives the accuracy score for this combination

0.9733333333333334


In [41]:
# Prediction
y_pred = classifier_regressor.predict(x_test)

In [43]:
#Accuracy score
from sklearn.metrics import accuracy_score, classification_report

In [44]:
score = accuracy_score(y_pred, y_test)
print(score)

0.92


In [46]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.93      0.93      0.93        14
           1       0.91      0.91      0.91        11

    accuracy                           0.92        25
   macro avg       0.92      0.92      0.92        25
weighted avg       0.92      0.92      0.92        25



In [47]:
#Also high correlation indicates high accuracy
df.corr()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
sepal_length,1.0,0.553855,0.828479,0.593709,0.494305
sepal_width,0.553855,1.0,0.519802,0.566203,0.30808
petal_length,0.828479,0.519802,1.0,0.823348,0.786424
petal_width,0.593709,0.566203,0.823348,1.0,0.828129
species,0.494305,0.30808,0.786424,0.828129,1.0
