## # Logistic Regression - Iris Dataset, Random Dataset

### Step-01: Import all the libraries required to perfom logistic regression

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

### Step-02: Import load_iris from sklearn.datasets

In [2]:
from sklearn.datasets import load_iris

In [3]:
dataset = load_iris()

In [4]:
dataset.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [5]:
print(dataset.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [6]:
df = pd.DataFrame(dataset.data, columns = dataset.feature_names)

In [7]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [8]:
df["target"] = dataset.target

In [9]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [10]:
df = df[df["target"] != 2]

In [11]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [12]:
x = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [13]:
x.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [14]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int64

### Step-03: Import train_test_split from sklearn.model_selection

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

In [17]:
print(x_train.head())
print(x_train.shape)

    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
55                5.7               2.8                4.5               1.3
88                5.6               3.0                4.1               1.3
26                5.0               3.4                1.6               0.4
42                4.4               3.2                1.3               0.2
69                5.6               2.5                3.9               1.1
(80, 4)


In [18]:
print(x_test.head())
print(x_test.shape)

    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
83                6.0               2.7                5.1               1.6
53                5.5               2.3                4.0               1.3
70                5.9               3.2                4.8               1.8
45                4.8               3.0                1.4               0.3
44                5.1               3.8                1.9               0.4
(20, 4)


In [19]:
print(y_train.head())
print(y_train.shape)

55    1
88    1
26    0
42    0
69    1
Name: target, dtype: int64
(80,)


In [20]:
print(y_test.head())
print(y_test.shape)

83    1
53    1
70    1
45    0
44    0
Name: target, dtype: int64
(20,)


### Step-04: Import LogisticRegression from sklearn.linear_model

In [21]:
from sklearn.linear_model import LogisticRegression

In [22]:
classifier = LogisticRegression(max_iter = 200)

In [23]:
classifier

In [24]:
classifier.fit(x_train, y_train)

In [25]:
y_pred = classifier.predict(x_test)

In [26]:
y_pred

array([1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0])

### Step-05: Import confusion_matrix, accuracy_score, classification_report from sklearn.metrics

In [27]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [28]:
print("Confusion Matrix: \n {}".format(confusion_matrix(y_test, y_pred)))
print("Accuracy Score: \n {}".format(accuracy_score(y_test, y_pred)))
print("Classification Report: \n {}".format(classification_report(y_test, y_pred)))

Confusion Matrix: 
 [[12  0]
 [ 0  8]]
Accuracy Score: 
 1.0
Classification Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         8

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20



### Step-06: Import KFold form sklearn.model_selection

In [29]:
from sklearn.model_selection import KFold

In [30]:
cv = KFold(n_splits = 5, shuffle = True)

In [31]:
cv

KFold(n_splits=5, random_state=None, shuffle=True)

### Step-07: Import cross_val_score from sklearn.model_selection 

In [32]:
from sklearn.model_selection import cross_val_score

In [33]:
cross_val_score(LogisticRegression(), x_test, y_test, scoring = "accuracy", cv = cv)

array([1., 1., 1., 1., 1.])

### Step-08: Import make_classification from sklearn.datasets

In [34]:
from sklearn.datasets import make_classification

In [35]:
x,y = make_classification(n_samples = 1000, n_features = 10, n_informative = 5, n_redundant = 5, n_classes = 2, random_state = 42) 

### Step-09: Import train_test_split from sklearn.model_selection

In [36]:
from sklearn.model_selection import train_test_split

In [37]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

### Step-10: Import LogisticRegression from sklearn.linear_model

In [38]:
from sklearn.linear_model import LogisticRegression

In [39]:
classifier = LogisticRegression(max_iter = 200)

In [40]:
classifier

In [41]:
classifier.fit(x_train, y_train)

In [42]:
y_pred = classifier.predict(x_test)

In [43]:
y_pred

array([0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 1])

In [44]:
y_pred_proba = classifier.predict_proba(x_test)

### Step-11: Import confusion_matrix, accuracy_score, classification_report from sklearn.metrics

In [45]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [46]:
print("Confusion Matrix: \n {}".format(confusion_matrix(y_test, y_pred)))
print("Accuracy Score: \n {}".format(accuracy_score(y_test, y_pred)))
print("Classification Report: \n {}".format(classification_report(y_test, y_pred)))

Confusion Matrix: 
 [[81 19]
 [23 77]]
Accuracy Score: 
 0.79
Classification Report: 
               precision    recall  f1-score   support

           0       0.78      0.81      0.79       100
           1       0.80      0.77      0.79       100

    accuracy                           0.79       200
   macro avg       0.79      0.79      0.79       200
weighted avg       0.79      0.79      0.79       200



### Step-12: Import KFold form sklearn.model_selection

In [47]:
from sklearn.model_selection import KFold

In [48]:
cv = KFold(n_splits = 5, shuffle = True)

In [49]:
cv

KFold(n_splits=5, random_state=None, shuffle=True)

### Step-13: Import cross_val_score from sklearn.model_selection 

In [50]:
from sklearn.model_selection import cross_val_score

In [51]:
scores = cross_val_score(LogisticRegression(), x_test, y_test, scoring = "accuracy", cv = cv)

In [52]:
scores

array([0.775, 0.675, 0.9  , 0.8  , 0.825])

In [53]:
np.mean(scores)

0.7950000000000002

### Step-14: Import roc_auc_score from sklearn.metrics

In [54]:
from sklearn.metrics import roc_auc_score

In [55]:
print("Accuracy: {}" .format(roc_auc_score(y_test, y_pred)))

Accuracy: 0.7900000000000001
