In [1]:
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


# ignore warnings
# NOTE(review): called with no category/module filter, this suppresses every
# warning for the rest of the session, so any warning raised while fitting the
# models below stays hidden.
import warnings
warnings.filterwarnings("ignore")

#### Logistic Regression
- Slides: [Link](https://docs.google.com/presentation/d/1uK_PLp_gjowSTUEIhPyJrniuHFXN4waR/edit?usp=sharing&ouid=117293316284429745505&rtpof=true&sd=true)
- logistic regression in sklearn

In [2]:
import acquire
import prepare

In [3]:
# Load the iris dataset through the project-local `acquire` module
# and preview the first rows.
iris = acquire.get_iris_data()
iris.head()

Unnamed: 0,species_id,species_name,sepal_length,sepal_width,petal_length,petal_width
0,1,setosa,5.1,3.5,1.4,0.2
1,1,setosa,4.9,3.0,1.4,0.2
2,1,setosa,4.7,3.2,1.3,0.2
3,1,setosa,4.6,3.1,1.5,0.2
4,1,setosa,5.0,3.6,1.4,0.2


In [4]:
# Split the data with the project-local `prepare.prep_iris` helper; its three
# return values are unpacked as train, validate and test, then train is previewed.
# NOTE(review): judging by the preview, prep_iris also adds one 0/1 indicator
# column per species -- confirm in prepare.py.
train, val, test = prepare.prep_iris(iris)
train.head()

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width,setosa,versicolor,virginica
79,versicolor,5.7,2.6,3.5,1.0,0,1,0
36,setosa,5.5,3.5,1.3,0.2,1,0,0
133,virginica,6.3,2.8,5.1,1.5,0,0,1
95,versicolor,5.7,3.0,4.2,1.2,0,1,0
18,setosa,5.7,3.8,1.7,0.3,1,0,0


## Logistic Regression works best with binary classification

### For demo, we will predict if species is virginica or not

In [5]:
# Features: each split loses the species label and the three species indicator
# columns, leaving only the measurement columns.
X_train, X_val, X_test = (
    split.drop(columns=['species', 'setosa', 'versicolor', 'virginica'])
    for split in (train, val, test)
)

# Target: the binary `virginica` indicator column of each split.
y_train, y_val, y_test = (split.virginica for split in (train, val, test))

X_train.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
79,5.7,2.6,3.5,1.0
36,5.5,3.5,1.3,0.2
133,6.3,2.8,5.1,1.5
95,5.7,3.0,4.2,1.2
18,5.7,3.8,1.7,0.3


## Model 1

In [6]:
# Define the logistic regression model.
# C is the inverse of regularization strength (1 here); random_state is pinned to 13.
logit = LogisticRegression(C=1, random_state=13)

In [7]:
# Fit the model on the training features and training target only
logit.fit(X_train, y_train)

LogisticRegression(C=1, random_state=13)

In [8]:
# Use the fitted model to predict class labels for the training set
y_pred = logit.predict(X_train)

In [9]:
# Take a look at the predicted labels
y_pred

array([0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0], dtype=uint8)

In [10]:
# Look at predicted probabilities for the first 10 observations.
# Each row holds one probability per class, in the order given by logit.classes_.
logit.predict_proba(X_train)[:10]

array([[9.90352053e-01, 9.64794739e-03],
       [9.99992205e-01, 7.79478028e-06],
       [4.27324947e-01, 5.72675053e-01],
       [9.42448583e-01, 5.75514173e-02],
       [9.99976450e-01, 2.35496548e-05],
       [9.53925641e-02, 9.04607436e-01],
       [9.99980041e-01, 1.99587144e-05],
       [9.99982492e-01, 1.75077924e-05],
       [9.99992613e-01, 7.38745727e-06],
       [1.16102045e-01, 8.83897955e-01]])

In [11]:
# Class labels known to the fitted model; predict_proba columns follow this order
logit.classes_

array([0, 1], dtype=uint8)

In [12]:
# Raw class probabilities from the model, wrapped in a labelled DataFrame.
# NOTE(review): 'non-virginca' looks like a typo for 'non-virginica'; the label
# is left untouched here in case other cells select the column by name.

y_pred_proba = pd.DataFrame(
    logit.predict_proba(X_train),
    columns=['non-virginca', 'virginica'],
)
y_pred_proba.head().round(3)

Unnamed: 0,non-virginca,virginica
0,0.99,0.01
1,1.0,0.0
2,0.427,0.573
3,0.942,0.058
4,1.0,0.0


In [13]:
# Classification report for Model 1 on the training data (true labels vs. y_pred)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.96      0.98        56
           1       0.93      1.00      0.97        28

    accuracy                           0.98        84
   macro avg       0.97      0.98      0.97        84
weighted avg       0.98      0.98      0.98        84



## Model 2

In [14]:
# Model 2: same setup as Model 1 except the hyperparameter C, lowered to 0.01

logit2 = LogisticRegression(C=0.01, random_state=13)

In [15]:
# Fit Model 2 on the same training features and target as Model 1
logit2.fit(X_train, y_train)

LogisticRegression(C=0.01, random_state=13)

In [16]:
# Predict class labels for the training set with Model 2
y_pred2 = logit2.predict(X_train)

In [17]:
# Classification report for Model 2 on the training data (true labels vs. y_pred2)
print(classification_report(y_train, y_pred2))

              precision    recall  f1-score   support

           0       0.76      1.00      0.86        56
           1       1.00      0.36      0.53        28

    accuracy                           0.79        84
   macro avg       0.88      0.68      0.69        84
weighted avg       0.84      0.79      0.75        84



## Evaluate Model 1 and 2 performance on 'Validate'

In [18]:
# Make predictions for the validate dataset with both fitted models
# (logit -> y_pred_val, logit2 -> y_pred_val2)

y_pred_val = logit.predict(X_val)
y_pred_val2 = logit2.predict(X_val)

In [19]:
def _report_on_validate(heading, model, val_predictions):
    """Print a heading, the model's accuracy on the validate split, and its classification report.

    heading         -- text printed first to identify the model
    model           -- fitted estimator; scored against X_val / y_val
    val_predictions -- that model's predicted labels for X_val
    """
    print(heading)
    print('Accuracy: {:.2f}'.format(model.score(X_val, y_val)))
    print(classification_report(y_val, val_predictions))


_report_on_validate("Model 1: solver = lbfgs, c = 1", logit, y_pred_val)

# visual divider between the two reports
print('--------------------------------------------------')

_report_on_validate("Model 2: solver = lbfgs, c = .01", logit2, y_pred_val2)

Model 1: solver = lbfgs, c = 1
Accuracy: 0.97
              precision    recall  f1-score   support

           0       0.96      1.00      0.98        24
           1       1.00      0.92      0.96        12

    accuracy                           0.97        36
   macro avg       0.98      0.96      0.97        36
weighted avg       0.97      0.97      0.97        36

--------------------------------------------------
Model 2: solver = lbfgs, c = .01
Accuracy: 0.83
              precision    recall  f1-score   support

           0       0.80      1.00      0.89        24
           1       1.00      0.50      0.67        12

    accuracy                           0.83        36
   macro avg       0.90      0.75      0.78        36
weighted avg       0.87      0.83      0.81        36



### Notable Hyperparameters
#### Penalty:
- This argument applies a technique called "regularization"
- Determines which penalty technique is applied
- Keep model simple
- Constrains the coefficients
- Discourages learning more complex model
- Minimizes overfitting
- Options:
    - L1 - Lasso
        - Use when:
            - Feature selection is important: Lasso has a built-in feature selection property, meaning it can drive some coefficients exactly to zero. This is useful when you want to identify the most relevant features and simplify the model.
            - You suspect that many of your features are irrelevant or redundant: Lasso can effectively eliminate irrelevant features, reducing the model's complexity and improving interpretability.
            - You have a limited number of observations or a high-dimensional dataset: Lasso's ability to perform feature selection can be particularly advantageous in situations where the number of features is large compared to the number of observations.
    - **Default:** L2 - Ridge
        - Use when:
            - You want to shrink the coefficients towards zero without completely eliminating any of them: Ridge regularization reduces the magnitude of all coefficients, but it does not force them to exactly zero. This can be beneficial when you believe all features have some level of importance in predicting the outcome.
            - Your primary goal is to improve the model's predictive performance: Ridge regularization tends to produce more stable models and can be effective in reducing overfitting, especially when there are multiple correlated features.
            - You are not concerned about feature selection and interpretability: Ridge regularization does not explicitly perform feature selection, so it may not be suitable if you need a model with a reduced set of features for interpretability purposes.
    - elasticnet - both L1 and L2
        - Use when:
            - You want a balanced approach of both
    - None

#### C = Inverse of regularization strength:

- Controls the balance between model complexity and generalization
- Lower C
    - higher regularization
    - discourages learning more complex model
    - prioritize simplicity and generalization
    - decreases the chance of overfitting
- Higher C
    - means we want to fit the training data as closely as possible
    - more flexible and potential to capture more intricate patterns in the data
    - increases the danger of overfitting
- best to start with powers of ten (e.g. .001, .01, .1, 1, 10, 100, 1000), since the effect of C is best explored on a logarithmic scale

## Bonus: Interpreting model coefficients

In [20]:
# Look at Model 1's coefficients (first row of the fitted coef_ array)
 
print('Coefficient: \n', logit.coef_[0])

Coefficient: 
 [ 0.5651314  -0.44179673  2.38139294  1.72628169]


#### Logistic Regression basics:

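$$\log(\text{odds}) = \log\left(\frac{p}{1-p}\right) = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + \beta_3 x_3 + \dots$$

where $\beta_0$ is the intercept, $x_1, x_2, x_3, \dots$ are the feature variables and $\beta_1, \beta_2, \beta_3, \dots$ are their coefficients.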

**The coefficients above represent 'log odds': each one is the change in log odds for a one unit increase in its feature.**

In [21]:
# One row per feature (indexed by feature name) holding its Model 1 coefficient,
# ordered from the smallest coefficient to the largest.

log_coeffs = (
    pd.DataFrame({'coeffs': logit.coef_[0]}, index=X_train.columns)
    .sort_values('coeffs')
)
log_coeffs

Unnamed: 0,coeffs
sepal_width,-0.441797
sepal_length,0.565131
petal_width,1.726282
petal_length,2.381393


**It would be helpful to convert 'log odds' to 'odds'**

In [22]:
# Convert from log odds to odds by exponentiating every coefficient.
# np.exp is applied element-wise, so the result keeps the same feature index and
# the same 'coeffs' column label even though the values are now on the odds scale.
odds = np.exp(log_coeffs)
odds

Unnamed: 0,coeffs
sepal_width,0.64288
sepal_length,1.759679
petal_width,5.619719
petal_length,10.819964


### What are odds?

odds = P(occurring) / P(not occurring)  = p / (1-p)

Toss a fair coin  
odds = 0.5 / (1-0.5) = 1   i.e. the odds of landing tails vs heads are 1:1 for a fair coin

Rolling 2 or higher on a die roll  
odds = (5/6) /  (1/6) = 5 i.e. the odds of rolling a 2 or higher are 5:1 for a fair die

#### Coefficient Interpretation (odds):


- **Example: petal_length: For every one unit increase in petal_length (holding the other features constant), we expect the odds of being a 'virginica' vs a 'non-virginica' to be multiplied by about 10.8.**


- **If the exponentiated coefficient (odds) is 1 or close to 1, a one unit increase in that feature leaves the odds of being in class '1' (positive class) vs class '0' (negative class) essentially unchanged. This means the feature with this coefficient is not a big driver for the target variable in this particular model.**


- **If the exponentiated coefficient (odds) is < 1, an increase in the value of that feature will decrease the odds that the target variable is in the positive class.**