In [None]:
import math
import warnings

import numpy as np
import pandas as pd
from pydataset import data
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
# import logistic_regression_util

# ignore warnings
warnings.filterwarnings("ignore")

#### Logistic Regression
- Fundamentals: 
 https://docs.google.com/presentation/d/1AzgB6opDhEuAdBHZS8GRbBV6BtQCqb9JSAElM4-H6nk/edit?usp=sharing
- logistic regression in sklearn

Pros and Cons

In [None]:
# Load the iris dataset through pydataset's `data` loader (imported in the
# top import cell) and preview the first rows.
df = data('iris')
df.head()

In [None]:
# Normalize the column names: lower-case them and swap every '.' for '_'
df.columns = df.columns.str.lower().str.replace('.', '_', regex=False)
df.columns

In [None]:
# Binary classification - predict if species is non-virginica or virginica

In [None]:
# Recode the target as a binary label: virginica -> '1', setosa and versicolor -> '0'.
# '1' is also treated as virginica so that re-running this cell after the recode
# keeps the labels instead of collapsing every row to '0'.
df['species'] = np.where(df.species.isin(['virginica', '1']), '1', '0')

In [None]:
# Count the observations in each class of the recoded target
df.species.value_counts()

## Predict if species is virginica or not

In [None]:
def train_validate_test_split(df, target, seed=123, test_size=0.2, validate_size=0.3):
    '''
    Split a dataframe into train, validate and test sets, stratified on the target.

    The split happens in two steps: `test_size` of `df` is held out as test,
    then `validate_size` of what remains is held out as validate. With the
    default sizes this gives roughly 56% train, 24% validate and 20% test.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split; must contain the `target` column.
    target : str
        Name of the target column, used to stratify both splits.
    seed : int, default 123
        Passed as `random_state` to both splits so the result is reproducible.
    test_size : float or int, default 0.2
        `test_size` for the first split (test vs. everything else).
    validate_size : float or int, default 0.3
        `test_size` for the second split; when given as a fraction it is a
        fraction of the train+validate remainder, not of the full `df`.

    Returns
    -------
    tuple
        (train, validate, test)
    '''
    # First carve off the test set ...
    train_validate, test = train_test_split(df, test_size=test_size,
                                            random_state=seed,
                                            stratify=df[target])
    # ... then split what is left into train and validate.
    train, validate = train_test_split(train_validate, test_size=validate_size,
                                       random_state=seed,
                                       stratify=train_validate[target])
    return train, validate, test

In [None]:
# Stratify on the binary target so each split keeps the same class balance
train, validate, test = train_validate_test_split(df, target='species', seed=123)

In [None]:
# (rows, columns) of each split
train.shape, validate.shape, test.shape

In [None]:
# Preview the first rows of the training split
train.head()

In [None]:
# Separate the features (X) from the target (y) for each of the three splits
X_train, y_train = train.drop(columns='species'), train['species']
X_validate, y_validate = validate.drop(columns='species'), validate['species']
X_test, y_test = test.drop(columns='species'), test['species']

In [None]:
# Preview the training features (target column removed)
X_train.head()

# Model 1

In [None]:
# Model 1: logistic regression with C=1 and a fixed seed
logit = LogisticRegression(random_state=123, C=1)

In [None]:
# Fit model 1 on the training features and labels
logit.fit(X_train, y_train)

In [None]:
# Use model 1 to predict a class label for every training row
y_pred = logit.predict(X_train)

In [None]:
# Take a look at the predicted labels
y_pred

In [None]:
# Look at the predicted probabilities for the first 10 training observations
logit.predict_proba(X_train)[:10]

In [None]:
# Class labels known to the model; the predict_proba columns follow this order
logit.classes_

In [None]:
# View raw probabilities (output from the model)

# Column order follows logit.classes_: '0' (non-virginica) first, then '1' (virginica).
y_pred_proba = pd.DataFrame(
    logit.predict_proba(X_train),
    columns=['non-virginica', 'virginica'],
    index=X_train.index,  # keep the row labels so rows line up with X_train / y_train
)
y_pred_proba.head().round(3)

In [None]:
# Classification report for model 1 on the training split
print(classification_report(y_train, y_pred))

## Model 2

In [None]:
# Model 2: same setup as model 1, but with a smaller C (0.01)
logit2 = LogisticRegression(random_state=123, C=0.01)

In [None]:
# Fit model 2 on the same training features and labels as model 1
logit2.fit(X_train, y_train)

In [None]:
# Predict a class label for every training row with model 2
y_pred2 = logit2.predict(X_train)

In [None]:
# Classification report for model 2 on the training split
print(classification_report(y_train, y_pred2))

## Evaluate Model 1 and 2 performance on 'Validate'

In [None]:
# Predict on the validate split with both fitted models (model 1 first, then model 2)
y_pred_validate, y_pred_validate2 = (
    fitted.predict(X_validate) for fitted in (logit, logit2)
)

In [None]:
# Compare both models on the validate split: heading, accuracy, then the full
# classification report, with a dashed rule between the two models.
validate_results = [
    ("Model 1: solver = lbfgs, c = 1", logit, y_pred_validate),
    ("Model 2: solver = lbfgs, c = .01", logit2, y_pred_validate2),
]

for position, (title, model, predictions) in enumerate(validate_results):
    if position > 0:
        print('--------------------------------------------------')
    print(title)
    print('Accuracy: {:.2f}'.format(model.score(X_validate, y_validate)))
    print(classification_report(y_validate, predictions))

### Hyperparameters
#### Regularization:
- Keep model simple
- Constrains the coefficients
- Discourages learning a more complex model
- Minimizes overfitting
- L1 - Lasso
- L2 - Ridge

#### C = Inverse of regularization strength:

- Lower C - higher regularization
- Lower C discourages learning a more complex model
- minimizes overfitting

## Bonus: Interpreting model coefficients

In [None]:
# Look at model 1 coefficients (first row of coef_)

print('Coefficient: \n', logit.coef_[0])

#### Logistic Regression basics:

log(odds) = log(p/(1-p)) = $intercept$ + ($\beta_1$ * variable1) + ($\beta_2$ * variable2) + ($\beta_3$ * variable3)

**The coefficients above represent 'log odds'**

In [None]:
# Pair each model 1 coefficient with its feature name, smallest coefficient first
log_coeffs = pd.DataFrame(
    {'coeffs': logit.coef_[0]},
    index=X_train.columns,
).sort_values('coeffs')
log_coeffs

**It would be helpful to convert 'log odds' to 'odds'**

In [None]:
# Exponentiate every coefficient to move from the log-odds scale to the odds scale
odds = log_coeffs.apply(np.exp)
odds

What are odds?

odds = P(occurring) / P(not occurring)  = p / (1-p)

Toss a fair coin  
odds = 0.5 / (1-0.5) = 1   i.e. the odds of landing tails vs. heads are 1:1 for a fair coin

Rolling a 2 or higher on a die  
odds = (5/6) / (1/6) = 5   i.e. the odds of rolling a 2 or higher are 5:1 for a fair die

#### Coefficient Interpretation (odds):


- **Example: petal_length: For every one-unit increase in petal_length (holding the other features constant), we expect roughly a 10-fold increase in the odds of being a 'virginica' vs a 'non-virginica'.**


- **If the exponentiated coefficient (odds ratio) is 1 or close to 1, a one-unit increase in that feature leaves the odds of being in class '1' (positive class) vs class '0' (negative class) essentially unchanged. This means the feature with this coefficient is not a big driver of the target variable in this particular model.**

- **If the exponentiated coefficient is < 1, an increase in the value of that feature decreases the odds that the target variable is in the positive class.**