In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Importing the method to apply logistic regression

from sklearn.linear_model import LogisticRegression

In [3]:
# Importing libraries to evaluate the quality of the model

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score

from sklearn.metrics import confusion_matrix

from sklearn.metrics import classification_report

# Logistic Regression applied to the Caravan dataset

In [4]:
Caravan_df= pd.read_csv('C:\\Users\\jheredi2\\Documents\\PythonDataAnalytics\\1-Datasets\\Caravan.csv')

In [5]:
# Excluding the first predictor from the analysis

X_train, X_test, y_train, y_test= train_test_split (Caravan_df.iloc[:,1:-1], Caravan_df['Purchase'], test_size=0.2, random_state=1)

##### The LogisticRegression() method in scikit-learn

- There are multiple hyperparameters that can be tuned when this method is applied. For example:

a) Whether regularization is applied or not, and what kind of regularization is applied (L1, L2, L1 and L2 combined)

b) The value of the regularization parameter (lamda)

c) The algortithm/method used to obtain the maximum for the likelihood function

d) max_iter: Max number of iterations that the algortithm/method will run to find the maximum of the likelihood function

And many other hyperparameters


- Initially, we are going to apply Logistic Regression with the default hyperparameter values. Later, we will learn how to apply CV to select good values for some of these hyperparameters.

<br>

- The default values for some hyperparameters are:

Regularization: L2

Lamda= 1

Algorithm/method used to obtain the maximum for the likelihood function: 'lbfgs' algorithm (the limited-memory Broyden–Fletcher–Goldfarb–Shanno algorithm)

max_iter = 100

In [8]:
logistic_model= LogisticRegression(max_iter= 1000)

__Note__: I tried using the default value for max_iter= 100, but the algorithm couldn't find a maximum. Then, I tried with 500 and it did not work either. However, 1000 iterations did work!

In [9]:
logistic_model.fit(X_train, y_train)

LogisticRegression(max_iter=1000)

In [10]:
y_predicted_test= logistic_model.predict(X_test)

In [11]:
confusion_matrix(y_test, y_predicted_test)

array([[1098,    3],
       [  64,    0]], dtype=int64)

It is not even worth to obtain the classification report because this classifier never predicts the Yes class correctly. But, in any case, let's do it.

In [12]:
print (classification_report (y_test, y_predicted_test))

              precision    recall  f1-score   support

          No       0.94      1.00      0.97      1101
         Yes       0.00      0.00      0.00        64

    accuracy                           0.94      1165
   macro avg       0.47      0.50      0.49      1165
weighted avg       0.89      0.94      0.92      1165



### Changing the probability threshold to see if we get better results!

Let's select the best threshold based on f1-score

In [6]:
from sklearn.metrics import f1_score

In [14]:
# Create an array with all the probabilities of Yes

prob_yes= logistic_model.predict_proba(X_test)[:,1]

In [15]:
array_prob= np.arange(0.05, 0.51, 0.05)

In [16]:
dict_predictions = dict()

In [17]:
dict_f1_scores= dict()

In [18]:
for j in array_prob:
    dict_predictions[j]=np.empty(y_test.size, dtype=object)
    for i in np.arange(0, dict_predictions[j].size):
        if prob_yes[i] > j:
            dict_predictions[j][i]= 'Yes'
        else:
            dict_predictions[j][i]= 'No'
    dict_f1_scores[j]= np.round (f1_score(y_test, dict_predictions[j],pos_label='Yes'),3)

In [19]:
dict_f1_scores

{0.05: 0.176,
 0.1: 0.191,
 0.15000000000000002: 0.203,
 0.2: 0.14,
 0.25: 0.124,
 0.3: 0.071,
 0.35000000000000003: 0.052,
 0.4: 0.056,
 0.45: 0.0,
 0.5: 0.0}

In [18]:
max(dict_f1_scores, key= dict_f1_scores.get)

0.15000000000000002

In [19]:
max(dict_f1_scores.values())

0.203

In [20]:
# This loop computes the prediction of Y (No or Yes) for each test observation
# The predictions of Y are stored in an array called 'y_predicted_prob015' 
# The prediction uses a prob threshold of 0.15

y_predicted_prob015=np.empty(y_test.size, dtype=object)

for i in np.arange(0,y_predicted_prob015.size):
    if prob_yes[i] > 0.15:
        y_predicted_prob015[i]= 'Yes'
    else:
        y_predicted_prob015[i]= 'No'

In [21]:
confusion_matrix (y_test, y_predicted_prob015)

array([[1006,   95],
       [  46,   18]], dtype=int64)

In [22]:
print (classification_report (y_test, y_predicted_prob015))

              precision    recall  f1-score   support

          No       0.96      0.91      0.93      1101
         Yes       0.16      0.28      0.20        64

    accuracy                           0.88      1165
   macro avg       0.56      0.60      0.57      1165
weighted avg       0.91      0.88      0.89      1165



### Changing the class weights to see if we get better results!

__From the scikit-learn online documentation:__

class_weight: dict or ‘balanced’, default=None

Weights associated with classes in the form {class_label: weight}. If not given, all classes are supposed to have weight one.

The “balanced” mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as:

n_samples / (n_classes * np.bincount(y))

In [24]:
# n_samples
X_train.shape[0]

4657

In [25]:
# n_classes= 2

In [30]:
np.array(y_train.value_counts())

array([4373,  284], dtype=int64)

In [31]:
X_train.shape[0]/(2*np.array(y_train.value_counts()))

array([0.53247199, 8.19894366])

In [32]:
8.19/0.53

15.452830188679243

__How does changing the class weights influences the classifier?__

"The difference in weights will influence the classification of the classes during the training phase. The whole purpose is to penalize the misclassification made by the minority class by setting a higher class weight and at the same time reducing weight for the majority class."

"It penalizes mistakes in samples of class[i] with class_weight[i]. So higher class-weight means you want to put more emphasis on a class."

In [33]:
logistic_model2= LogisticRegression(max_iter= 10000, class_weight='balanced')

In [34]:
logistic_model2.fit(X_train, y_train)

LogisticRegression(class_weight='balanced', max_iter=10000)

In [36]:
y_predicted_test2= logistic_model2.predict(X_test)

In [37]:
confusion_matrix (y_test, y_predicted_test2)

array([[795, 306],
       [ 25,  39]], dtype=int64)

In [38]:
print (classification_report (y_test, y_predicted_test2))

              precision    recall  f1-score   support

          No       0.97      0.72      0.83      1101
         Yes       0.11      0.61      0.19        64

    accuracy                           0.72      1165
   macro avg       0.54      0.67      0.51      1165
weighted avg       0.92      0.72      0.79      1165



### Standardizing the predictors before applying the classification technique

__Standardizing__ means converting the values of a variable into Z scores by subtracting the mean and dividing by the standard deviation.

A different operation is __normalizing__, which entails scaling the values of a variable in the [0, 1] range by substracting the minimun from each value and dividing by the range.

Standardizing is most commonly 'required' than normalizing

__From the scikit-learn online documentation:__

Standardization of datasets is a common requirement __for many__ machine learning estimators implemented in scikit-learn; they might behave badly if the individual features do not more or less look like standard normally distributed data: Gaussian with zero mean and unit variance.

In practice we often ignore the shape of the distribution and just transform the data to center it by removing the mean value of each feature, then scale it by dividing non-constant features by their standard deviation.

For instance, __many elements used in the objective function of a learning algorithm__ (such as the RBF kernel of Support Vector Machines or __the l1 and l2 regularizers of linear models__) may assume that all features are centered around zero or have variance in the same order. If a feature has a variance that is orders of magnitude larger than others, it might dominate the objective function and make the estimator unable to learn from other features correctly as expected.


Because the default application of Logistic Regression in scikit-learn uses L2 regularization, we should standardized the predictors before applying this technique.

Note: It is not recommended to stardandized the predictors before splitting the data into training and testing.

So, in principle you can split the dataset and afterwards standardize each part (training and test) individually 

Or ...

You can standardize as part of a pipeline applicationin scikit-learn (RECOMMENDED!)

In [7]:
from sklearn.pipeline import make_pipeline

# The method StandardScaler() is the one used to standardize the predictors

from sklearn.preprocessing import StandardScaler

In [51]:
pipe_caravan = make_pipeline(StandardScaler(), LogisticRegression(max_iter= 1000))

In [52]:
pipe_caravan.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression(max_iter=1000))])

In [42]:
y_predicted_test3= pipe_caravan.predict(X_test)

In [43]:
confusion_matrix (y_test, y_predicted_test3)

array([[1097,    4],
       [  64,    0]], dtype=int64)

In [44]:
print (classification_report (y_test, y_predicted_test3))

              precision    recall  f1-score   support

          No       0.94      1.00      0.97      1101
         Yes       0.00      0.00      0.00        64

    accuracy                           0.94      1165
   macro avg       0.47      0.50      0.48      1165
weighted avg       0.89      0.94      0.92      1165



### Standardizing and changing the weights

In [53]:
pipe_caravan2 = make_pipeline(StandardScaler(), LogisticRegression(max_iter= 1000, class_weight='balanced'))

In [54]:
pipe_caravan2.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression',
                 LogisticRegression(class_weight='balanced', max_iter=1000))])

In [55]:
y_predicted_test4= pipe_caravan2.predict(X_test)

In [56]:
confusion_matrix (y_test, y_predicted_test4)

array([[793, 308],
       [ 26,  38]], dtype=int64)

In [57]:
print (classification_report (y_test, y_predicted_test4))

              precision    recall  f1-score   support

          No       0.97      0.72      0.83      1101
         Yes       0.11      0.59      0.19        64

    accuracy                           0.71      1165
   macro avg       0.54      0.66      0.51      1165
weighted avg       0.92      0.71      0.79      1165



### Standardizing and changing the probability threshold

In [58]:
pipe_caravan3 = make_pipeline(StandardScaler(), LogisticRegression(max_iter= 1000))

In [59]:
pipe_caravan3.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression(max_iter=1000))])

In [60]:
prob_yes2= pipe_caravan3.predict_proba(X_test)[:,1]

In [61]:
dict_predictions2={}

In [62]:
dict_f1_scores2={}

In [63]:
for j in array_prob:
    dict_predictions2[j]=np.empty(y_test.size, dtype=object)
    for i in np.arange(0, dict_predictions2[j].size):
        if prob_yes2[i] > j:
            dict_predictions2[j][i]= 'Yes'
        else:
            dict_predictions2[j][i]= 'No'
    dict_f1_scores2[j]= np.round (f1_score(y_test, dict_predictions2[j],pos_label='Yes'),3)

In [64]:
dict_f1_scores2

{0.05: 0.174,
 0.1: 0.181,
 0.15000000000000002: 0.201,
 0.2: 0.168,
 0.25: 0.143,
 0.3: 0.067,
 0.35000000000000003: 0.05,
 0.4: 0.053,
 0.45: 0.0,
 0.5: 0.0}

In [65]:
y_predicted_prob015_2=np.empty(y_test.size, dtype=object)

for i in np.arange(0,y_predicted_prob015_2.size):
    if prob_yes2[i] > 0.15:
        y_predicted_prob015_2[i]= 'Yes'
    else:
        y_predicted_prob015_2[i]= 'No'

In [66]:
confusion_matrix (y_test, y_predicted_prob015_2)

array([[1004,   97],
       [  46,   18]], dtype=int64)

In [67]:
print (classification_report (y_test, y_predicted_prob015_2))

              precision    recall  f1-score   support

          No       0.96      0.91      0.93      1101
         Yes       0.16      0.28      0.20        64

    accuracy                           0.88      1165
   macro avg       0.56      0.60      0.57      1165
weighted avg       0.91      0.88      0.89      1165



__START HERE ON TUESDAY__

### Tuning hyperparameters via CV

We are going to apply GridSearchCV() to __tune two parameters: Lamda__ (the regularization parameter) and __the class weights__

We tune the class weights in case we do not want to use the default weights used by scikit-learn. As we saw before, the default is class_weight= 'balanced'

__Lambda__: The penalty parameter

In scikit-learn, we do not directly tune lambda but the inverse of lambda, a parameter called C. 

Lambda= 1/ C

See next from the scikit-learn documentation:

C float, default=1.0

Inverse of regularization strength; must be a positive float. Smaller values of C specify stronger regularization.

In [8]:
from sklearn.model_selection import GridSearchCV

In [9]:
pipe_caravan_CVSearch = make_pipeline(StandardScaler(), LogisticRegression())

In [10]:
# Values of C that we are trying

C_hyper= np.linspace(0.001, 1, 50)
C_hyper

array([0.001     , 0.02138776, 0.04177551, 0.06216327, 0.08255102,
       0.10293878, 0.12332653, 0.14371429, 0.16410204, 0.1844898 ,
       0.20487755, 0.22526531, 0.24565306, 0.26604082, 0.28642857,
       0.30681633, 0.32720408, 0.34759184, 0.36797959, 0.38836735,
       0.4087551 , 0.42914286, 0.44953061, 0.46991837, 0.49030612,
       0.51069388, 0.53108163, 0.55146939, 0.57185714, 0.5922449 ,
       0.61263265, 0.63302041, 0.65340816, 0.67379592, 0.69418367,
       0.71457143, 0.73495918, 0.75534694, 0.77573469, 0.79612245,
       0.8165102 , 0.83689796, 0.85728571, 0.87767347, 0.89806122,
       0.91844898, 0.93883673, 0.95922449, 0.97961224, 1.        ])

In [11]:
# Values of lambda that we are trying

1/C_hyper

array([1000.        ,   46.75572519,   23.93746947,   16.08667104,
         12.11372064,    9.71451229,    8.10855535,    6.9582505 ,
          6.09376943,    5.42035398,    4.88096424,    4.43921   ,
          4.07078176,    3.75882172,    3.49127182,    3.25927897,
          3.05619659,    2.87693753,    2.71754201,    2.57488177,
          2.44645264,    2.33022636,    2.22454261,    2.12802918,
          2.03954214,    1.9581202 ,    1.8829497 ,    1.81333728,
          1.74868848,    1.6884907 ,    1.63229954,    1.5797279 ,
          1.53043696,    1.48412891,    1.44054094,    1.39944022,
          1.36061978,    1.32389495,    1.28910052,    1.25608818,
          1.22472444,    1.1948888 ,    1.16647225,    1.1393759 ,
          1.11350983,    1.08879211,    1.06514793,    1.04250883,
          1.02081207,    1.        ])

In [26]:
weights = np.round (np.linspace(0.0,1,11), 2)
weights

array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])

_Random question:_ In the next cell, think about another way of assigning the class weight value (a way that does not use a loop)

In [13]:
hyperparam_grid = {
    'logisticregression__C': C_hyper,
    'logisticregression__class_weight': [{'No':x, 'Yes':1.0-x} for x in weights],
    'logisticregression__max_iter':[10000]
}

In [14]:
grid_search= GridSearchCV(estimator= pipe_caravan_CVSearch, param_grid=hyperparam_grid, cv=5)

##### DO NOT RUN THE NEXT CELL. JUST OBSERVER RESULTS!

In [15]:
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('logisticregression',
                                        LogisticRegression())]),
             param_grid={'logisticregression__C': array([0.001     , 0.02138776, 0.04177551, 0.06216327, 0.08255102,
       0.10293878, 0.12332653, 0.14371429, 0.16410204, 0.1844898 ,
       0.20487755, 0.22526531, 0.24565306, 0.26604082, 0.28642857,
       0.30681633, 0.32720408,...
                         'logisticregression__class_weight': [{'No': 0.0,
                                                               'Yes': 1.0},
                                                              {'No': 0.1,
                                                               'Yes': 0.9},
                                                              {'No': 0.2,
                                                               'Yes': 0.8},
                                          

In [16]:
grid_search.best_params_

{'logisticregression__C': 0.021387755102040818,
 'logisticregression__class_weight': {'No': 0.7, 'Yes': 0.30000000000000004},
 'logisticregression__max_iter': 10000}

In [30]:
pipe_caravan4 = make_pipeline(StandardScaler(), LogisticRegression(max_iter= 10000, class_weight={'No': 0.7, 'Yes': 0.3}, C=0.021387755102040818))

In [31]:
pipe_caravan4.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression',
                 LogisticRegression(C=0.021387755102040818,
                                    class_weight={'No': 0.7, 'Yes': 0.3},
                                    max_iter=10000))])

In [32]:
y_predicted_test5= pipe_caravan4.predict(X_test)

In [33]:
confusion_matrix (y_test, y_predicted_test5)

array([[1100,    1],
       [  64,    0]], dtype=int64)

In [34]:
print (classification_report (y_test, y_predicted_test5))

              precision    recall  f1-score   support

          No       0.95      1.00      0.97      1101
         Yes       0.00      0.00      0.00        64

    accuracy                           0.94      1165
   macro avg       0.47      0.50      0.49      1165
weighted avg       0.89      0.94      0.92      1165



The GridSearch CV was optimizing the overall accuracy. We might want to run a CV grid search to search for the best f1-score

### Repeat Grid Search but using f1-score as the scoring function in CV

In [22]:
from sklearn.metrics import make_scorer

In [23]:
f1_scorer = make_scorer(f1_score, pos_label='Yes')

In [27]:
grid_search2= GridSearchCV(estimator= pipe_caravan_CVSearch, param_grid=hyperparam_grid, cv=5, scoring=f1_scorer)

##### DO NOT RUN THE NEXT CELL. JUST OBSERVER RESULTS!

In [28]:
grid_search2.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('logisticregression',
                                        LogisticRegression())]),
             param_grid={'logisticregression__C': array([0.001     , 0.02138776, 0.04177551, 0.06216327, 0.08255102,
       0.10293878, 0.12332653, 0.14371429, 0.16410204, 0.1844898 ,
       0.20487755, 0.22526531, 0.24565306, 0.26604082, 0.28642857,
       0.30681633, 0.32720408,...
                                                               'Yes': 1.0},
                                                              {'No': 0.1,
                                                               'Yes': 0.9},
                                                              {'No': 0.2,
                                                               'Yes': 0.8},
                                                              {'No': 0.3,
                                          

In [29]:
grid_search2.best_params_

{'logisticregression__C': 0.021387755102040818,
 'logisticregression__class_weight': {'No': 0.1, 'Yes': 0.9},
 'logisticregression__max_iter': 10000}

In [35]:
pipe_caravan5 = make_pipeline(StandardScaler(), LogisticRegression(max_iter= 10000, class_weight={'No': 0.1, 'Yes': 0.9}, C=0.021387755102040818))

In [36]:
pipe_caravan5.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression',
                 LogisticRegression(C=0.021387755102040818,
                                    class_weight={'No': 0.1, 'Yes': 0.9},
                                    max_iter=10000))])

In [37]:
y_predicted_test6= pipe_caravan5.predict(X_test)

In [38]:
confusion_matrix (y_test, y_predicted_test6)

array([[939, 162],
       [ 39,  25]], dtype=int64)

In [39]:
print (classification_report (y_test, y_predicted_test6))

              precision    recall  f1-score   support

          No       0.96      0.85      0.90      1101
         Yes       0.13      0.39      0.20        64

    accuracy                           0.83      1165
   macro avg       0.55      0.62      0.55      1165
weighted avg       0.91      0.83      0.86      1165



# Logistic Regression applied to the Default dataset

In [59]:
Default_df= pd.read_csv('C:\\Users\\jheredi2\\Documents\\PythonDataAnalytics\\1-Datasets\\Default.csv')

In [60]:
Default_df_dummies= pd.get_dummies(Default_df,columns=['student'], drop_first=True)

In [61]:
X_train_def, X_test_def, y_train_def, y_test_def= train_test_split (Default_df_dummies.iloc[:,1:], Default_df['default'], test_size=0.2, random_state=1)

### Standardizing and changing probability threshold

In [62]:
pipe_default = make_pipeline(StandardScaler(), LogisticRegression(max_iter= 1000))

In [63]:
pipe_default.fit(X_train_def, y_train_def)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression(max_iter=1000))])

In [64]:
prob_yes_default= pipe_default.predict_proba(X_test_def)[:,1]

In [65]:
dict_predictions_default= dict()

In [66]:
dict_f1_scores_default= dict()

In [67]:
for j in array_prob:
    dict_predictions_default[j]=np.empty(y_test_def.size, dtype=object)
    for i in np.arange(0, dict_predictions_default[j].size):
        if prob_yes_default[i] > j:
            dict_predictions_default[j][i]= 'Yes'
        else:
            dict_predictions_default[j][i]= 'No'
    dict_f1_scores_default[j]= np.round (f1_score(y_test_def, dict_predictions_default[j],pos_label='Yes'),3)

In [68]:
dict_f1_scores_default

{0.05: 0.308,
 0.1: 0.359,
 0.15000000000000002: 0.381,
 0.2: 0.37,
 0.25: 0.394,
 0.3: 0.38,
 0.35000000000000003: 0.396,
 0.4: 0.388,
 0.45: 0.409,
 0.5: 0.409}

The highest f1-score happens with threshold=0.5, which is the default threshold used when predict() is called. So...

In [69]:
y_predicted_test_default= pipe_default.predict(X_test_def)

In [70]:
confusion_matrix (y_test_def, y_predicted_test_default)

array([[1930,   11],
       [  41,   18]], dtype=int64)

In [71]:
print (classification_report (y_test_def, y_predicted_test_default))

              precision    recall  f1-score   support

          No       0.98      0.99      0.99      1941
         Yes       0.62      0.31      0.41        59

    accuracy                           0.97      2000
   macro avg       0.80      0.65      0.70      2000
weighted avg       0.97      0.97      0.97      2000



### Standardizing and changing the weights (use "balanced" for the weights hyperparameter)

WORK INDEPENDENTLY FOR 10 MINUTES!!!

### Tuning hyperparameters via CV 

WORK INDEPENDENTLY FOR 10 MINUTES!!!