# Logistic regression

---

**Purpose of the Model:**
- Used for classification problems, where the goal is to predict a category or class. It is primarily used for binary classification problems (e.g., predicting if an email is spam or not, or if a person is likely to develop a disease or not). The output is a probability that is mapped to a class (0 or 1).

**Type of Output:**

- Produces a probability value between 0 and 1. A threshold (such as 0.5) is usually defined to classify probabilities into one of the two classes.

**Output Graph:**

- The output is an S-shaped sigmoid curve, flattening as it approaches 0 and 1, indicating the transition of probabilities.

---

Imported Libraries

In [None]:
# Data processing
# ==================================================================================
import pandas as pd
import numpy as np

# Preprocessing and modeling
# ==================================================================================
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Warnings Configuration
# ==================================================================================
import warnings
import warnings

def ignore_warn(*_args, **_kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments, does nothing."""
    return None


# Replace warnings.warn with the no-op so warnings issued through it
# (e.g. by sklearn and seaborn) are dropped.
warnings.warn = ignore_warn

### Step 1: Decision making: Which is the best dataset?

In [3]:
# Directory holding the pre-processed CSV files, relative to the notebook.
PROCESSED_DIR = '../data/processed'


def read_processed(name):
    """Read ``<PROCESSED_DIR>/<name>.csv`` into a DataFrame."""
    return pd.read_csv(f'{PROCESSED_DIR}/{name}.csv')


# Train data frames
X_train_with_outliers_sel = read_processed('X_train_with_outliers_sel')
X_train_without_outliers_sel = read_processed('X_train_without_outliers_sel')
X_train_with_outliers_norm_sel = read_processed('X_train_with_outliers_norm_sel')
X_train_without_outliers_norm_sel = read_processed('X_train_without_outliers_norm_sel')
X_train_with_outliers_minmax_sel = read_processed('X_train_with_outliers_minmax_sel')
X_train_without_outliers_minmax_sel = read_processed('X_train_without_outliers_minmax_sel')
y_train = read_processed('y_train')

# Test data frames
X_test_with_outliers_sel = read_processed('X_test_with_outliers_sel')
X_test_without_outliers_sel = read_processed('X_test_without_outliers_sel')
X_test_with_outliers_norm_sel = read_processed('X_test_with_outliers_norm_sel')
X_test_without_outliers_norm_sel = read_processed('X_test_without_outliers_norm_sel')
X_test_with_outliers_minmax_sel = read_processed('X_test_with_outliers_minmax_sel')
X_test_without_outliers_minmax_sel = read_processed('X_test_without_outliers_minmax_sel')
y_test = read_processed('y_test')

In [4]:
# Candidate feature sets keyed by name. Insertion order defines the "index"
# reported in the results and used by later cells via train_dfs[i] / test_dfs[i].
train_dicts = {
  "X_train_with_outliers_sel": X_train_with_outliers_sel,
  "X_train_without_outliers_sel": X_train_without_outliers_sel,
  "X_train_with_outliers_norm_sel": X_train_with_outliers_norm_sel,
  "X_train_without_outliers_norm_sel": X_train_without_outliers_norm_sel,
  "X_train_with_outliers_minmax_sel": X_train_with_outliers_minmax_sel,
  "X_train_without_outliers_minmax_sel": X_train_without_outliers_minmax_sel
}

test_dicts = {
  "X_test_with_outliers_sel": X_test_with_outliers_sel,
  "X_test_without_outliers_sel": X_test_without_outliers_sel,
  "X_test_with_outliers_norm_sel": X_test_with_outliers_norm_sel,
  "X_test_without_outliers_norm_sel": X_test_without_outliers_norm_sel,
  "X_test_with_outliers_minmax_sel": X_test_with_outliers_minmax_sel,
  "X_test_without_outliers_minmax_sel": X_test_without_outliers_minmax_sel
}

# Positional views derived from the dicts, so names and frames cannot drift apart.
train_dfs = list(train_dicts.values())
test_dfs = list(test_dicts.values())
assert len(train_dfs) == len(test_dfs), "each train variant needs a matching test variant"

# fit() documents a 1-D target; y_train comes from read_csv as a DataFrame
# (presumably a single column -- TODO confirm), so flatten it once here.
y_train_flat = y_train.to_numpy().ravel()

results = []

# Fit a default LogisticRegression on every variant and record its accuracy on
# the matching train/test pair.
for df_index, (train_name, train_df) in enumerate(train_dicts.items()):
  model = LogisticRegression()
  model.fit(train_df, y_train_flat)
  y_train_pred = model.predict(train_df)
  y_test_pred = model.predict(test_dfs[df_index])

  results.append(
    {
        "index": df_index,
        "train_df": train_name,
        "train_score": accuracy_score(y_train, y_train_pred),
        "test_score": accuracy_score(y_test, y_test_pred)
    }
  )

# NOTE(review): variants are ranked by *training* accuracy, which favours
# overfitting; consider ranking by a cross-validated score instead.
resultados = sorted(results, key = lambda x: x["train_score"], reverse = True)
resultados

[{'index': 2,
  'train_df': 'X_train_with_outliers_norm_sel',
  'train_score': 0.9092289010321797,
  'test_score': 0.9031083050024283},
 {'index': 4,
  'train_df': 'X_train_with_outliers_minmax_sel',
  'train_score': 0.9086520947176685,
  'test_score': 0.9022583778533269},
 {'index': 0,
  'train_df': 'X_train_with_outliers_sel',
  'train_score': 0.9055555555555556,
  'test_score': 0.897887323943662},
 {'index': 3,
  'train_df': 'X_train_without_outliers_norm_sel',
  'train_score': 0.9051912568306011,
  'test_score': 0.9011656143759106},
 {'index': 5,
  'train_df': 'X_train_without_outliers_minmax_sel',
  'train_score': 0.9044930176077717,
  'test_score': 0.9011656143759106},
 {'index': 1,
  'train_df': 'X_train_without_outliers_sel',
  'train_score': 0.9022161505768063,
  'test_score': 0.896551724137931}]

In [5]:
# Summarise the top-ranked entry of `resultados` as a small text box.
best_result = resultados[0]
report_lines = [
    f"The best train dataframe is |{best_result['train_df']}|.",
    "============================================================",
    f"| Train score: {best_result['train_score']}   |",
    "-------------------------------------",
    f"| Test score: {best_result['test_score']}    |",
    "=====================================",
]
print("\n".join(report_lines))


The best train dataframe is |X_train_with_outliers_norm_sel|.
| Train score: 0.9092289010321797   |
-------------------------------------
| Test score: 0.9031083050024283    |


## Step 2: Model hyperparameter optimization

- ### 2.1 Grid search (GridSearchCV)

In [6]:
# Baseline: default LogisticRegression on the variant ranked first in
# `resultados`, looked up by its stored index instead of a hard-coded position.
best_index = resultados[0]["index"]

model = LogisticRegression()
# fit() documents a 1-D target; y_train is a DataFrame read from CSV, so flatten it.
model.fit(train_dfs[best_index], y_train.to_numpy().ravel())
y_pred = model.predict(test_dfs[best_index])

base_accuracy = accuracy_score(y_test, y_pred)
base_accuracy

0.9031083050024283

In [7]:
from sklearn.model_selection import GridSearchCV

# We define the parameters that we want to adjust by hand.
# The space is a list of sub-grids so that only solver/penalty pairs that
# LogisticRegression accepts are tried. A single Cartesian grid also generates
# unsupported pairs (e.g. "l1" with "lbfgs"), whose fits fail and are scored NaN.
c_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
hyperparams = [
    # L2 is supported by every solver listed here.
    {"C": c_values, "penalty": ["l2"], "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]},
    # L1 is supported only by liblinear and saga.
    {"C": c_values, "penalty": ["l1"], "solver": ["liblinear", "saga"]},
    # Without a penalty C has no effect, so one candidate per solver is enough;
    # liblinear does not support penalty=None.
    {"penalty": [None], "solver": ["newton-cg", "lbfgs", "sag", "saga"]},
    # "elasticnet" is left out: it needs solver="saga" plus an explicit l1_ratio.
    # To search it, add e.g.
    # {"C": c_values, "penalty": ["elasticnet"], "solver": ["saga"], "l1_ratio": [0.5]}.
]

# We initialize the grid
grid = GridSearchCV(model, hyperparams, scoring = "accuracy", cv = 5)
grid

In [8]:
# warnings.warn is already replaced by a no-op in the notebook's import cell,
# so the override is not repeated here.
best_index = resultados[0]["index"]

# fit() documents a 1-D target; y_train is a DataFrame read from CSV, so flatten it.
grid.fit(train_dfs[best_index], y_train.to_numpy().ravel())

print(f"Best hyperparameters: {grid.best_params_}")

Best hyperparameters: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}


In [9]:
# Refit with the hyperparameters selected by the grid search, read from the
# fitted search object so this cell cannot go stale when the search is re-run.
best_index = resultados[0]["index"]

model_grid = LogisticRegression(**grid.best_params_)
# fit() documents a 1-D target; y_train is a DataFrame read from CSV, so flatten it.
model_grid.fit(train_dfs[best_index], y_train.to_numpy().ravel())
y_pred = model_grid.predict(test_dfs[best_index])

grid_accuracy = accuracy_score(y_test, y_pred)
grid_accuracy

0.902622632345799

---

- ### 2.2 Random search

In [10]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV

# We define the parameters we want to adjust.
# As with the grid search, the space is split into sub-spaces so that only
# solver/penalty pairs accepted by LogisticRegression can be sampled;
# unsupported pairs would fail to fit and waste iterations.
c_values = np.logspace(-4, 4, 20)
hyperparams = [
    # L2 is supported by every solver listed here.
    {"C": c_values, "penalty": ["l2"], "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]},
    # L1 is supported only by liblinear and saga.
    {"C": c_values, "penalty": ["l1"], "solver": ["liblinear", "saga"]},
    # Without a penalty C has no effect; liblinear does not support penalty=None.
    {"penalty": [None], "solver": ["newton-cg", "lbfgs", "sag", "saga"]},
    # "elasticnet" is left out: it needs solver="saga" plus an explicit l1_ratio.
]

# We initialize the random search
random_search = RandomizedSearchCV(model, hyperparams, n_iter = 100, scoring = "accuracy", cv = 5, random_state = 42)
random_search

In [11]:
# Run the random search on the variant ranked first in `resultados`.
best_index = resultados[0]["index"]

# fit() documents a 1-D target; y_train is a DataFrame read from CSV, so flatten it.
random_search.fit(train_dfs[best_index], y_train.to_numpy().ravel())

print(f"Best hyperparameters: {random_search.best_params_}")

Best hyperparameters: {'solver': 'sag', 'penalty': 'l2', 'C': 0.08858667904100823}


In [12]:
# Refit with the hyperparameters selected by the random search, read from the
# fitted search object so this cell cannot go stale when the search is re-run.
best_index = resultados[0]["index"]

# random_state pins the data shuffling used by the stochastic solvers
# (sag, saga, liblinear) so the reported accuracy is reproducible.
model_random_search = LogisticRegression(**random_search.best_params_, random_state = 42)
# fit() documents a 1-D target; y_train is a DataFrame read from CSV, so flatten it.
model_random_search.fit(train_dfs[best_index], y_train.to_numpy().ravel())
y_pred = model_random_search.predict(test_dfs[best_index])

random_search_accuracy = accuracy_score(y_test, y_pred)
random_search_accuracy

0.9025012141816415