# Ridge Regularization (aka L2 Regularization)

In [5]:
import pandas as pd

In [11]:
# Load the diabetes CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; point DIABETES_CSV
# at your own copy of diabetes.csv before running on another machine.
DIABETES_CSV = r"C:\Users\acreddy\Desktop\abc\LogisticReg-Binary_multiclass\data\diabetes.csv"
pima_diab = pd.read_csv(DIABETES_CSV)

In [16]:
# Rename the target column from "Outcome" to "Class" for better readability.
# rename() returns a new frame that is re-bound to the same name; this avoids
# inplace=True, which gives no benefit and hides mutation of notebook state.
# (pima_diab.columns.str.replace("Outcome", "Class") is an equivalent alternative.)
pima_diab = pima_diab.rename(columns={"Outcome": "Class"})

In [17]:
# Preview the first two rows to confirm the target column is now named "Class".
pima_diab.head(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0


In [18]:
# Separate the independent variables (X) from the target column (y).
target_col = "Class"
X = pima_diab.drop(columns=target_col)
y = pima_diab[target_col]

In [19]:
# split the data in to train and test 
from sklearn.model_selection import train_test_split

In [20]:
# Hold out 25% of the rows for testing; random_state=42 pins the split.
# NOTE(review): no stratify= argument is passed -- confirm that is intended
# given the class imbalance mentioned in the SMOTE cell further down.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [21]:
# lets scale the data
from sklearn.preprocessing import StandardScaler

In [22]:
# Create the StandardScaler; it is fitted on the training split in the next cell.
scaler= StandardScaler()

In [23]:
# Fit the scaler on the training split only, then apply that same fitted
# scaler to both splits so the test data never takes part in the fit.
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [24]:
# since we have imbalanced classes and also data size is small we will do over sampling using SMOTE
from imblearn.over_sampling import SMOTE

In [25]:
# Create the SMOTE over-sampler; random_state=42 keeps the seed fixed across reruns.
smote = SMOTE(sampling_strategy="auto", random_state=42)

In [26]:
# Over-sample the scaled training split only; the test split is not resampled.
X_train_scaled_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

In [28]:
from sklearn.linear_model import LogisticRegression

In [46]:
# Logistic regression with a ridge (L2) penalty. The solver is left at its
# default ("lbfgs"), which does not support L1 regularization, hence L2 here.
# Note: C is the *inverse* regularization strength (C = 1 / alpha), so it
# plays the role of alpha in reverse.
model_ridge_regular = LogisticRegression(
    penalty="l2",
    C=10,
    max_iter=500,
    random_state=42,
)

In [47]:
# Fit the ridge (L2) model on the scaled, SMOTE-resampled training data.
model_ridge_regular.fit(X_train_scaled_resampled, y_train_resampled)

LogisticRegression(C=10, max_iter=500, random_state=42)

In [48]:
# Predict class labels for the scaled test data; the predictions are scored
# against y_test in the metrics cells that follow.
y_pred_ridge = model_ridge_regular.predict(X_test_scaled)

In [49]:
# lets get the metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [50]:
# Score the ridge predictions against the held-out test labels.
ridge_eval = (y_test, y_pred_ridge)  # (true labels, predicted labels)
accuracy = accuracy_score(*ridge_eval)
conf_matrix = confusion_matrix(*ridge_eval)
class_report = classification_report(*ridge_eval)

In [51]:
# Accuracy of the ridge (L2) model on the test split.
accuracy

0.6875

In [52]:
# Confusion matrix of y_test vs. the ridge (L2) predictions.
conf_matrix

array([[83, 40],
       [20, 49]], dtype=int64)

In [54]:
# classification_report returns a pre-formatted string, so print() it to
# render the per-class precision / recall / f1 table for the ridge model.
print(class_report)

              precision    recall  f1-score   support

           0       0.81      0.67      0.73       123
           1       0.55      0.71      0.62        69

    accuracy                           0.69       192
   macro avg       0.68      0.69      0.68       192
weighted avg       0.71      0.69      0.69       192



Note: the overall accuracy above may have fallen, but the recall (true positive rate, TPR) of the positive class (class 1) has increased, i.e. the model now correctly identifies a larger share of the actual positive cases.

# Lasso Regularization (aka L1 Regularization)

Note: the default "lbfgs" solver does not support L1 regularization, so "saga" is used here. ("liblinear" also supports the L1 penalty; "sag" and "newton-cg", like "lbfgs", do not.)

In [69]:
# Logistic regression with a lasso (L1) penalty, fitted with the "saga" solver.
model_lasso_regular = LogisticRegression(
    penalty="l1",
    solver="saga",
    C=0.5,
    max_iter=500,
    random_state=42,
)

In [70]:
# Fit the lasso (L1) model on the same scaled, SMOTE-resampled training data
# that the ridge model was fitted on.
model_lasso_regular.fit(X_train_scaled_resampled, y_train_resampled)

LogisticRegression(C=0.5, max_iter=500, penalty='l1', random_state=42,
                   solver='saga')

In [71]:
# Predict class labels for the scaled test data; the predictions are scored
# against y_test in the metrics cell that follows.
y_pred_lasso = model_lasso_regular.predict(X_test_scaled)

In [75]:
# Score the lasso predictions against the held-out test labels.
lasso_eval = (y_test, y_pred_lasso)  # (true labels, predicted labels)
accuracy_l1 = accuracy_score(*lasso_eval)
conf_matrix_l1 = confusion_matrix(*lasso_eval)
class_report_l1 = classification_report(*lasso_eval)

In [73]:
# Accuracy of the lasso (L1) model on the test split.
accuracy_l1

0.6927083333333334

In [74]:
# Confusion matrix of y_test vs. the lasso (L1) predictions.
conf_matrix_l1

array([[84, 39],
       [20, 49]], dtype=int64)

In [76]:
# classification_report returns a pre-formatted string, so print() it to
# render the per-class precision / recall / f1 table for the lasso model.
print(class_report_l1)

              precision    recall  f1-score   support

           0       0.81      0.68      0.74       123
           1       0.56      0.71      0.62        69

    accuracy                           0.69       192
   macro avg       0.68      0.70      0.68       192
weighted avg       0.72      0.69      0.70       192



Note: compared with the ridge (L2) model above, this regularization slightly increased the precision of class 1 (0.55 → 0.56) and the recall of class 0 (0.67 → 0.68).