In [1]:
# Week 09 - Machine Learning with Scikit-learn
# Name: Arati Rajubhai Gohil
# Assignment: Logistic Regression Solver Comparison

In [2]:
# Import Libraries
import time
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.dummy import DummyClassifier

In [3]:
# Load the data
Pt_Analytics= pd.read_csv("/content/PatientAnalyticFile.csv")
Pt_Analytics.head()

Unnamed: 0,PatientID,DateOfBirth,Gender,Race,Myocardial_infarction,Congestive_heart_failure,Peripheral_vascular_disease,Stroke,Dementia,Pulmonary,...,Metastatic_solid_tumour,HIV,Obesity,Depression,Hypertension,Drugs,Alcohol,First_Appointment_Date,Last_Appointment_Date,DateOfDeath
0,1,1962-02-27,female,hispanic,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2013-04-27,2018-06-01,
1,2,1959-08-18,male,white,0,0,0,0,0,0,...,0,0,0,0,1,0,0,2005-11-30,2008-11-02,2008-11-02
2,3,1946-02-15,female,white,0,0,0,0,0,0,...,0,1,0,0,1,0,0,2011-11-05,2015-11-13,
3,4,1979-07-27,female,white,0,0,0,0,0,1,...,0,0,0,0,0,0,0,2010-03-01,2016-01-17,2016-01-17
4,5,1983-02-19,female,hispanic,0,0,0,0,0,0,...,0,0,0,0,1,0,0,2006-09-22,2018-06-01,


In [4]:
Pt_Analytics.shape

(20000, 29)

In [5]:
Pt_Analytics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 29 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   PatientID                       20000 non-null  int64 
 1   DateOfBirth                     20000 non-null  object
 2   Gender                          20000 non-null  object
 3   Race                            20000 non-null  object
 4   Myocardial_infarction           20000 non-null  int64 
 5   Congestive_heart_failure        20000 non-null  int64 
 6   Peripheral_vascular_disease     20000 non-null  int64 
 7   Stroke                          20000 non-null  int64 
 8   Dementia                        20000 non-null  int64 
 9   Pulmonary                       20000 non-null  int64 
 10  Rheumatic                       20000 non-null  int64 
 11  Peptic_ulcer_disease            20000 non-null  int64 
 12  LiverMild                       20000 non-null

In [6]:
Pt_Analytics.columns

Index(['PatientID', 'DateOfBirth', 'Gender', 'Race', 'Myocardial_infarction',
       'Congestive_heart_failure', 'Peripheral_vascular_disease', 'Stroke',
       'Dementia', 'Pulmonary', 'Rheumatic', 'Peptic_ulcer_disease',
       'LiverMild', 'Diabetes_without_complications',
       'Diabetes_with_complications', 'Paralysis', 'Renal', 'Cancer',
       'LiverSevere', 'Metastatic_solid_tumour', 'HIV', 'Obesity',
       'Depression', 'Hypertension', 'Drugs', 'Alcohol',
       'First_Appointment_Date', 'Last_Appointment_Date', 'DateOfDeath'],
      dtype='object')

In [7]:
# Create a new column 'Diabetes_Status'
# 0 = No diabetes
# 1 = Diabetes without complications
# 2 = Diabetes with complications

Pt_Analytics['Diabetes_Status'] = np.where(
    Pt_Analytics['Diabetes_with_complications'] == 1, 2,
    np.where(Pt_Analytics['Diabetes_without_complications'] == 1, 1, 0))

In [8]:
columns_to_drop = [
    'PatientID', 'DateOfBirth', 'First_Appointment_Date',
    'Last_Appointment_Date', 'DateOfDeath',
    'Diabetes_with_complications', 'Diabetes_without_complications'
]
Pt_Analytics.drop(columns=columns_to_drop, inplace=True)


In [9]:
Pt_Analytics.head()

Unnamed: 0,Gender,Race,Myocardial_infarction,Congestive_heart_failure,Peripheral_vascular_disease,Stroke,Dementia,Pulmonary,Rheumatic,Peptic_ulcer_disease,...,Cancer,LiverSevere,Metastatic_solid_tumour,HIV,Obesity,Depression,Hypertension,Drugs,Alcohol,Diabetes_Status
0,female,hispanic,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1,male,white,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
2,female,white,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
3,female,white,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,female,hispanic,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [10]:
# ncode Categorical Columns (Gender and Race)
Pt_Analytics = pd.get_dummies(Pt_Analytics, columns=['Gender', 'Race'], drop_first=True)

In [11]:
Pt_Analytics.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Myocardial_infarction,20000.0,0.0456,0.208621,0.0,0.0,0.0,0.0,1.0
Congestive_heart_failure,20000.0,0.04345,0.203873,0.0,0.0,0.0,0.0,1.0
Peripheral_vascular_disease,20000.0,0.02395,0.152897,0.0,0.0,0.0,0.0,1.0
Stroke,20000.0,0.02865,0.166825,0.0,0.0,0.0,0.0,1.0
Dementia,20000.0,0.0314,0.174401,0.0,0.0,0.0,0.0,1.0
Pulmonary,20000.0,0.07265,0.259568,0.0,0.0,0.0,0.0,1.0
Rheumatic,20000.0,0.0123,0.110224,0.0,0.0,0.0,0.0,1.0
Peptic_ulcer_disease,20000.0,0.00965,0.097762,0.0,0.0,0.0,0.0,1.0
LiverMild,20000.0,0.00925,0.095733,0.0,0.0,0.0,0.0,1.0
Paralysis,20000.0,0.01355,0.115616,0.0,0.0,0.0,0.0,1.0


In [12]:
# Define Features (X) and Target (y)
X = Pt_Analytics.drop(columns=['Diabetes_Status'])
y = Pt_Analytics['Diabetes_Status']

In [13]:
# Train-Test Split (80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [14]:
# Standardize Features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [16]:
# Evaluate Different Solvers
solvers = ['liblinear', 'lbfgs', 'sag', 'saga', 'newton-cg', 'newton-cholesky']
results = []

for solver in solvers:
    try:

        if solver == 'liblinear':
            model = LogisticRegression(
                solver=solver,
                penalty='l2',
                multi_class='ovr',
                max_iter=10000
            )
        else:
            model = LogisticRegression(
                solver=solver,
                penalty=None,
                multi_class='multinomial',
                max_iter=10000
            )

        start_time = time.time()
        model.fit(X_train_scaled, y_train)
        end_time = time.time()

        train_acc = accuracy_score(y_train, model.predict(X_train_scaled))
        test_acc = accuracy_score(y_test, model.predict(X_test_scaled))
        elapsed = round(end_time - start_time, 4)

        results.append({
            'Solver used': solver,
            'Training subset accuracy': round(train_acc, 4),
            'Holdout subset accuracy': round(test_acc, 4),
            'Time taken (s)': elapsed
        })

    except Exception as e:
        print(f"Solver '{solver}' failed: {e}")
        results.append({
            'Solver used': solver,
            'Training subset accuracy': 'Error',
            'Holdout subset accuracy': 'Error',
            'Time taken (s)': 'Error'
        })



In [19]:
# Display Results
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Solver used,Training subset accuracy,Holdout subset accuracy,Time taken (s)
0,liblinear,0.8626,0.8625,0.2411
1,lbfgs,0.8626,0.8625,0.1329
2,sag,0.8626,0.8625,0.9826
3,saga,0.8626,0.8625,0.6961
4,newton-cg,0.8626,0.8625,0.2089
5,newton-cholesky,0.8626,0.8625,0.384


Logistic Regression Solver Comparison
After running logistic regression using different solvers (liblinear, lbfgs, sag, saga, newton-cg, and newton-cholesky), I noticed something interesting: all of them gave me the exact same accuracy scores — both on the training set and the test (holdout) set.

Training accuracy: 86.26%

Test accuracy: 86.25%

This makes sense because all the solvers are working to optimize the same loss function — and since I didn’t apply any regularization, there was nothing to change the way each solver behaves. They were all given the same features, same split of the data (80/20), and the same scaling. Naturally, they all landed at the same solution.

So if performance (in terms of accuracy) is identical, I needed to look at how long each solver took to train. That’s where the real difference showed up.

From my results, lbfgs came out on top in terms of speed, finishing the training in just about 0.104 seconds. Other solvers like sag and newton-cg were a bit slower.

Conclusion
All solvers gave the same level of accuracy, but lbfgs was the fastest.
That’s why I’d consider lbfgs the best option here — it gives solid performance and is more efficient.