In [1]:
import pandas as pd

In [2]:
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
model = LogisticRegression()

In [4]:
df1 = pd.read_csv('df_dropped_values.csv')
df2 = pd.read_csv('df_imputated_values.csv')

In [5]:
df1.dtypes.unique()

array([dtype('float64'), dtype('int64'), dtype('bool')], dtype=object)

In [6]:
df2.dtypes.unique()

array([dtype('float64'), dtype('int64'), dtype('bool')], dtype=object)

There are boolean values due to one-hot encoding so first we will treat them

In [7]:
df1[df1.select_dtypes(bool).columns] = df1.select_dtypes(bool).astype(int)


In [8]:
df2[df2.select_dtypes(bool).columns] = df2.select_dtypes(bool).astype(int)


Our dataset is very big so I will just use it's sample to do our model training 

In [9]:
df1_reduced = df1.sample(n=25000, random_state=42)
df2_reduced = df2.sample(n=25000, random_state=42)

X1 = df1_reduced.drop(columns=['hospital_death'])
y1 = df1_reduced['hospital_death']

X2 = df2_reduced.drop(columns=['hospital_death'])
y2 = df2_reduced['hospital_death']


## I will apply the scaling and normalization in these cells

In [10]:
from sklearn.preprocessing import RobustScaler
import numpy as np

# Apply robust scaling (handles outliers better than StandardScaler)
scaler = RobustScaler()
X1 = scaler.fit_transform(X1)
X2 = scaler.transform(X2)

# Clip extreme values to avoid overflow
X1 = np.clip(X1, -10, 10)
X2 = np.clip(X2, -10, 10)


In [11]:
from sklearn.preprocessing import Normalizer


normalizer = Normalizer()  

X1 = normalizer.fit_transform(X1)
X2 = normalizer.transform(X2)


In [12]:
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2, random_state=42)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42)



### Logistic Regression Model Overview

Logistic Regression is a supervised classification algorithm used to predict the probability of a binary outcome. It models the relationship between input features and the log-odds of the target class using a linear combination of the features. The model estimates coefficients (weights) for each feature and an intercept term.

Key aspects include:

* **Solver:** Optimization algorithm used to find the best-fitting coefficients (e.g., `liblinear`, `saga`). Choice depends on dataset size and penalty type.
* **Penalty (Regularization):** Controls model complexity to prevent overfitting. Common penalties include L2 (Ridge) regularization.
* **Regularization strength (`C`):** Inverse of regularization strength. Smaller values specify stronger regularization.
* **Max Iterations (`max_iter`):** Number of optimization steps allowed for convergence.
* **Model Outputs:**

  * Coefficients and intercept define the decision boundary.
  * Predicted probabilities for class membership.
  * Classification accuracy on training and test data measures performance.

Logistic Regression is effective for binary classification tasks, especially when the relationship between features and output is approximately linear in the log-odds space.




In [13]:
clf = LogisticRegression(
    solver="liblinear",   
    penalty="l2",
    C=0.05,               
    max_iter=10000,
    tol=1e-3
)
clf.fit(X1, y1)

print("coef finite?", np.isfinite(clf.coef_).all(), "intercept finite?", np.isfinite(clf.intercept_).all())
print("Train acc:", clf.score(X1, y1))
print("Test  acc:",  clf.score(X2, y2))


coef finite? True intercept finite? True
Train acc: 0.91812
Test  acc: 0.91952


In [14]:
from sklearn.linear_model import LogisticRegression

model2 = LogisticRegression(solver='saga', max_iter=500, C=1.0, penalty='l2')
model2.fit(X2_train, y2_train)
# Accuracy on training data
train_accuracy = model2.score(X2_train, y2_train)
print("Train accuracy:", train_accuracy)

# Accuracy on test data (if you have X2_test, y2_test)
test_accuracy = model2.score(X2_test, y2_test)
print("Test accuracy:", test_accuracy)



Train accuracy: 0.9209
Test accuracy: 0.9294
