In [2]:
import pandas as pd
from google.colab import drive # Import 'drive' module from the 'google.colab' library
drive.mount('/content/drive') # This mounts your Google Drive into the Colab virtual environment.

file_path = '/content/drive/MyDrive/healthcare-dataset-stroke-data.csv'
df = pd.read_csv(file_path)

print(df.head())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
      id  gender   age  hypertension  heart_disease ever_married  \
0   9046    Male  67.0             0              1          Yes   
1  51676  Female  61.0             0              0          Yes   
2  31112    Male  80.0             0              1          Yes   
3  60182  Female  49.0             0              0          Yes   
4   1665  Female  79.0             1              0          Yes   

       work_type Residence_type  avg_glucose_level   bmi   smoking_status  \
0        Private          Urban             228.69  36.6  formerly smoked   
1  Self-employed          Rural             202.21   NaN     never smoked   
2        Private          Rural             105.92  32.5     never smoked   
3        Private          Urban             171.23  34.4           smokes   
4  Self-employed          Rural             174.12  24.0     never smoked   


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

# Step 1: Load dataset


# Step 2: Drop irrelevant column
df.drop(columns=['id'], inplace=True)

# Step 3: Handle missing values (bmi)
df['bmi'].fillna(df['bmi'].mean(), inplace=True)

# Step 4: Encode binary categorical columns using Label Encoding
binary_cols = ['gender', 'ever_married', 'Residence_type']
le = LabelEncoder()
for col in binary_cols:
    df[col] = le.fit_transform(df[col])

# Step 5: One-hot encode multi-class categorical columns
df = pd.get_dummies(df, columns=['work_type', 'smoking_status'], drop_first=True)

# Step 6: Separate features and target
X = df.drop('stroke', axis=1)
y = df['stroke']

# Step 7: Scale numerical features
scaler = StandardScaler()
num_cols = ['age', 'avg_glucose_level', 'bmi']
X[num_cols] = scaler.fit_transform(X[num_cols])

# Step 8: Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 9: Apply Logistic Regression
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Step 10: Make predictions
y_pred = model.predict(X_test)

# Step 11: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

# Step 12: Print results
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)


Accuracy: 0.952054794520548
Confusion Matrix:
 [[972   0]
 [ 49   1]]
Precision: 1.0
Recall: 0.02
F1-Score: 0.0392156862745098


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['bmi'].fillna(df['bmi'].mean(), inplace=True)


In [10]:
import pandas as pd
from google.colab import drive # Import 'drive' module from the 'google.colab' library
drive.mount('/content/drive') # This mounts your Google Drive into the Colab virtual environment.

file_path = '/content/drive/MyDrive/healthcare-dataset-stroke-data.csv'
df_raw = pd.read_csv(file_path)


try:
    # Step 1: Load raw dataset again


    # Step 2: Separate features and target without preprocessing
    X_raw = df_raw.drop('stroke', axis=1)
    y_raw = df_raw['stroke']

    # Step 3: Split data
    X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
        X_raw, y_raw, test_size=0.2, random_state=42, stratify=y_raw
    )

    # Step 4: Train model directly
    model_raw = LogisticRegression(max_iter=1000)
    model_raw.fit(X_train_raw, y_train_raw)  # This will raise error
    y_pred_raw = model_raw.predict(X_test_raw)

    # Step 5: Evaluate
    print("Accuracy:", accuracy_score(y_test_raw, y_pred_raw))
    print("Confusion Matrix:\n", confusion_matrix(y_test_raw, y_pred_raw))
    print("Precision:", precision_score(y_test_raw, y_pred_raw, zero_division=0))
    print("Recall:", recall_score(y_test_raw, y_pred_raw, zero_division=0))
    print("F1-Score:", f1_score(y_test_raw, y_pred_raw, zero_division=0))

except Exception as e:
    print("Error occurred while training without preprocessing:")
    print(e)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Error occurred while training without preprocessing:
could not convert string to float: 'Female'


Conclusion:
Applying Logistic Regression without preprocessing resulted in an error due to the presence of missing values and non-numeric categorical data, which the model could not handle. On the other hand, the preprocessed model successfully ran and delivered meaningful performance metrics like Accuracy: 0.952054794520548
Confusion Matrix:
 [[972   0]
 [ 49   1]]
Precision: 1.0
Recall: 0.02
F1-Score: 0.0392156862745098