In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.feature_selection import RFE

# Step 1: Read the heart dataset
heart_df = pd.read_csv('heart_disease (1).csv')  # Adjust the filename as per your dataset

# Step 2: Exploratory Data Analysis (EDA)
# Data Quality Check
print(heart_df.info())
print(heart_df.describe())

# Treat Missing Values if any
heart_df.dropna(inplace=True)  # Example: Drop rows with missing values, you might want a more sophisticated approach

# Step 3: Transform Categorical Data
# Example: Convert categorical columns to numerical using one-hot encoding
heart_df = pd.get_dummies(heart_df, columns=['gender', 'cp', 'restecg', 'slope', 'thal'])

# Step 4: Split the data into the train and test set
X = heart_df.drop('heart_diagnosis', axis=1)
y = heart_df['heart_diagnosis']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Undersample and oversample the data
# Undersample using RandomUnderSampler
undersampler = RandomUnderSampler(sampling_strategy='majority')
X_undersampled, y_undersampled = undersampler.fit_resample(X_train, y_train)

# Oversample using RandomOverSampler
oversampler = RandomOverSampler(sampling_strategy='minority')
X_oversampled, y_oversampled = oversampler.fit_resample(X_train, y_train)

# Step 6: Apply the Logistic Regression model on the skewed data, undersampled data, and oversampled data
# Logistic Regression on Skewed Data
model_skewed = LogisticRegression(random_state=42)
model_skewed.fit(X_train, y_train)

# Logistic Regression on Undersampled Data
model_undersampled = LogisticRegression(random_state=42)
model_undersampled.fit(X_undersampled, y_undersampled)

# Logistic Regression on Oversampled Data
model_oversampled = LogisticRegression(random_state=42)
model_oversampled.fit(X_oversampled, y_oversampled)

# Step 7: Print the model results
# Skewed Data
print("\nResults on Skewed Data:")
y_pred_skewed = model_skewed.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred_skewed))
print(classification_report(y_test, y_pred_skewed))

# Undersampled Data
print("\nResults on Undersampled Data:")
y_pred_undersampled = model_undersampled.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred_undersampled))
print(classification_report(y_test, y_pred_undersampled))

# Oversampled Data
print("\nResults on Oversampled Data:")
y_pred_oversampled = model_oversampled.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred_oversampled))
print(classification_report(y_test, y_pred_oversampled))

# Step 8: Get the feature importance
# Feature importance from the Skewed Data Model
print("\nFeature Importance from Skewed Data:")
print(model_skewed.coef_)

# Feature importance from the Undersampled Data Model
print("\nFeature Importance from Undersampled Data:")
print(model_undersampled.coef_)

# Feature importance from the Oversampled Data Model
print("\nFeature Importance from Oversampled Data:")
print(model_oversampled.coef_)

# Additional: You can use Recursive Feature Elimination (RFE) to get more refined feature importance
selector = RFE(model_oversampled, n_features_to_select=1)
selector.fit(X_oversampled, y_oversampled)

print("\nRFE Feature Ranking:")
for feature, rank in zip(X.columns, selector.ranking_):
    print(f'{feature}: {rank}')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              303 non-null    float64
 1   gender           303 non-null    float64
 2   cp               303 non-null    float64
 3   trestbps         303 non-null    float64
 4   chol             303 non-null    float64
 5   fbs              303 non-null    float64
 6   restecg          303 non-null    float64
 7   thalach          303 non-null    float64
 8   exang            303 non-null    float64
 9   oldpeak          303 non-null    float64
 10  slope            303 non-null    float64
 11  ca               299 non-null    float64
 12  thal             301 non-null    float64
 13  heart_diagnosis  303 non-null    int64  
dtypes: float64(13), int64(1)
memory usage: 33.3 KB
None
              age      gender          cp    trestbps        chol         fbs  \
count  303.000000  30

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


Results on Skewed Data:
Accuracy: 0.6166666666666667
              precision    recall  f1-score   support

           0       0.79      0.94      0.86        36
           1       0.14      0.11      0.12         9
           2       0.25      0.20      0.22         5
           3       0.20      0.14      0.17         7
           4       0.00      0.00      0.00         3

    accuracy                           0.62        60
   macro avg       0.28      0.28      0.27        60
weighted avg       0.54      0.62      0.57        60


Results on Undersampled Data:
Accuracy: 0.21666666666666667
              precision    recall  f1-score   support

           0       0.80      0.11      0.20        36
           1       0.16      0.78      0.26         9
           2       0.33      0.20      0.25         5
           3       0.17      0.14      0.15         7
           4       0.00      0.00      0.00         3

    accuracy                           0.22        60
   macro avg    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


RFE Feature Ranking:
age: 20
trestbps: 22
chol: 23
fbs: 7
thalach: 21
exang: 8
oldpeak: 15
ca: 6
gender_0.0: 10
gender_1.0: 17
cp_1.0: 14
cp_2.0: 11
cp_3.0: 9
cp_4.0: 1
restecg_0.0: 2
restecg_1.0: 19
restecg_2.0: 12
slope_1.0: 5
slope_2.0: 18
slope_3.0: 4
thal_3.0: 13
thal_6.0: 16
thal_7.0: 3
