<a href="https://colab.research.google.com/github/Ay1932/Machine-Learning/blob/main/Heart_dieases.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Load the heart attack prediction dataset**

In [1]:
import pandas as pd

# Load the dataset.
# NOTE(review): the path points into /content/drive, so this presumably needs
# Google Drive to be mounted in the Colab session first -- confirm.
file_path = "/content/drive/MyDrive/heart_attack_prediction_dataset.csv"
df = pd.read_csv(file_path)

# Display first few rows
print(df.head())

# Check column names & data types.
# DataFrame.info() prints its report itself and returns None, so it is called
# bare; wrapping it in print() would append a stray "None" line to the output.
df.info()

# Check for missing values (number of nulls per column)
print(df.isnull().sum())


  Patient ID  Age     Sex  Cholesterol Blood Pressure  Heart Rate  Diabetes  \
0    BMW7812   67    Male          208         158/88          72         0   
1    CZE1114   21    Male          389         165/93          98         1   
2    BNI9906   21  Female          324         174/99          72         1   
3    JLN3497   84    Male          383        163/100          73         1   
4    GFO8847   66    Male          318          91/88          93         1   

   Family History  Smoking  Obesity  ...  Sedentary Hours Per Day  Income  \
0               0        1        0  ...                 6.615001  261404   
1               1        1        1  ...                 4.963459  285768   
2               0        0        0  ...                 9.463426  235282   
3               1        1        0  ...                 7.648981  125640   
4               1        1        1  ...                 1.514821  160555   

         BMI  Triglycerides  Physical Activity Days Per Week  

**Preprocessing the data**

In [2]:
from sklearn.preprocessing import LabelEncoder

# NOTE: this cell rebinds `df` after dropping columns, so it is not re-runnable
# on its own: a second run raises KeyError because the dropped columns are gone.
# Re-run the loading cell first.

# Drop unnecessary columns
df = df.drop(columns=["Patient ID", "Country", "Continent", "Hemisphere"])

# Split 'Blood Pressure' (text of the form "158/88") into 'Systolic BP' and 'Diastolic BP'
df[['Systolic BP', 'Diastolic BP']] = df["Blood Pressure"].str.split("/", expand=True).astype(float)

# Drop the original 'Blood Pressure' column
df = df.drop(columns=["Blood Pressure"])

# Encode categorical columns as integer codes.
# A fresh encoder is fitted for every column and kept in `label_encoders`, so
# each column's mapping (encoder.classes_) stays available for inspection or
# inverse_transform. Refitting one shared encoder would keep only the mapping
# of the last column.
categorical_columns = ["Sex", "Diet"]
label_encoders = {}
for col in categorical_columns:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

# Confirm data preprocessing
print(df.head())


   Age  Sex  Cholesterol  Heart Rate  Diabetes  Family History  Smoking  \
0   67    1          208          72         0               0        1   
1   21    1          389          98         1               1        1   
2   21    0          324          72         1               0        0   
3   84    1          383          73         1               1        1   
4   66    1          318          93         1               1        1   

   Obesity  Alcohol Consumption  Exercise Hours Per Week  ...  Stress Level  \
0        0                    0                 4.168189  ...             9   
1        1                    1                 1.813242  ...             1   
2        0                    0                 2.078353  ...             9   
3        0                    1                 9.828130  ...             9   
4        1                    0                 5.804299  ...             6   

   Sedentary Hours Per Day  Income        BMI  Triglycerides  \
0         

**Define the features (X) and the target (y)**

In [3]:
# Name of the column holding the label to predict
target_column = "Heart Attack Risk"

# y is the label column; X is every remaining column of df
y = df[target_column]
X = df.drop(target_column, axis="columns")


**After defining X and y, split the data into training and test sets**

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; stratify on y so both splits keep the
# same class proportions, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# Report the shape of each split (features first, then labels)
print("Training Set:", *(part.shape for part in (X_train, y_train)))
print("Testing Set:", *(part.shape for part in (X_test, y_test)))


Training Set: (7010, 22) (7010,)
Testing Set: (1753, 22) (1753,)


In [5]:
# Inspect the dtype of every feature column in the training split
feature_dtypes = X_train.dtypes
print(feature_dtypes)


Age                                  int64
Sex                                  int64
Cholesterol                          int64
Heart Rate                           int64
Diabetes                             int64
Family History                       int64
Smoking                              int64
Obesity                              int64
Alcohol Consumption                  int64
Exercise Hours Per Week            float64
Diet                                 int64
Previous Heart Problems              int64
Medication Use                       int64
Stress Level                         int64
Sedentary Hours Per Day            float64
Income                               int64
BMI                                float64
Triglycerides                        int64
Physical Activity Days Per Week      int64
Sleep Hours Per Day                  int64
Systolic BP                        float64
Diastolic BP                       float64
dtype: object


In [6]:
from sklearn.preprocessing import OrdinalEncoder

# Find categorical (object-dtype) feature columns
categorical_columns = X_train.select_dtypes(include=["object"]).columns

# Encode categorical features.
# OrdinalEncoder is scikit-learn's encoder for feature columns (LabelEncoder is
# documented for target labels only). It is fitted on the training split and
# reused on the test split; a category that occurs only in the test split is
# mapped to -1 instead of raising ValueError.
# Note: the resulting codes are floats (OrdinalEncoder's default output dtype).
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

# Skip the transform when no object columns exist: the encoder cannot be fitted
# on a frame with zero columns.
if len(categorical_columns) > 0:
    X_train[categorical_columns] = encoder.fit_transform(X_train[categorical_columns])
    X_test[categorical_columns] = encoder.transform(X_test[categorical_columns])

**Training the model**
1. Logistic regression

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Compute class weights to handle imbalance.
# The weights are derived from the training labels only, so nothing about the
# held-out test labels influences the fitted model.
classes = np.unique(y_train)
class_weights = compute_class_weight("balanced", classes=classes, y=y_train)

# Map each class label to its weight; built from the labels actually present
# rather than assuming they are exactly 0 and 1.
class_weight_dict = dict(zip(classes.tolist(), class_weights))

# Initialize and train the model on the unscaled features.
# Unscaled inputs can keep the solver from converging within max_iter; in that
# case scikit-learn emits a ConvergenceWarning recommending feature scaling.
model = LogisticRegression(class_weight=class_weight_dict, random_state=42, max_iter=500)
model.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
# Collect the names of the object-dtype feature columns (only the "object"
# dtype is selected here) and show them
object_features = X_train.select_dtypes("object")
non_numeric_columns = object_features.columns
print("Non-numeric columns:", non_numeric_columns)


Non-numeric columns: Index([], dtype='object')


In [9]:
from sklearn.preprocessing import OrdinalEncoder

# Encode categorical features.
# OrdinalEncoder is scikit-learn's encoder for feature columns (LabelEncoder is
# documented for target labels only). It is fitted on the training split and
# reused on the test split; a category that occurs only in the test split is
# mapped to -1 instead of raising ValueError.
# Note: the resulting codes are floats (OrdinalEncoder's default output dtype).
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

# Skip the transform when there are no non-numeric columns: the encoder cannot
# be fitted on a frame with zero columns.
if len(non_numeric_columns) > 0:
    X_train[non_numeric_columns] = encoder.fit_transform(X_train[non_numeric_columns])
    X_test[non_numeric_columns] = encoder.transform(X_test[non_numeric_columns])


In [10]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Standardise the features: the scaler is fitted on the training split and the
# same fitted scaler transforms both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# "Balanced" class weights computed from the training labels
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train),
    y=y_train,
)
# Weight lookup keyed by the class labels 0 and 1
class_weight_dict = {label: class_weights[label] for label in (0, 1)}

# Fit logistic regression on the scaled training data
model = LogisticRegression(
    max_iter=1000,
    random_state=42,
    class_weight=class_weight_dict,
)
model.fit(X_train_scaled, y_train)
