In [1]:
import pandas as pd

# Load CSV file
df = pd.read_csv("hdp_data.csv")

# Show basic info
print("📌 Data Overview:")
print(df.head())  # Show first 5 rows

print("\n📌 Column Info:")
print(df.info())  # Show column types and null values

print("\n📌 Summary Statistics:")
print(df.describe())  # Check numerical stats


📌 Data Overview:
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope   
0   63    1   3       145   233    1        0      150      0      2.3      0  \
1   37    1   2       130   250    0        1      187      0      3.5      0   
2   41    0   1       130   204    0        0      172      0      1.4      2   
3   56    1   1       120   236    0        1      178      0      0.8      2   
4   57    0   0       120   354    0        1      163      1      0.6      2   

   ca  thal  target  
0   0     1       1  
1   0     2       1  
2   0     2       1  
3   0     2       1  
4   0     2       1  

📌 Column Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import joblib

# Load dataset (assuming 'df' is already loaded)
# df = pd.read_csv("your_data.csv")  # Uncomment if loading from a file

# Identify categorical and numerical columns
categorical_cols = ["sex", "cp", "fbs", "restecg", "exang", "slope", "ca", "thal"]
numerical_cols = ["age", "trestbps", "chol", "thalach", "oldpeak"]

# Split X (features) and y (target)
X = df.drop(columns=["target"])  # Features
y = df["target"]  # Target

# Convert categorical columns to integer dtype (for embedding layers)
X[categorical_cols] = X[categorical_cols].astype(int)

# Split data into train & test (80-20)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Normalize numerical data separately
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[numerical_cols])
X_test_scaled = scaler.transform(X_test[numerical_cols])

# Convert back to DataFrame and reassign numerical values
X_train[numerical_cols] = X_train_scaled
X_test[numerical_cols] = X_test_scaled

# Save to CSV for verification
X_train.to_csv("X_train.csv", index=False)
X_test.to_csv("X_test.csv", index=False)
y_train.to_csv("y_train.csv", index=False)
y_test.to_csv("y_test.csv", index=False)

# Show shape of datasets
print("✅ Data Split & Scaling Completed")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

# 𝗔𝗱𝗱 𝘁𝗵𝗶𝘀 𝘁𝗼 𝗦𝗔𝗩𝗘 𝘁𝗵𝗲 𝘀𝗰𝗮𝗹𝗲𝗿:
joblib.dump(scaler, 'scaler.pkl') 


✅ Data Split & Scaling Completed
X_train shape: (242, 13)
X_test shape: (61, 13)
y_train shape: (242,)
y_test shape: (61,)


['scaler.pkl']

In [None]:
import torch.optim as optim
import torch.nn as nn
import torch
import numpy as np
from collections import Counter


# Initialize model, loss function, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Get number of unique values for each categorical column
embedding_dims = {col: X_train[col].nunique() for col in categorical_cols}  

class TabTransformerGRU(nn.Module):
    def __init__(self, categorical_cols, numerical_cols, embedding_dims, hidden_size=64, output_size=1):
        super(TabTransformerGRU, self).__init__()

        # Create embeddings for categorical features
        self.embeddings = nn.ModuleList([
            nn.Embedding(num_categories, 10)  # Fixed embedding size of 10
            for num_categories in embedding_dims.values()
        ])  #dense vector representation

        # GRU Layer
        input_size = sum(10 for _ in embedding_dims) + len(numerical_cols)
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)

        # Fully connected layer
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, X_cat, X_num):
        # Compute categorical embeddings
        embedded = [embed(X_cat[:, i]) for i, embed in enumerate(self.embeddings)]
        embedded = torch.cat(embedded, dim=-1)  # Shape: (batch_size, total_embedding_dim)

        # Ensure numerical features have correct shape
        if X_num.dim() == 1:
            X_num = X_num.unsqueeze(1)

        # Concatenate categorical embeddings with numerical features
        X_combined = torch.cat([embedded, X_num], dim=-1)  # Shape: (batch_size, input_size)

        # Reshape for GRU (batch_size, sequence_length=1, input_size)
        X_combined = X_combined.unsqueeze(1)

        # Pass through GRU
        _, hidden = self.gru(X_combined)

        # Fully connected output
        output = self.fc(hidden.squeeze(0))
        return output.squeeze(1)


# Create model
model = TabTransformerGRU(categorical_cols, numerical_cols, embedding_dims).to(device)

# Assuming y_train contains the class labels
class_counts = np.array([count for _, count in sorted(Counter(y_train).items())])

# Compute class weights
class_weights = torch.tensor(1.0 / class_counts, dtype=torch.float32)
class_weights = class_weights / class_weights.sum()  # Normalize
class_weights = class_weights.to(device)

# Define weighted loss function
loss_fn = nn.BCEWithLogitsLoss()



In [42]:
from torch.utils.data import DataLoader, TensorDataset

# Convert categorical and numerical data to tensors
X_train_cat = torch.tensor(X_train[categorical_cols].values, dtype=torch.long)
X_train_num = torch.tensor(X_train[numerical_cols].values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)  # Make it (batch_size, 1)

X_test_cat = torch.tensor(X_test[categorical_cols].values, dtype=torch.long)
X_test_num = torch.tensor(X_test[numerical_cols].values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)

# Create DataLoader for batching
train_dataset = TensorDataset(X_train_cat, X_train_num, y_train_tensor)
test_dataset = TensorDataset(X_test_cat, X_test_num, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [43]:
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [44]:
from torch.utils.data import DataLoader, TensorDataset

# Convert categorical and numerical features to tensors
X_cat_tensor = torch.tensor(X_train[categorical_cols].values, dtype=torch.long).to(device)
X_num_tensor = torch.tensor(X_train[numerical_cols].values, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).to(device)

# Create dataset
train_dataset = TensorDataset(X_cat_tensor, X_num_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)


In [45]:
# Training loop
EPOCHS = 20
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for X_cat_batch, X_num_batch, y_batch in train_loader:
        X_cat_batch, X_num_batch, y_batch = X_cat_batch.to(device), X_num_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()  # Reset gradients
        outputs = model(X_cat_batch, X_num_batch)  # Forward pass

        loss = loss_fn(outputs, y_batch)  # Compute loss

        loss.backward()  # Backpropagation
        optimizer.step()  # Update weights

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {avg_loss:.4f}")

print("Training Complete! 🎉")


# 2️⃣ Save the trained model after training is complete
torch.save(model.state_dict(), "mymodel.pkl")



print("Model saved successfully!")


Epoch 1/20, Loss: 0.6490
Epoch 2/20, Loss: 0.5619
Epoch 3/20, Loss: 0.4882
Epoch 4/20, Loss: 0.4357
Epoch 5/20, Loss: 0.3852
Epoch 6/20, Loss: 0.3755
Epoch 7/20, Loss: 0.3393
Epoch 8/20, Loss: 0.3312
Epoch 9/20, Loss: 0.3357
Epoch 10/20, Loss: 0.3196
Epoch 11/20, Loss: 0.3155
Epoch 12/20, Loss: 0.2958
Epoch 13/20, Loss: 0.2939
Epoch 14/20, Loss: 0.3119
Epoch 15/20, Loss: 0.2932
Epoch 16/20, Loss: 0.2881
Epoch 17/20, Loss: 0.2826
Epoch 18/20, Loss: 0.3090
Epoch 19/20, Loss: 0.2815
Epoch 20/20, Loss: 0.2757
Training Complete! 🎉
Model saved successfully!


In [46]:
model.eval()  # Set model to evaluation mode

X_cat_test_tensor = torch.tensor(X_test[categorical_cols].values, dtype=torch.long).to(device)
X_num_test_tensor = torch.tensor(X_test[numerical_cols].values, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).to(device)

with torch.no_grad():
    predictions = model(X_cat_test_tensor, X_num_test_tensor)
    predictions = torch.sigmoid(predictions)  # Convert to probabilities

# Convert probabilities to binary labels
predicted_labels = (predictions > 0.5).float()

# Calculate accuracy
accuracy = (predicted_labels == y_test_tensor).float().mean().item()
print(f"Test Accuracy: {accuracy:.4f}")


Test Accuracy: 0.8525


In [47]:
import torch

# Set model to evaluation mode
model.eval()

# Prepare the categorical and numerical test data
X_cat_test_tensor = torch.tensor(X_test[categorical_cols].values, dtype=torch.long).to(device)
X_num_test_tensor = torch.tensor(X_test[numerical_cols].values, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).to(device)

with torch.no_grad():
    predictions = model(X_cat_test_tensor, X_num_test_tensor)
    risk_probabilities = torch.sigmoid(predictions)  # Convert to probabilities

# Convert probabilities to binary labels
predicted_labels = (risk_probabilities >= 0.5).float()

# Calculate accuracy
accuracy = (predicted_labels == y_test_tensor).float().mean().item()
print(f"Test Accuracy: {accuracy:.4f}")

# Print individual risk predictions
for i, prob in enumerate(risk_probabilities):
    print(f"Sample {i+1}: Risk Probability = {prob.item():.4f}, Predicted Risk = {'High' if predicted_labels[i] else 'Low'}")


Test Accuracy: 0.8525
Sample 1: Risk Probability = 0.0217, Predicted Risk = Low
Sample 2: Risk Probability = 0.0447, Predicted Risk = Low
Sample 3: Risk Probability = 0.0137, Predicted Risk = Low
Sample 4: Risk Probability = 0.8626, Predicted Risk = High
Sample 5: Risk Probability = 0.1896, Predicted Risk = Low
Sample 6: Risk Probability = 0.0385, Predicted Risk = Low
Sample 7: Risk Probability = 0.9175, Predicted Risk = High
Sample 8: Risk Probability = 0.1933, Predicted Risk = Low
Sample 9: Risk Probability = 0.9835, Predicted Risk = High
Sample 10: Risk Probability = 0.5012, Predicted Risk = High
Sample 11: Risk Probability = 0.1369, Predicted Risk = Low
Sample 12: Risk Probability = 0.5872, Predicted Risk = High
Sample 13: Risk Probability = 0.0494, Predicted Risk = Low
Sample 14: Risk Probability = 0.9426, Predicted Risk = High
Sample 15: Risk Probability = 0.9632, Predicted Risk = High
Sample 16: Risk Probability = 0.7241, Predicted Risk = High
Sample 17: Risk Probability = 0.965

In [48]:
# Convert risk probabilities into binary labels
predicted_labels = (risk_probabilities >= 0.5).float()

# Compare predictions with actual values
correct_predictions = (predicted_labels == y_test_tensor).sum().item()
total_samples = y_test_tensor.shape[0]

# Compute Accuracy
accuracy = correct_predictions / total_samples
print(f"Manually Computed Accuracy: {accuracy:.4f}")


Manually Computed Accuracy: 0.8525


In [14]:
# Compare predicted vs actual risk labels
for i in range(len(y_test_tensor)):
    actual_label = "High" if y_test_tensor[i] >= 0.5 else "Low"
    predicted_label = "High" if predicted_labels[i] >= 0.5 else "Low"
    print(f"Sample {i+1}: Actual = {actual_label}, Predicted = {predicted_label}, Probability = {risk_probabilities[i].item():.4f}")


Sample 1: Actual = Low, Predicted = Low, Probability = 0.0231
Sample 2: Actual = Low, Predicted = Low, Probability = 0.0856
Sample 3: Actual = Low, Predicted = Low, Probability = 0.0139
Sample 4: Actual = Low, Predicted = High, Probability = 0.9112
Sample 5: Actual = Low, Predicted = Low, Probability = 0.3217
Sample 6: Actual = Low, Predicted = Low, Probability = 0.0587
Sample 7: Actual = High, Predicted = High, Probability = 0.9346
Sample 8: Actual = Low, Predicted = Low, Probability = 0.2496
Sample 9: Actual = High, Predicted = High, Probability = 0.9798
Sample 10: Actual = Low, Predicted = Low, Probability = 0.4503
Sample 11: Actual = High, Predicted = Low, Probability = 0.1489
Sample 12: Actual = High, Predicted = High, Probability = 0.6658
Sample 13: Actual = Low, Predicted = Low, Probability = 0.0605
Sample 14: Actual = High, Predicted = High, Probability = 0.9437
Sample 15: Actual = High, Predicted = High, Probability = 0.9607
Sample 16: Actual = High, Predicted = High, Probabil

In [87]:
print(y_test[:10])  # Print the first 10 values to verify
print(type(y_test))  # Check the type


179    0
197    0
285    0
194    0
188    0
240    0
160    1
167    0
136    1
228    0
Name: target, dtype: int64
<class 'pandas.core.series.Series'>


In [16]:
import torch


fullpath = "mymodel.pkl"

# Try loading the model with weights_only=False
model = torch.load(fullpath, weights_only=False)
print(model)


OrderedDict([('embeddings.0.weight', tensor([[-0.4665, -0.3981,  0.4015, -0.7564,  0.0261, -0.6146,  0.0256,  0.3997,
          0.3126, -3.5598],
        [ 0.4571, -0.0866,  0.6526, -2.6397, -0.7126,  0.1597, -0.4988,  0.7018,
         -0.1055, -1.3123]])), ('embeddings.1.weight', tensor([[ 0.7352,  2.1559, -0.6376,  0.8650, -0.2623,  0.7231,  0.8421,  0.5469,
         -1.0590,  1.7783],
        [ 0.4798,  1.7228, -0.6604,  1.1502,  1.2079, -0.8260, -1.1355, -0.7072,
          0.4641, -0.0668],
        [ 1.0848, -2.4661, -0.3843, -1.5720, -1.1986,  0.1252, -0.2154, -0.7849,
          0.7627,  0.8918],
        [-0.4974,  0.7022,  0.7682,  0.0310,  0.8382, -0.1459, -0.3161,  0.2317,
         -0.1350,  0.7918]])), ('embeddings.2.weight', tensor([[-0.2937, -0.4032,  3.0801, -0.4451,  1.2618, -0.1556,  0.1131, -0.5020,
          1.1422, -2.3514],
        [-0.6750, -1.1379,  1.4064, -1.5448,  0.0377, -0.4571, -0.2649,  0.0089,
          1.3361,  1.0939]])), ('embeddings.3.weight', tensor([[ 

In [3]:


# Add this to your training code
print("Unique values in 'ca':", df['ca'].unique())
print("Unique values in 'thal':", df['thal'].unique())

Unique values in 'ca': [0 2 1 3 4]
Unique values in 'thal': [1 2 3 0]
