To use the Kaggle API, you need to install the `kaggle` library and configure your API credentials.

1.  **Install the `kaggle` library**:

In [None]:
!pip install kaggle



2.  **Obtain your Kaggle API credentials**:
    *   Go to your Kaggle account page (`https://www.kaggle.com/<your-username>/account`).
    *   Under the "API" section, click "Create New API Token". This will download a `kaggle.json` file containing your username and API key.

3.  **Upload your `kaggle.json` file to Colab**:
    *   In the Colab file explorer (folder icon on the left sidebar), click the "Upload to session storage" icon.
    *   Upload the `kaggle.json` file you downloaded.

4.  **Load your credentials**:
    *   Run the following cell to store `kaggle.json` at `/root/.kaggle/kaggle.json` (it prompts for an upload if the file is not there yet) and restrict the file's permissions.

In [None]:
import os
from google.colab import files

# Place the Kaggle API token at /root/.kaggle/kaggle.json.
# If the token is not there yet, prompt for an upload through the Colab file picker.
kaggle_json_path = "/root/.kaggle/kaggle.json"
if not os.path.exists(kaggle_json_path):
    os.makedirs(os.path.dirname(kaggle_json_path), exist_ok=True)
    uploaded = files.upload()  # mapping: uploaded file name -> file content (bytes)
    if not uploaded:
        # Fail with a clear message instead of an obscure chmod error on a missing file.
        raise FileNotFoundError("No file was uploaded; please upload your kaggle.json.")
    # Prefer the file actually named kaggle.json; otherwise fall back to the last
    # uploaded file, and write the token exactly once.
    token_bytes = uploaded.get("kaggle.json", list(uploaded.values())[-1])
    with open(kaggle_json_path, "wb") as f:
        f.write(token_bytes)
    # The mode must be an octal literal: decimal 600 equals 0o1130, which sets
    # unrelated permission bits instead of owner read/write only.
    os.chmod(kaggle_json_path, 0o600)

In [None]:
import kagglehub

# Fetch the latest version of the dataset and report the directory it was saved to.
dataset_handle = "redwankarimsony/heart-disease-data"
path = kagglehub.dataset_download(dataset_handle)

print(f"Path to dataset files: {path}")

Path to dataset files: /kaggle/input/heart-disease-data


In [None]:
import pandas as pd
import os

# Directory returned by the download cell; the dataset's CSV file(s) are expected inside it.
dataset_path = path

# os.listdir returns entries in arbitrary order, so sort the CSV names to make the
# choice of the "first" file deterministic across runs and machines.
csv_files = sorted(f for f in os.listdir(dataset_path) if f.endswith('.csv'))

if csv_files:
    # Load the first CSV (alphabetically). Pick a different entry of csv_files
    # if the dataset ships several files and another one is wanted.
    dataset_file = os.path.join(dataset_path, csv_files[0])

    # Load the dataset into a pandas DataFrame
    df = pd.read_csv(dataset_file)

    # Display the column headers
    print("Column Headers:")
    print(df.columns.tolist())

    # Print the first few rows of the DataFrame
    print("\nFirst 5 rows of the dataset:")
    print(df.head())

else:
    print("No CSV files found in the dataset directory.")

Column Headers:
['id', 'age', 'sex', 'dataset', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num']

First 5 rows of the dataset:
   id  age     sex    dataset               cp  trestbps   chol    fbs  \
0   1   63    Male  Cleveland   typical angina     145.0  233.0   True   
1   2   67    Male  Cleveland     asymptomatic     160.0  286.0  False   
2   3   67    Male  Cleveland     asymptomatic     120.0  229.0  False   
3   4   37    Male  Cleveland      non-anginal     130.0  250.0  False   
4   5   41  Female  Cleveland  atypical angina     130.0  204.0  False   

          restecg  thalch  exang  oldpeak        slope   ca  \
0  lv hypertrophy   150.0  False      2.3  downsloping  0.0   
1  lv hypertrophy   108.0   True      1.5         flat  3.0   
2  lv hypertrophy   129.0   True      2.6         flat  2.0   
3          normal   187.0  False      3.5  downsloping  0.0   
4  lv hypertrophy   172.0  False      1.4    upsloping  0.

After completing these steps, you should be able to use the `kagglehub` or `kaggle` library to download datasets as you've attempted in your code.

In [None]:
# Report the dimensions of each train/test partition.
# NOTE: X_train, X_test, y_train and y_test are created by the train_test_split call
# in a cell further down this notebook; that cell must have been run first.
for split_name, split_data in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"Shape of {split_name}:", split_data.shape)

Shape of X_train: (644, 25)
Shape of X_test: (276, 25)
Shape of y_train: (644,)
Shape of y_test: (276,)


In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix # Import confusion_matrix and classification_report
from sklearn.preprocessing import StandardScaler
import numpy as np # Import numpy

# --- Data Loading (Assuming df is already loaded from previous cells) ---
# This cell needs the DataFrame 'df' created by the data loading cell.
if 'df' not in locals():
    print("Error: DataFrame 'df' not found. Please run the data loading cells first.")
else:
    # --- Prepare data for Logistic Regression ---
    # Use all columns except 'id', 'dataset', and 'num' as features
    features = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
    X = df[features].copy()

    # Binary target: 0 for no disease, 1 for disease (num > 0)
    y = df['num'].apply(lambda x: 1 if x > 0 else 0)

    # Column groups, determined on the raw (not yet encoded) feature frame
    numerical_cols = X.select_dtypes(include=['number']).columns.tolist()
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

    # One-hot encode the categorical features. This uses no statistics of the data,
    # so doing it before the split keeps train and test columns aligned without leakage.
    X = pd.get_dummies(X, columns=categorical_cols)

    # Every column of the encoded matrix is standardized, one-hot indicators included.
    # (The original name-based filter removed nothing, because the raw categorical
    # column names no longer exist after get_dummies.)
    all_cols = X.columns.tolist()
    scaled_cols = list(all_cols)

    # --- Split first (70% train / 30% test) ---
    # The split is done before any statistic is fitted so that the imputation means
    # and the scaler parameters are learned from the training rows only.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    # Row labels of each partition; assumes df has a unique index
    # (true for the default RangeIndex produced by pd.read_csv).
    train_index, test_index = X_train.index, X_test.index

    # --- Impute missing numerical values with the TRAINING means ---
    train_means = X.loc[train_index, numerical_cols].mean()
    X[numerical_cols] = X[numerical_cols].fillna(train_means)

    # --- Scale features with a scaler fitted on the TRAINING rows only ---
    scaler = StandardScaler()
    scaler.fit(X.loc[train_index, scaled_cols])
    X[scaled_cols] = scaler.transform(X[scaled_cols])

    # Re-slice the partitions from the imputed and scaled matrix
    X_train = X.loc[train_index]
    X_test = X.loc[test_index]


    # --- Logistic Regression Model ---
    # Initialize the Logistic Regression model
    model = LogisticRegression(max_iter=10000)

    # Train the model
    model.fit(X_train, y_train)

    # --- Adjust Classification Threshold ---
    # Get the probability of the positive class (class 1)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # Custom decision threshold (lower than the default 0.5, which trades
    # more false positives for fewer false negatives).
    # You can experiment with different threshold values
    custom_threshold = 0.25

    # Make predictions based on the custom threshold
    y_pred_threshold = (y_pred_proba >= custom_threshold).astype(int)


    # Evaluate the model with the adjusted threshold
    print(f"Evaluating with Threshold: {custom_threshold}")
    accuracy_threshold = accuracy_score(y_test, y_pred_threshold)
    print(f"Accuracy: {accuracy_threshold}")

    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred_threshold))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred_threshold))

    # You can compare these results to the output obtained with the default
    # threshold to see how the False Negatives have changed.

Evaluating with Threshold: 0.25
Accuracy: 0.8152173913043478
Confusion Matrix:
[[ 79  41]
 [ 10 146]]

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.66      0.76       120
           1       0.78      0.94      0.85       156

    accuracy                           0.82       276
   macro avg       0.83      0.80      0.80       276
weighted avg       0.83      0.82      0.81       276



In [None]:
# Import the RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np # Import numpy

# --- RandomForestClassifier Model ---
# Build a 100-tree forest with a fixed seed and fit it on the training split.
# (n_estimators, max_depth, etc. are the parameters to tune.)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# --- Adjust Classification Threshold ---
# Cutoff applied to the predicted probability of class 1; experiment with other values.
custom_threshold_rf = 0.4

# Positive-class (class 1) probability for every test sample.
y_pred_proba_rf = rf_model.predict_proba(X_test)[:, 1]

# Label a sample 1 when its probability reaches the cutoff, 0 otherwise.
y_pred_threshold_rf = np.where(y_pred_proba_rf >= custom_threshold_rf, 1, 0)

# Evaluate the thresholded predictions.
accuracy_rf_threshold = accuracy_score(y_test, y_pred_threshold_rf)
print(f"--- RandomForestClassifier Evaluation with Threshold: {custom_threshold_rf} ---")
print(f"Accuracy: {accuracy_rf_threshold}")

print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred_threshold_rf), sep="\n")

print("\nClassification Report:", classification_report(y_test, y_pred_threshold_rf), sep="\n")

# Compare these numbers with those obtained at other thresholds to see how the
# count of False Negatives moves.

--- RandomForestClassifier Evaluation with Threshold: 0.4 ---
Accuracy: 0.8442028985507246

Confusion Matrix:
[[ 91  29]
 [ 14 142]]

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.76      0.81       120
           1       0.83      0.91      0.87       156

    accuracy                           0.84       276
   macro avg       0.85      0.83      0.84       276
weighted avg       0.85      0.84      0.84       276



In [None]:
# Import the Support Vector Classifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np # Import numpy

# --- Support Vector Machine (SVM) Model ---
# SVC with default kernel/C/gamma (all tunable). probability=True is required for
# predict_proba below, at the cost of slower training.
svm_model = SVC(probability=True, random_state=42)
svm_model.fit(X_train, y_train)

# --- Adjust Classification Threshold ---
# Cutoff applied to the predicted probability of class 1; experiment with other values.
custom_threshold_svm = 0.3

# Positive-class (class 1) probability for every test sample.
y_pred_proba_svm = svm_model.predict_proba(X_test)[:, 1]

# Label a sample 1 when its probability reaches the cutoff, 0 otherwise.
y_pred_threshold_svm = np.where(y_pred_proba_svm >= custom_threshold_svm, 1, 0)

# Evaluate the thresholded predictions.
accuracy_svm_threshold = accuracy_score(y_test, y_pred_threshold_svm)
print(f"--- Support Vector Machine (SVM) Evaluation with Threshold: {custom_threshold_svm} ---")
print(f"Accuracy: {accuracy_svm_threshold}")

print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred_threshold_svm), sep="\n")

print("\nClassification Report:", classification_report(y_test, y_pred_threshold_svm), sep="\n")

# Compare these numbers with those obtained at other thresholds to see how the
# count of False Negatives moves.

--- Support Vector Machine (SVM) Evaluation with Threshold: 0.3 ---
Accuracy: 0.8369565217391305

Confusion Matrix:
[[ 86  34]
 [ 11 145]]

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.72      0.79       120
           1       0.81      0.93      0.87       156

    accuracy                           0.84       276
   macro avg       0.85      0.82      0.83       276
weighted avg       0.84      0.84      0.83       276



In [None]:
# Import the K-Nearest Neighbors Classifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np # Import numpy for thresholding

# --- K-Nearest Neighbors (KNN) Model ---
# 5-neighbour classifier fitted on the training split; n_neighbors is the value to tune.
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

# --- Adjust Classification Threshold ---
# Cutoff applied to the predicted probability of class 1; experiment with other values.
custom_threshold_knn = 0.4

# Positive-class (class 1) probability for every test sample.
y_pred_proba_knn = knn_model.predict_proba(X_test)[:, 1]

# Label a sample 1 when its probability reaches the cutoff, 0 otherwise.
y_pred_threshold_knn = np.where(y_pred_proba_knn >= custom_threshold_knn, 1, 0)

# Evaluate the thresholded predictions.
accuracy_knn_threshold = accuracy_score(y_test, y_pred_threshold_knn)
print(f"--- K-Nearest Neighbors (KNN) Evaluation with Threshold: {custom_threshold_knn} ---")
print(f"Accuracy: {accuracy_knn_threshold}")

print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred_threshold_knn), sep="\n")

print("\nClassification Report:", classification_report(y_test, y_pred_threshold_knn), sep="\n")

# Compare these numbers with those obtained at other thresholds to see how the
# count of False Negatives moves.

--- K-Nearest Neighbors (KNN) Evaluation with Threshold: 0.4 ---
Accuracy: 0.8297101449275363

Confusion Matrix:
[[ 82  38]
 [  9 147]]

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.68      0.78       120
           1       0.79      0.94      0.86       156

    accuracy                           0.83       276
   macro avg       0.85      0.81      0.82       276
weighted avg       0.84      0.83      0.83       276



In [None]:
# Import the Gaussian Naive Bayes Classifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np # Import numpy

# --- Gaussian Naive Bayes Model ---
# Default-configured Gaussian Naive Bayes fitted on the training split.
gnb_model = GaussianNB()
gnb_model.fit(X_train, y_train)

# --- Adjust Classification Threshold ---
# Cutoff applied to the predicted probability of class 1; experiment with other values.
custom_threshold_gnb = 0.01

# Positive-class (class 1) probability for every test sample.
y_pred_proba_gnb = gnb_model.predict_proba(X_test)[:, 1]

# Label a sample 1 when its probability reaches the cutoff, 0 otherwise.
y_pred_threshold_gnb = np.where(y_pred_proba_gnb >= custom_threshold_gnb, 1, 0)

# Evaluate the thresholded predictions.
accuracy_gnb_threshold = accuracy_score(y_test, y_pred_threshold_gnb)
print(f"--- Gaussian Naive Bayes Evaluation with Threshold: {custom_threshold_gnb} ---")
print(f"Accuracy: {accuracy_gnb_threshold}")

print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred_threshold_gnb), sep="\n")

print("\nClassification Report:", classification_report(y_test, y_pred_threshold_gnb), sep="\n")

# Compare these numbers with those obtained at other thresholds to see how the
# count of False Negatives moves.

--- Gaussian Naive Bayes Evaluation with Threshold: 0.01 ---
Accuracy: 0.8297101449275363

Confusion Matrix:
[[ 89  31]
 [ 16 140]]

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.74      0.79       120
           1       0.82      0.90      0.86       156

    accuracy                           0.83       276
   macro avg       0.83      0.82      0.82       276
weighted avg       0.83      0.83      0.83       276

