In [None]:
1. Tasks to Perform
a.Load the Dataset
○ Check for missing or inconsistent data.
○ Clean the dataset by handling null values and duplicate


In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Load the dataset
file_path = r"C:\Users\Ashish\OneDrive\Desktop\Jupyter Notebook\Internship task 6\heart_disease.xlsx"
df = pd.read_excel(file_path)

# Display the first few rows
print("First 5 rows of the dataset:")
display(df.head())

# Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())

# Handle missing values (e.g., filling with mean for numerical columns)
df.fillna(df.mean(numeric_only=True), inplace=True)

# Check for duplicate rows
duplicate_rows = df.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicate_rows}")

# Remove duplicate rows if any
df.drop_duplicates(inplace=True)

# Display dataset info after cleaning
print("\nDataset Info After Cleaning:")
print(df.info())

# Save the cleaned dataset (optional)
cleaned_file_path = r"C:\Users\Ashish\OneDrive\Desktop\Jupyter Notebook\Internship task 6\cleaned_heart_disease.xlsx"
df.to_excel(cleaned_file_path, index=False)
print("\nCleaned dataset saved successfully!")


First 5 rows of the dataset:


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1



Missing Values:
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

Number of duplicate rows: 1

Dataset Info After Cleaning:
<class 'pandas.core.frame.DataFrame'>
Index: 302 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       302 non-null    int64  
 1   sex       302 non-null    int64  
 2   cp        302 non-null    int64  
 3   trestbps  302 non-null    int64  
 4   chol      302 non-null    int64  
 5   fbs       302 non-null    int64  
 6   restecg   302 non-null    int64  
 7   thalach   302 non-null    int64  
 8   exang     302 non-null    int64  
 9   oldpeak   302 non-null    float64
 10  slope     302 non-null    int64  
 11  ca        302 non-null    int64  
 12  thal      302 non-null    int64  
 13  target    302 non-null   

In [None]:
b. Feature Engineering
○ Normalize or scale numerical features like Age, Cholesterol, and Blood
Pressure to improve model performance.

In [4]:
# Import necessary library
import pandas as pd

# Load the dataset (assuming df is already loaded)
file_path = r"C:\Users\Ashish\OneDrive\Desktop\Jupyter Notebook\Internship task 6\cleaned_heart_disease.xlsx"
df = pd.read_excel(file_path)

# Define numerical features
numerical_features = ["Age", "Cholesterol", "Blood Pressure"]

# Min-Max Scaling (scales values between 0 and 1)
def min_max_scaling(column):
    return (column - column.min()) / (column.max() - column.min())

# Apply Min-Max Scaling to numerical features
for feature in numerical_features:
    df[feature] = min_max_scaling(df[feature])

# Display scaled features
print("\nScaled Numerical Features:")
display(df.head())

# Save the processed dataset
processed_file_path = r"C:\Users\Ashish\OneDrive\Desktop\Jupyter Notebook\Internship task 6\processed_heart_disease.xlsx"
df.to_excel(processed_file_path, index=False)
print("\nProcessed dataset with scaled features saved successfully!")



Scaled Numerical Features:


Unnamed: 0,Age,sex,cp,Blood Pressure,Cholesterol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,0.708333,1,3,0.481132,0.244292,1,0,150,0,2.3,0,0,1,1
1,0.166667,1,2,0.339623,0.283105,0,1,187,0,3.5,0,0,2,1
2,0.25,0,1,0.339623,0.178082,0,0,172,0,1.4,2,0,2,1
3,0.5625,1,1,0.245283,0.251142,0,1,178,0,0.8,2,0,2,1
4,0.583333,0,0,0.245283,0.520548,0,1,163,1,0.6,2,0,2,1



Processed dataset with scaled features saved successfully!


In [None]:
c. Model Training
○ Train a Logistic Regression model to classify patients as having heart disease
or not.

In [10]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the cleaned dataset
file_path = r"C:\Users\Ashish\OneDrive\Desktop\Jupyter Notebook\Internship task 6\processed_heart_disease.xlsx"
df = pd.read_excel(file_path)

# Print column names to identify the correct target column
print("Dataset Columns:", df.columns)

# Replace with the actual column name for heart disease
target_column = "Diagnosis"  # Change this if your column name is different

# Convert categorical target values (if applicable)
if df[target_column].dtype == 'object':  # If the column contains "Yes"/"No"
    df[target_column] = df[target_column].map({"Yes": 1, "No": 0})

# Define features (X) and target (y)
X = df.drop(columns=[target_column])  # Features
y = df[target_column]  # Target (1 = Disease, 0 = No Disease)

# Split data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions on test data
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Print confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Dataset Columns: Index(['Age', 'sex', 'cp', 'Blood Pressure', 'Cholesterol', 'fbs', 'restecg',
       'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')


KeyError: 'Diagnosis'

In [None]:
d. Model Evaluation
○ Assess model accuracy using a confusion matrix and classification metrics like:
■ Precision
■ Recall
■ F1-score

In [14]:
import numpy as np
import pandas as pd

# Load dataset
file_path = r"C:\Users\Ashish\OneDrive\Desktop\Jupyter Notebook\Internship task 6\heart_disease.xlsx"
df = pd.read_excel(file_path)

# Clean column names (remove extra spaces)
df.columns = df.columns.str.strip()

# Print column names to check available columns
print("Available columns:", df.columns.tolist())

# Choose an existing target column
target_column = df.columns[-1]  # Assuming last column is the target (modify if needed)
print(f"Using target column: {target_column}")

# Define features (X) and target (y)
X = df.drop(columns=[target_column]).values  # Convert to NumPy array
y = df[target_column].values.reshape(-1, 1)  # Convert to column vector

# Normalize numerical features (Min-Max scaling)
X = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))

# Add bias term (column of 1s) for the intercept
X = np.c_[np.ones((X.shape[0], 1)), X]

# Split into training and testing sets (80% train, 20% test)
split_index = int(len(X) * 0.8)
X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

# Initialize weights
theta = np.zeros((X_train.shape[1], 1))

# Sigmoid function
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# Train using Gradient Descent
def train(X, y, theta, lr=0.01, epochs=5000):
    m = len(y)
    for _ in range(epochs):
        h = sigmoid(X @ theta)
        gradient = (X.T @ (h - y)) / m
        theta -= lr * gradient
    return theta

# Train model
theta = train(X_train, y_train, theta)

# Predict
def predict(X, theta):
    return (sigmoid(X @ theta) >= 0.5).astype(int)

y_pred = predict(X_test, theta)

# Model evaluation (Accuracy)
accuracy = np.mean(y_pred == y_test)
print(f"\n✅ Model Accuracy: {accuracy:.4f}")


Available columns: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']
Using target column: target

✅ Model Accuracy: 0.5902


In [None]:
2. Deliverables
● Logistic Regression Model: Trained model to predict heart disease.
● Evaluation Report:
○ Confusion matrix
○ Accuracy, precision, recall, and F1-score insight

In [15]:
import numpy as np
import pandas as pd

# Load dataset
file_path = r"C:\Users\Ashish\OneDrive\Desktop\Jupyter Notebook\Internship task 6\heart_disease.xlsx"
df = pd.read_excel(file_path)

# Clean column names (remove extra spaces)
df.columns = df.columns.str.strip()

# Print column names to check available columns
print("Available columns:", df.columns.tolist())

# Choose an existing target column
target_column = df.columns[-1]  # Assuming last column is the target (modify if needed)
print(f"Using target column: {target_column}")

# Define features (X) and target (y)
X = df.drop(columns=[target_column]).values  # Convert to NumPy array
y = df[target_column].values.reshape(-1, 1)  # Convert to column vector

# Normalize numerical features (Min-Max scaling)
X = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))

# Add bias term (column of 1s) for the intercept
X = np.c_[np.ones((X.shape[0], 1)), X]

# Split into training and testing sets (80% train, 20% test)
split_index = int(len(X) * 0.8)
X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

# Initialize weights
theta = np.zeros((X_train.shape[1], 1))

# Sigmoid function
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# Train using Gradient Descent
def train(X, y, theta, lr=0.01, epochs=5000):
    m = len(y)
    for _ in range(epochs):
        h = sigmoid(X @ theta)
        gradient = (X.T @ (h - y)) / m
        theta -= lr * gradient
    return theta

# Train model
theta = train(X_train, y_train, theta)

# Predict function
def predict(X, theta):
    return (sigmoid(X @ theta) >= 0.5).astype(int)

y_pred = predict(X_test, theta)

# Confusion Matrix Calculation
def confusion_matrix(y_true, y_pred):
    TP = np.sum((y_true == 1) & (y_pred == 1))
    TN = np.sum((y_true == 0) & (y_pred == 0))
    FP = np.sum((y_true == 0) & (y_pred == 1))
    FN = np.sum((y_true == 1) & (y_pred == 0))
    return np.array([[TP, FP], [FN, TN]])

cm = confusion_matrix(y_test, y_pred)

# Accuracy, Precision, Recall, F1-Score
TP, FP = cm[0, 0], cm[0, 1]
FN, TN = cm[1, 0], cm[1, 1]

accuracy = (TP + TN) / (TP + TN + FP + FN)
precision = TP / (TP + FP) if (TP + FP) > 0 else 0
recall = TP / (TP + FN) if (TP + FN) > 0 else 0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

# Print results
print("\n✅ Model Evaluation Results")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1_score:.4f}")
print("\nConfusion Matrix:")
print(cm)


Available columns: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']
Using target column: target

✅ Model Evaluation Results
Accuracy: 0.5902
Precision: 0.0000
Recall: 0.0000
F1-Score: 0.0000

Confusion Matrix:
[[ 0 25]
 [ 0 36]]
