In [3]:
import pandas as pd

# Convert the raw, headerless Cleveland data file into a CSV with named columns.
data_file = "processed.cleveland.data"
csv_file = "heart_disease.csv"

# The raw file carries no header row (header=None below), so names are supplied here.
columns = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal", "Target"
]

# na_values="?" turns "?" placeholders into real NaN instead of letting them
# force whole columns to string dtype and leak into the written CSV.
# NOTE(review): the UCI Cleveland file is documented as using "?" for missing
# entries -- confirm against the raw file; the option is harmless if none exist.
df = pd.read_csv(data_file, header=None, names=columns, na_values="?")

# index=False: do not write the synthetic row index as an extra column.
df.to_csv(csv_file, index=False)
print(f"Converted {data_file} to {csv_file}")

Converted processed.cleveland.data to heart_disease.csv


In [14]:
import pandas as pd

# Load the dataset
file_path = 'heart_disease.csv'
dataset = pd.read_csv(file_path)

# Step 1: Handle missing values
# Force 'ca' and 'thal' to numeric; anything unparseable becomes NaN.
for numeric_col in ('ca', 'thal'):
    dataset[numeric_col] = pd.to_numeric(dataset[numeric_col], errors='coerce')
# Impute every numeric column with its own median (reassign rather than mutate
# in place so re-running the cell is predictable).
# NOTE(review): 'ca' and 'thal' look like coded categories; a median can fall
# between two codes -- consider the mode for those columns.
dataset = dataset.fillna(dataset.median(numeric_only=True))

# Step 2: Replace the numeric sex code with a readable label
dataset['sex'] = dataset['sex'].map({1: 'Male', 0: 'Female'})

# Step 3: Feature engineering - Categorize age
# Bins are right-closed: (0, 40] -> Young, (40, 60] -> Middle-aged, (60, 100] -> Elderly.
dataset['age_group'] = pd.cut(
    dataset['age'],
    bins=[0, 40, 60, 100],
    labels=['Young', 'Middle-aged', 'Elderly'],
)

# Step 4: Rename columns
# Only columns that actually exist are listed. The label column is called
# 'Target' in this file and keeps that name, so no rename entry is needed for it.
dataset = dataset.rename(columns={
    'cp': 'chest_pain_type',
    'thalach': 'max_heart_rate',
})

# Simplify the 'Target' column to binary: 0 stays 0, every other value becomes 1.
dataset['Target'] = (dataset['Target'] != 0).astype(int)

# Save the cleaned dataset
cleaned_file_path = 'heart_disease_cleaned_enhanced.csv'  # Replace with your desired save path
dataset.to_csv(cleaned_file_path, index=False)
print(f"Enhanced cleaned dataset saved to: {cleaned_file_path}")

Enhanced cleaned dataset saved to: heart_disease_cleaned_enhanced.csv


In [15]:
import pandas as pd

# Read the cleaned dataset written by the preprocessing cell
file_path = 'heart_disease_cleaned_enhanced.csv'
heart_data = pd.read_csv(file_path)

# Code -> label lookup tables, keyed by the source column they decode
mappings = {
    "chest_pain_type": {
        1: "Typical Angina",
        2: "Atypical Angina",
        3: "Non-Anginal Pain",
        4: "Asymptomatic",
    },
    "restecg": {
        0: "Normal",
        1: "ST-T Abnormality",
        2: "Left Ventricular Hypertrophy",
    },
    "thal": {
        3: "Normal",
        6: "Fixed Defect",
        7: "Reversible Defect",
    },
}

# New readable column -> source column; insertion order fixes the column order
# in the output file.
readable_sources = {
    'cp_readable': 'chest_pain_type',
    'restecg_readable': 'restecg',
    'thal_readable': 'thal',
}
for readable_col, source_col in readable_sources.items():
    heart_data[readable_col] = heart_data[source_col].map(mappings[source_col])

# 'Target' equal to 0 reads as "Absence"; any other value reads as "Presence"
is_absent = heart_data['Target'].eq(0)
heart_data['Target_readable'] = is_absent.map({True: 'Absence', False: 'Presence'})

# Write the labelled dataset to its own CSV file
output_file_path = 'heart_disease_for_tableau_final.csv'
heart_data.to_csv(output_file_path, index=False)

print(f"Updated dataset saved to {output_file_path}")

Updated dataset saved to heart_disease_for_tableau_final.csv


In [6]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the cleaned CSV into `dataset` for the checks that follow
file_path = 'heart_disease_cleaned_enhanced.csv'  # Replace with your file path
dataset = pd.read_csv(filepath_or_buffer=file_path)

# Define a function for outlier detection using IQR
def detect_outliers_iqr(data, column, k=1.5):
    """Return the rows of ``data`` whose ``column`` value lies outside the IQR fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR``, where Q1 and Q3 are
    the 25th and 75th percentiles of the column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to scan; it is not modified.
    column : str
        Name of the numeric column to test.
    k : float, default 1.5
        Fence multiplier. The default reproduces the original fixed 1.5 rule;
        larger values flag fewer rows.

    Returns
    -------
    pandas.DataFrame
        The subset of rows strictly below the lower fence or strictly above
        the upper fence, with the original index preserved. Rows where the
        value is NaN are never returned, because both comparisons are False.
    """
    values = data[column]
    q1 = values.quantile(0.25)  # First quartile
    q3 = values.quantile(0.75)  # Third quartile
    iqr = q3 - q1  # Interquartile range
    lower_bound = q1 - k * iqr
    upper_bound = q3 + k * iqr
    return data[(values < lower_bound) | (values > upper_bound)]

# Continuous variables to analyze.
# 'thal' is deliberately not in this list: it holds category codes (3/6/7), so
# IQR fences are not meaningful for it. 'max_heart_rate' (the renamed 'thalach'
# column) is the continuous measurement checked instead.
columns_to_check = ['trestbps', 'chol', 'max_heart_rate', 'oldpeak']


# Print the flagged rows for each column in turn.
for col in columns_to_check:
    outliers = detect_outliers_iqr(dataset, col)
    print(f"Outliers detected in {col}:\n", outliers)



Outliers detected in trestbps:
       age     sex  chest_pain_type  trestbps   chol  fbs  restecg  \
14   52.0    Male              3.0     172.0  199.0  1.0      0.0   
83   68.0    Male              3.0     180.0  274.0  1.0      2.0   
126  56.0  Female              4.0     200.0  288.0  1.0      2.0   
172  59.0  Female              4.0     174.0  249.0  0.0      0.0   
183  59.0    Male              1.0     178.0  270.0  0.0      2.0   
188  54.0    Male              2.0     192.0  283.0  0.0      2.0   
201  64.0  Female              4.0     180.0  325.0  0.0      0.0   
213  66.0  Female              4.0     178.0  228.0  1.0      0.0   
231  55.0  Female              4.0     180.0  327.0  0.0      1.0   

     max_heart_rate  exang  oldpeak  slope   ca  thal  Target    age_group  
14            162.0    0.0      0.5    1.0  0.0   7.0       0  Middle-aged  
83            150.0    1.0      1.6    2.0  0.0   7.0       1      Elderly  
126           133.0    1.0      4.0    3.0  2.

In [5]:
# Descriptive statistics, as returned by DataFrame.describe()
summary_stats = dataset.describe()
print("Summary statistics:")
print(summary_stats)

# Most frequent value(s) in every column
column_modes = dataset.mode()
print("\nMode of each column:")
print(column_modes)

# Number of null entries per column
missing_values = dataset.isnull().sum()
print("\nMissing values in each column:")
print(missing_values)



Summary statistics:
              age  chest_pain_type    trestbps        chol         fbs  \
count  303.000000       303.000000  303.000000  303.000000  303.000000   
mean    54.438944         3.158416  131.689769  246.693069    0.148515   
std      9.038662         0.960126   17.599748   51.776918    0.356198   
min     29.000000         1.000000   94.000000  126.000000    0.000000   
25%     48.000000         3.000000  120.000000  211.000000    0.000000   
50%     56.000000         3.000000  130.000000  241.000000    0.000000   
75%     61.000000         4.000000  140.000000  275.000000    0.000000   
max     77.000000         4.000000  200.000000  564.000000    1.000000   

          restecg  max_heart_rate       exang     oldpeak       slope  \
count  303.000000      303.000000  303.000000  303.000000  303.000000   
mean     0.990099      149.607261    0.326733    1.039604    1.600660   
std      0.994971       22.875003    0.469794    1.161075    0.616226   
min      0.000000    