<a href="https://colab.research.google.com/github/Brynlai/DataScienceHeartDiseaseAssignment/blob/Bryan/DataScienceAssignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# @title
!pip install ucimlrepo
!pip install pandas matplotlib seaborn scikit-learn

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ucimlrepo import fetch_ucirepo
from IPython.display import display, HTML
from scipy.stats import zscore

# Fetch dataset
# Identifier passed to ucimlrepo's fetch_ucirepo to select the dataset.
UCI_DATASET_ID = 45
heart_disease_bunch = fetch_ucirepo(id=UCI_DATASET_ID)

# Echo the fetched object so its contents can be inspected.
print(heart_disease_bunch)

In [None]:
# Load into DataFrame
# Build the feature frame from the fetched object.
heart_disease = pd.DataFrame(data=heart_disease_bunch.data.features,
                             columns=heart_disease_bunch.data.feature_names,
                             index=heart_disease_bunch.data.ids)

# Append the target column(s) next to the features (axis=1 -> side by side).
heart_disease = pd.concat([heart_disease, heart_disease_bunch.data.targets], axis=1)
df = heart_disease

# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that would emit an extra "None" line).
df.info()

# Give the label column a descriptive name; rename() returns a new frame.
df = df.rename(columns={'num': 'target'})
print(df.head())
print(df.shape)

In [None]:
# Change Name or Columns
# Make sure the label column is numeric, then give every column a descriptive name.
#
# The dtype check is guarded on the presence of 'target': once the rename below has
# run, the column is called 'HeartDisease', and an unguarded access would raise when
# the cell is executed a second time.
if 'target' in df.columns:
    print("Data type of target column:", df['target'].dtype)

    # If it's not a numerical type (e.g., object), convert it
    if not pd.api.types.is_numeric_dtype(df['target']):
        df['target'] = pd.to_numeric(df['target'])

# Mapping from the dataset's short column codes to readable names.
column_names = {
    "age": "Age",
    "sex": "Gender",
    "cp": "ChestPainType",
    "trestbps": "RestingBP",
    "chol": "SerumCholesterol",
    "fbs": "FastingBloodSugar",
    "restecg": "RestingECG",
    "thalach": "MaxHeartRate",
    "exang": "ExerciseAngina",
    "oldpeak": "OldPeak",
    "slope": "ExerciseSlope",
    "ca": "MajorVessels",
    "thal": "ThalliumStress",
    "target": "HeartDisease"
}

# rename() ignores keys that are not present, so this is a no-op on a re-run.
# Assigning the result (instead of inplace=True) keeps the step explicit.
df = df.rename(columns=column_names)
print("------- Renamed df:",df)
# Export the entire DataFrame to a CSV file (including all rows and columns)
# df.to_csv('heart_disease_full.csv', index=False)

In [None]:
# DATA CLEANING : Dealing with duplicate observation
# Flag every row that repeats an earlier one and report how many there are.
duplicate_rows = df.duplicated()
duplicates_before = duplicate_rows.sum()
print("Number of duplicate rows before:", duplicates_before)

# Keep only the first occurrence of each row.
df = df.drop_duplicates()

# Recompute the flags on the de-duplicated frame to confirm that none remain.
duplicate_rows = df.duplicated()
duplicates_after = duplicate_rows.sum()
print("Number of duplicate rows after:", duplicates_after)

In [None]:
# DATA CLEANING : Handling missing values
# Per-column count of missing entries before imputation.
missing_values = df.isna().sum()
print("Missing values in each column:")
print(missing_values)

# Fill each column's gaps with that column's median.
column_medians = df.median()
df = df.fillna(column_medians)

# Per-column count of missing entries after imputation.
missing_values_after = df.isna().sum()
print("Missing values after replacing with medians:")
print(missing_values_after)

In [None]:
# DATA CLEANING : dealing with outliers
# Quartiles and inter-quartile range, one value per column.
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1

# A value counts as an outlier when it lies more than 1.5 * IQR outside the quartiles.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
below_lower = df < lower_fence
above_upper = df > upper_fence
outliers = (below_lower | above_upper).sum()

print("Number of outliers in each column:")
print(outliers)

# One boxplot per column, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=df, ax=ax)
ax.tick_params(axis='x', labelrotation=45)
ax.set_title('Boxplots for detecting outliers in each column')
plt.show()

In [None]:
# DATA CLEANING : dealing with outliers
# Replace IQR outliers with the column median -- in continuous columns only.
#
# The 1.5 * IQR rule is not meaningful for categorical / binary codes: when most
# values of such a column are identical the IQR is 0, so every minority value is
# flagged and then overwritten with the median, erasing the information the column
# carries. Applying it to every numeric column would also rewrite the HeartDisease
# label whenever one of its values fell outside the fences.
#
# NOTE(review): the list below assumes these five renamed columns are the continuous
# measurements of the dataset -- confirm against the dataset's variable descriptions.
# Relies on Q1, Q3 and IQR computed in the previous cell.
continuous_columns = [
    column
    for column in ["Age", "RestingBP", "SerumCholesterol", "MaxHeartRate", "OldPeak"]
    if column in df.columns and pd.api.types.is_numeric_dtype(df[column])
]

# Fences restricted to the same labels as the frame slice, so the comparison aligns.
lower_bound = (Q1 - 1.5 * IQR)[continuous_columns]
upper_bound = (Q3 + 1.5 * IQR)[continuous_columns]

# Identify outliers
outlier_mask = (df[continuous_columns] < lower_bound) | (df[continuous_columns] > upper_bound)

# Debug: Print the first few rows of the outlier mask
print("Outlier Mask (first few rows):")
print(outlier_mask.head())

# Impute outliers with the median of each column
for column in continuous_columns:
    is_outlier = outlier_mask[column]
    if is_outlier.any():
        # The median is taken over the whole column, outliers included.
        median_value = df[column].median()
        df.loc[is_outlier, column] = median_value
        print(f"Replaced outliers in column '{column}' with median value {median_value}")

# Check the DataFrame after imputation
print("Data after imputing outliers:")
print(df.head())

# Verify if there are any remaining outliers (measured against the same fences)
remaining_outliers = (
    (df[continuous_columns] < lower_bound) | (df[continuous_columns] > upper_bound)
).sum()
print("Remaining outliers in each column after imputation:")
print(remaining_outliers)


In [None]:
# DATA CLEANING : dealing with outliers
# Redraw the per-column boxplots now that outliers have been replaced by medians.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=df, ax=ax)
ax.tick_params(axis='x', labelrotation=45)
ax.set_title('Boxplots after imputing outliers with median in each column')
plt.show()

One point in the MaxHeartRate variable is still drawn as an outlier, but it lies where the left whisker ends, i.e. at the minimum value. We therefore leave it as is and do not treat it as an outlier, since removing it would distort the data.