<a href="https://colab.research.google.com/github/Brynlai/DataScienceHeartDiseaseAssignment/blob/Bryan/AssignmentCompletedV10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# B. Data Understanding
1. Data Collection
2. Data Description
3. Data Exploration

## B.1. Data Collection

In [None]:
# B. Data Understanding - 1. Data Collection
# @title
!pip install ucimlrepo
!pip install pandas matplotlib seaborn scikit-learn

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Fetch Heart Disease dataset from UCI ML Repository (dataset id 45)
heart_disease_bunch = fetch_ucirepo(id=45)

# Load the feature columns into a DataFrame.
# NOTE(review): assumes the fetched object exposes `data.features`,
# `data.feature_names` and `data.ids` -- confirm against the ucimlrepo docs.
heart_disease = pd.DataFrame(data=heart_disease_bunch.data.features,
                             columns=heart_disease_bunch.data.feature_names,
                             index=heart_disease_bunch.data.ids)
# Append the target column(s) on the right so features and label live in one frame.
heart_disease = pd.concat([heart_disease, heart_disease_bunch.data.targets], axis=1)

## B.2. Data Description

In [None]:
# B. Data Understanding: - 2. Data Description
def print_col(col, frame=None):
    """Print how many rows fall into each distinct value of one column.

    Args:
        col: Name of the column to group by.
        frame: DataFrame to inspect. When omitted, the notebook-level ``df``
            is used, so existing ``print_col('Name')`` calls keep working.
    """
    data = df if frame is None else frame
    # Frequency of each distinct value in the column.
    group_sizes = data.groupby(col).size()
    print(f"Column: {col}")
    print(group_sizes)
    print()


def print_group_sizes(df):
    """Print the value frequencies of every column of ``df``.

    Args:
        df: DataFrame whose columns are summarised one by one.
    """
    for column in df.columns:
        # Pass the frame along so the summary describes the argument,
        # not whatever the notebook-level ``df`` currently is.
        print_col(column, df)

In [None]:
# Show column names, non-null counts and data types of the raw data.
# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
heart_disease.info()

# Readable names for the columns. 'target' is the intermediate name that the
# raw 'num' column receives in the first rename step below.
column_names = {
    "age": "Age",
    "sex": "Gender",
    "cp": "ChestPainType",
    "trestbps": "RestingBP",
    "fbs": "FastBloodSugar",
    "restecg": "RestingECG",
    "exang": "ExerciseAngina",
    "slope": "ExerciseSlope",
    "ca": "MajorVessels",
    "thal": "ThalliumStress",
    "target": "HeartDisease",
    "chol": "SerumCholesterol",
    "thalach": "MaxHeartRate",
    "oldpeak": "OldPeak"
}

# Build the working frame: rename num -> target, then apply the readable names.
# rename() returns a new DataFrame, so `heart_disease` itself is left unchanged.
df = (
    heart_disease
    .rename(columns={'num': 'target'})
    .rename(columns=column_names)
)
print(df.head(5))

## B.3. Data Exploration

In [None]:
# Show data types and non-null counts.
# info() prints directly and returns None, so it is called without print()
# to avoid a stray "None" line in the output.
df.info()

# Summary statistics of the numeric columns
summary_stats = df.describe()
print(summary_stats)

In [None]:
# One figure per column: histogram (20 bins) with a KDE curve on top.
for col_name in df.columns:
    ax = sns.histplot(df[col_name], kde=True, bins=20)
    ax.set_title(f'Distribution of {col_name}')
    plt.show()

In [None]:
# Pair plot across all columns: KDE curves on the diagonal, 'o' markers elsewhere.
pairplot_options = dict(diag_kind='kde', markers='o')
sns.pairplot(df, vars=df.columns, **pairplot_options)
plt.show()

In [None]:
# Correlation matrix of the columns, shown as an annotated heatmap.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(10, 10))
# Draw explicitly on the axes created above.
sns.heatmap(corr_matrix,
            annot=True,
            linewidths=0.5,
            fmt=".2f",
            cmap="YlGnBu",
            ax=ax)
plt.show()

# C. Data Preparation
1. Data Cleaning
2. Data Transformation

## C.1. Data Cleaning

In [None]:
# C. Data Preparation - 1. Data Cleaning
# Count fully duplicated observations before cleaning.
print("Number of duplicate rows before:", df.duplicated().sum())

# Keep only the first occurrence of each duplicated row.
df = df.drop_duplicates()

# Count again to confirm that no duplicates remain.
duplicate_rows = df.duplicated()
print("Number of duplicate rows after:", duplicate_rows.sum())

In [None]:
# C. Data Preparation - 1. Data Cleaning
# Report how many values are missing in each column.
missing_values = df.isnull().sum()
print("Missing values in each column:", missing_values, sep="\n")

# Columns stored as int64 / float64 are treated as numerical.
numeric_columns = [name for name in df.columns
                   if df[name].dtype in [np.int64, np.float64]]

# Fill the gaps of every numerical column with that column's median.
for name in numeric_columns:
    df[name] = df[name].fillna(df[name].median())

# Report again to confirm which gaps (if any) are left.
missing_values_after = df.isnull().sum()
print("Missing values after replacing with medians:", missing_values_after, sep="\n")

In [None]:
# C. Data Preperation - 1. Data Cleaning
def remove_outliers(df, columns, threshold=3):
    """Drop rows whose z-score in any listed column reaches ``threshold``.

    Columns are processed one after another; mean and standard deviation of
    each column are computed on the rows that survived the previous columns.

    Args:
        df: Input DataFrame (not modified; a filtered frame is returned).
        columns: Names of the columns to screen.
        threshold: Absolute z-score at or above which a value is an outlier.

    Returns:
        Tuple ``(filtered_frame, outliers)`` where ``outliers`` maps each
        column name to the list of values removed because of that column.
    """
    remaining = df
    outliers = {}
    for name in columns:
        values = remaining[name]
        z_scores = ((values - values.mean()) / values.std()).abs()
        is_outlier = z_scores >= threshold
        outliers[name] = values[is_outlier].tolist()
        remaining = remaining[~is_outlier]
    return remaining, outliers

# Columns screened for outliers.
columns_to_check = ['RestingBP', 'SerumCholesterol', 'MaxHeartRate', 'OldPeak']

# Build an outlier-free frame and collect the removed values per column.
# NOTE(review): `df_cleaned` is only used for the plots in this cell; the
# following cells keep working on `df`. Confirm whether the outlier-free
# frame was meant to feed the later steps.
df_cleaned, outliers = remove_outliers(df, columns_to_check)

# List the removed values for each screened column.
print("Outliers List\n")
for column in outliers:
    print(f"Outliers in {column}: {outliers[column]}")
print()


# One row per column: left = before removal, right = after removal.
n_checked = len(columns_to_check)
fig, axes = plt.subplots(nrows=n_checked, ncols=2, figsize=(12, 6*n_checked))
for (before_ax, after_ax), column in zip(axes, columns_to_check):
    sns.histplot(df[column], ax=before_ax, kde=True, bins=20)
    before_ax.set_title(f'Distribution of {column} Before Removing Outliers')
    sns.histplot(df_cleaned[column], ax=after_ax, kde=True, bins=20)
    after_ax.set_title(f'Distribution of {column} After Removing Outliers')
plt.tight_layout()
plt.show()

## C.2. Data Transformation

In [None]:
# Class frequencies before the transformation
print_col('HeartDisease')

# Collapse the target to two classes: every value above 0 becomes 1, the rest 0.
df['HeartDisease'] = (df['HeartDisease'] > 0).astype(np.int64)

# Class frequencies after the transformation
print_col('HeartDisease')

In [None]:
# C. Data Preparation - 2. Data Transformation
# Continuous columns that get replaced by ordinal bins.
columns_to_binned = ['Age', 'RestingBP', 'SerumCholesterol', 'MaxHeartRate', 'OldPeak']

# Visualize before binning
n_to_bin = len(columns_to_binned)
fig, axes = plt.subplots(nrows=n_to_bin, ncols=1, figsize=(8, 6*n_to_bin))
for axis, column in zip(axes, columns_to_binned):
    sns.histplot(df[column], ax=axis, kde=True, bins=20)
    axis.set_title(f'Distribution of {column} Before Binning')
plt.tight_layout()
plt.show()


# Bin edges and the numerical label assigned to each interval.
# Age
age_bins = [10, 20, 30, 40, 50, 60, np.inf]
age_labels = [1, 2, 3, 4, 5, 6]

# Resting blood pressure
trestbps_bins = [0, 100, 120, 140, 160, np.inf]
trestbps_labels = [1, 2, 3, 4, 5]

# Serum cholesterol
chol_bins = [0, 160, 200, 240, 280, np.inf]
chol_labels = [1, 2, 3, 4, 5]

# Maximum heart rate achieved
thalach_bins = [0, 90, 120, 150, 180, np.inf]
thalach_labels = [1, 2, 3, 4, 5]

# ST depression induced by exercise
oldpeak_bins = [0, 0.5, 1.5, 2.5, 3.5, np.inf]
oldpeak_labels = [1, 2, 3, 4, 5]

# Source column -> (edges, labels); insertion order matches columns_to_binned.
binning_plan = {
    'Age': (age_bins, age_labels),
    'RestingBP': (trestbps_bins, trestbps_labels),
    'SerumCholesterol': (chol_bins, chol_labels),
    'MaxHeartRate': (thalach_bins, thalach_labels),
    'OldPeak': (oldpeak_bins, oldpeak_labels),
}

# Add one '<column>_binned' column per entry; the lowest edge is inclusive.
for source_column, (edges, labels) in binning_plan.items():
    df[f'{source_column}_binned'] = pd.cut(df[source_column], bins=edges,
                                           labels=labels, include_lowest=True)

# Drop original numerical columns
df = df.drop(columns=columns_to_binned)



columns_binned = [f'{source_column}_binned' for source_column in columns_to_binned]

# Visualize after binning
n_binned = len(columns_binned)
fig, axes = plt.subplots(nrows=n_binned, ncols=1, figsize=(8, 6*n_binned))
for axis, column in zip(axes, columns_binned):
    sns.histplot(df[column], ax=axis, kde=False, bins=None)
    axis.set_title(f'Distribution of {column} After Binning')
plt.tight_layout()
plt.show()



In [None]:
# Correlation matrix of the transformed frame, shown as an annotated heatmap.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(10, 10))
# Draw explicitly on the axes created above.
sns.heatmap(corr_matrix,
            annot=True,
            linewidths=0.5,
            fmt=".2f",
            cmap="YlGnBu",
            ax=ax)
plt.show()

In [None]:
# Columns picked from the correlation matrix (noted by the author as being
# at about +/- 0.25). NOTE(review): confirm the exact cut-off that was applied.
correlated_features = ['HeartDisease','Gender', 'ChestPainType', 'ExerciseAngina', 'ExerciseSlope',
                     'MajorVessels', 'ThalliumStress', 'MaxHeartRate_binned', 'OldPeak_binned']

# One histogram (with KDE curve) per selected column, each in its own figure.
for column in correlated_features:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.histplot(df[column], ax=ax, kde=True, bins=20)
    ax.set(title=f'Distribution of {column}', xlabel=column, ylabel='Frequency')
    plt.show()

# D. Modelling
1. Scale
2. Set seed for reproducibility
3. Split data
4. Train Data
5. Print Results

In [None]:
# Select only the relevant features
# In the correlation matrix, these attributes were +/- 0.25.
relevant_features = ['Gender', 'ChestPainType', 'ExerciseAngina', 'ExerciseSlope',
                     'MajorVessels', 'ThalliumStress', 'MaxHeartRate_binned', 'OldPeak_binned']
X = df[relevant_features]
# If you want them all:
# X = df.drop('HeartDisease' , axis=1)
y = df['HeartDisease']

# Scaling the data improves stability, consistency and performance.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Same random number generated every time: Reproducibility
np.random.seed(564)
test_sizes = {}

#for i in range(5, 51, 5):
# Commonly 80/20, 70/30, 50/50.
# for i in [20, 30, 50]:
for i in [20, 30, 50]:
    test_sizes[f"{i}%"] = i / 100

count = 0
best_models = {}
for sizeOfT, testSize in test_sizes.items():
    print(f"\n--------  Iteration {count} Test size: {sizeOfT}: --------")
    count += 1

    # Split data using the fixed random state
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=testSize, random_state=200) # Same split of data everytime

    # All models used
    models = {
        "Logistic Regression": LogisticRegression(),
        "Decision Tree": DecisionTreeClassifier(),
        "SVM": SVC()
    }

    modelPerformance = {}

    # Initialize StratifiedKFold
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=200)

    # Fit the models
    for model_name, model in models.items():
        # Normal method: Train and evaluate the model on the test set
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        test_accuracy = accuracy_score(y_test, y_pred)
        print(f"{model_name} - Test Accuracy: {test_accuracy:.3f}")

        # StratifiedKFold for cross-validation
        cv_scores = cross_val_score(model, X, y, cv=skf)
        avg_cv_score = cv_scores.mean()
        print(f"{model_name} - Cross-Validation Accuracy: {avg_cv_score:.3f}")
        modelPerformance[model_name] = (avg_cv_score, test_accuracy)
        print()


    # Find and print the best model for this test size based on cross-validation score
    best_model_name = max(modelPerformance, key=lambda x: modelPerformance[x])
    best_model_cv_accuracy, best_model_test_accuracy = modelPerformance[best_model_name]
    print(f"Best Model: {best_model_name} - Cross-Validation Accuracy: {best_model_cv_accuracy:.3f} , Test Accuracy: {best_model_test_accuracy:.3f}")

    # Store the best model for each test size
    best_models[sizeOfT] = (best_model_name, best_model_cv_accuracy, best_model_test_accuracy)

In [None]:
# Display best models for each test size
print("\nBest Models for Each Test Size:")
for test_size, (best_model, cv_accuracy, test_accuracy) in best_models.items():
    print(f"Test Size: {test_size}\n  Best Model: {best_model}\n  CV Accuracy: {cv_accuracy:.3f}\n  Test Accuracy: {test_accuracy:.3f}\n")

# Visualization
import matplotlib.pyplot as plt
import numpy as np

test_sizes = list(best_models.keys())
models = ["Logistic Regression", "Decision Tree", "SVM"]

for test_size in test_sizes:
    cv_accuracies = [modelPerformance[model][0] for model in models]
    test_accuracies = [modelPerformance[model][1] for model in models]

    fig, ax = plt.subplots(figsize=(10, 6))
    x = np.arange(len(models))
    bar_width = 0.35

    # Plot cross-validation and test accuracy bars
    ax.bar(x - bar_width/2, cv_accuracies, bar_width, label='CV Accuracy')
    ax.bar(x + bar_width/2, test_accuracies, bar_width, label='Test Accuracy')
    ax.set_xticks(x)
    ax.set_xticklabels(models)
    ax.set_xlabel('Model')
    ax.set_ylabel('Accuracy')
    ax.set_title(f'Model Performance at {test_size}')
    ax.legend(loc='lower left')
    plt.show()


In [None]:
# Print the value frequencies of every column of the prepared frame.
print_group_sizes(df)

In [None]:
# Fit the logistic regression used by the interactive prediction below.
# NOTE(review): X_train / y_train are whatever the last iteration of the
# test-size loop left behind, not a dedicated split -- confirm this is intended.
model = LogisticRegression().fit(X_train, y_train)
# Function to get user input and predict
def _prompt_for_choice(prompt, allowed, invalid_message):
    """Keep asking until the user types an integer contained in ``allowed``.

    Args:
        prompt: Text passed to ``input()``.
        allowed: Container of acceptable integer answers.
        invalid_message: Printed when the answer is an integer outside ``allowed``.

    Returns:
        The accepted integer.
    """
    while True:
        try:
            answer = int(input(prompt))
        except ValueError:
            print("Please enter a numeric value.")
            continue
        if answer in allowed:
            return answer
        print(invalid_message)


def get_user_input_and_predict():
    """Ask for the eight model features on stdin and print the prediction.

    The answers are collected in the same order as the feature columns used
    for training (Gender, ChestPainType, ExerciseAngina, ExerciseSlope,
    MajorVessels, ThalliumStress, MaxHeartRate_binned, OldPeak_binned),
    scaled with the notebook-level ``scaler`` and passed to the notebook-level
    ``model``. The occurrence counts shown in the prompts are hard-coded.

    Returns:
        None. The predicted class and the class probabilities are printed.
    """
    print("Enter the following values:")

    # One entry per feature: (prompt text, accepted values, out-of-range message).
    questions = [
        # Gender: 1 = male, 0 = female
        ("Gender (1 for male, 0 for female): \n"
         "  1: Male (206 occurrences)\n"
         "  0: Female (97 occurrences)\n"
         "Enter your gender: ",
         (0, 1),
         "Please enter 0 for female or 1 for male."),
        # Chest pain type: 1 typical, 2 atypical, 3 non-anginal, 4 asymptomatic
        ("Chest Pain Type (1-4): \n"
         "  1: Typical angina (23 occurrences)\n"
         "  2: Atypical angina (50 occurrences)\n"
         "  3: Non-anginal pain (86 occurrences)\n"
         "  4: Asymptomatic (144 occurrences)\n"
         "Enter your chest pain type: ",
         range(1, 5),
         "Please enter a value between 1 and 4."),
        # Exercise-induced angina: 1 = yes, 0 = no
        ("Exercise Angina (1 for yes, 0 for no): \n"
         "  1: Yes (99 occurrences)\n"
         "  0: No (204 occurrences)\n"
         "Enter your exercise angina status: ",
         (0, 1),
         "Please enter 0 for no or 1 for yes."),
        # Exercise slope: 1 up, 2 flat, 3 down
        ("Exercise Slope (1-3): \n"
         "  1: Up (142 occurrences)\n"
         "  2: Flat (140 occurrences)\n"
         "  3: Down (21 occurrences)\n"
         "Enter your exercise slope: ",
         range(1, 4),
         "Please enter a value between 1 and 3."),
        # Number of major vessels: 0-3
        ("Major Vessels (0-3): \n"
         "  0: No major vessels (180 occurrences)\n"
         "  1: One major vessel (65 occurrences)\n"
         "  2: Two major vessels (38 occurrences)\n"
         "  3: Three major vessels (20 occurrences)\n"
         "Enter the number of major vessels: ",
         range(0, 4),
         "Please enter a value between 0 and 3."),
        # Thallium stress result: 3 normal, 6 fixed defect, 7 reversible defect
        ("Thallium Stress (3/6/7): \n"
         "  3: Normal (168 occurrences)\n"
         "  6: Fixed defect (18 occurrences)\n"
         "  7: Reversible defect (117 occurrences)\n"
         "Enter your thallium stress result: ",
         (3, 6, 7),
         "Please enter 3 for normal, 6 for fixed defect, or 7 for reversible defect."),
        # Max heart rate bin. The ranges mirror the edges used when the column
        # was binned (0, 90, 120, 150, 180, inf; upper edge inclusive), so the
        # value a user picks lands in the same bin the model was trained on.
        ("Max Heart Rate Binned (1-5): \n"
         "  1: Up to 90 beats per minute (3 occurrences)\n"
         "  2: 91-120 beats per minute (34 occurrences)\n"
         "  3: 121-150 beats per minute (102 occurrences)\n"
         "  4: 151-180 beats per minute (146 occurrences)\n"
         "  5: More than 180 beats per minute (18 occurrences)\n"
         "Enter your max heart rate binned value: ",
         range(1, 6),
         "Please enter a value between 1 and 5."),
        # Old peak bin. Edges used when binning: 0, 0.5, 1.5, 2.5, 3.5, inf
        # (upper edge inclusive).
        ("Old Peak Binned (1-5): \n"
         "  1: Up to 0.5 mm (135 occurrences)\n"
         "  2: Over 0.5 to 1.5 mm (83 occurrences)\n"
         "  3: Over 1.5 to 2.5 mm (47 occurrences)\n"
         "  4: Over 2.5 to 3.5 mm (25 occurrences)\n"
         "  5: More than 3.5 mm (13 occurrences)\n"
         "Enter your old peak binned value: ",
         range(1, 6),
         "Please enter a value between 1 and 5."),
    ]

    # Ask the questions in feature order and collect the answers.
    input_data = [_prompt_for_choice(*question) for question in questions]

    # Scale the single sample with the scaler fitted on the training features.
    input_data = scaler.transform([input_data])

    # Predict
    prediction = model.predict(input_data)
    probability = model.predict_proba(input_data)
    print("Prediction: ",prediction)

    # `prediction` holds one element per sample; read the single sample explicitly.
    print(f"Prediction: {'Yes' if prediction[0] == 1 else 'No'}")
    # Probability[0] for 0, [1] for 1
    print(f"Probability: {probability}")

# Run the function to get user input and predict.
# The call waits for answers typed via input(), so this cell is interactive.
get_user_input_and_predict()
