<a href="https://colab.research.google.com/github/Brynlai/DataScienceHeartDiseaseAssignment/blob/Bryan/HeartDiseasev1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# @title
!pip install ucimlrepo
!pip install pandas matplotlib seaborn scikit-learn

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Fetch the Heart Disease dataset from the UCI ML Repository
heart_disease_bunch = fetch_ucirepo(id=45)

# Show a compact overview instead of printing the whole result object:
# the per-variable description table and the shapes of the feature/target frames.
print(heart_disease_bunch.variables)
print("Features shape:", heart_disease_bunch.data.features.shape)
print("Targets shape:", heart_disease_bunch.data.targets.shape)

In [None]:
# Load into DataFrame
# The features frame already carries its own column names and row index, so it is
# used as-is and joined column-wise with the targets (concat aligns on the index).
heart_disease = pd.concat([heart_disease_bunch.data.features,
                           heart_disease_bunch.data.targets], axis=1)
df = heart_disease
# DataFrame.info() prints its report itself and returns None, so it is not wrapped in print().
df.info()

# Collapse the 'num' label into a binary target: 0 stays 0, any positive value becomes 1.
df = df.rename(columns={'num': 'target'})
df['target'] = (df['target'] > 0).astype('int64')

# Readable names for the abbreviated dataset columns.
column_names = {
    "age": "Age",
    "sex": "Gender",
    "cp": "ChestPainType",
    "trestbps": "RestingBP",
    "fbs": "FastBloodSugar",
    "restecg": "RestingECG",
    "exang": "ExerciseAngina",
    "slope": "ExerciseSlope",
    "ca": "MajorVessels",
    "thal": "ThalliumStress",
    "target": "HeartDisease",
    "chol": "SerumCholesterol",
    "thalach": "MaxHeartRate",
    "oldpeak": "OldPeak"
}
df = df.rename(columns=column_names)

# Frequency table per column: number of rows for each distinct value.
# Note: groupby() drops missing keys by default, so NaN rows are not counted here.
for column in df.columns:
    group_sizes = df.groupby(column).size()
    print(f"Column: {column}")
    print(group_sizes)
    print()  # Empty line for better readability

In [None]:
# DATA CLEANING: duplicate observations
# Report how many rows repeat an earlier row in every column.
print("Number of duplicate rows before:", df.duplicated().sum())

# Keep a single copy of each repeated row.
df = df.drop_duplicates()

# Run the same check on the de-duplicated frame to confirm none remain.
duplicate_rows = df.duplicated()
print("Number of duplicate rows after:", duplicate_rows.sum())

In [None]:
# Check for missing values in each column
missing_values = df.isnull().sum()
print("Missing values in each column:")
print(missing_values)

# Replace missing values with the column median for every numeric column.
# select_dtypes(include="number") covers all numeric dtypes (any int/float width),
# not only int64/float64, so no numeric column is silently skipped.
# NOTE(review): columns holding category codes are also imputed with the median here —
# confirm that is intended rather than the most frequent value.
for column in df.select_dtypes(include="number").columns:
    df[column] = df[column].fillna(df[column].median())

# Check if there are any missing values left
missing_values_after = df.isnull().sum()
print("Missing values after replacing with medians:")
print(missing_values_after)

# Frequency table per column after imputation: number of rows for each distinct value.
for column in df.columns:
    group_sizes = df.groupby(column).size()
    print(f"Column: {column}")
    print(group_sizes)
    print()  # Empty line for better readability

Remove Outliers

In [None]:
def remove_outliers(df, columns, threshold=4):
    """Drop rows whose value in any listed column lies far from that column's mean.

    Columns are processed in the order given. For each one, the z-score
    ``|value - mean| / std`` is computed on the rows that are still present
    (i.e. after earlier columns have already been filtered), and rows with a
    z-score greater than or equal to ``threshold`` are removed.

    Args:
        df: Input DataFrame. It is not modified; a filtered frame is returned.
        columns: Iterable of column names to check.
        threshold: Minimum absolute z-score for a value to count as an outlier.

    Returns:
        A tuple ``(filtered_df, outliers)`` where ``outliers`` maps each column
        name to the list of values removed because of that column.
    """
    outliers = {}
    for column in columns:
        values = df[column]
        z_scores = ((values - values.mean()) / values.std()).abs()
        is_outlier = z_scores >= threshold
        # Record the offending values before dropping their rows.
        outliers[column] = values[is_outlier].tolist()
        df = df[~is_outlier]
    return df, outliers

# Specify the columns to check for outliers
columns_to_check = ['RestingBP', 'SerumCholesterol', 'MaxHeartRate', 'OldPeak']

# Remove outliers from the DataFrame and print the outliers
df_cleaned, outliers = remove_outliers(df, columns_to_check)
for column, outlier_list in outliers.items():
    print(f"Outliers in {column}: {outlier_list}")

# Create distribution plots before and after removing outliers.
# squeeze=False keeps `axes` two-dimensional even if only one column is checked,
# so the axes[i, j] indexing below always works.
fig, axes = plt.subplots(nrows=len(columns_to_check), ncols=2,
                         figsize=(12, 6 * len(columns_to_check)), squeeze=False)
for i, column in enumerate(columns_to_check):
    sns.histplot(df[column], ax=axes[i, 0], kde=True, bins=20)
    axes[i, 0].set_title(f'Distribution of {column} Before Removing Outliers')
    sns.histplot(df_cleaned[column], ax=axes[i, 1], kde=True, bins=20)
    axes[i, 1].set_title(f'Distribution of {column} After Removing Outliers')
plt.tight_layout()
plt.show()

# Frequency table per column of the outlier-free frame, so the effect of the removal is visible.
# NOTE(review): the later cells still read `df`, so the rows removed here remain in the data
# they analyse — confirm whether `df_cleaned` should replace `df` downstream.
for column in df_cleaned.columns:
    # Group by the column and get the size of each group
    group_sizes = df_cleaned.groupby(column).size()
    print(f"Column: {column}")
    print(group_sizes)
    print()  # Empty line for better readability

In [None]:
# Frequency table for every column of `df`: number of rows for each distinct value.
for column in df.columns:
    group_sizes = df.groupby(column).size()
    # One call emits the header, the counts and a trailing blank separator line.
    print(f"Column: {column}", group_sizes, "", sep="\n")

In [None]:
# Correlation matrix
# Pairwise correlation between all columns of `df` (DataFrame.corr default method).
corr_matrix = df.corr()

# Draw onto an explicitly created axes so figure size and title apply to this plot.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr_matrix,
            annot=True,
            linewidths=0.5,
            fmt=".2f",
            cmap="YlGnBu",
            ax=ax)
ax.set_title("Correlation matrix")
plt.show()

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Needed so each model can carry its own scaler (see the model definitions below).
from sklearn.pipeline import make_pipeline

# Prepare the data: every column except the label is a feature.
X = df.drop('HeartDisease', axis=1)
y = df['HeartDisease']

# Seed NumPy's global RNG; estimators created without `random_state` draw from it.
np.random.seed(564)

# Candidate hold-out fractions, 15% to 22% in one-point steps, keyed by a display label.
test_sizes = {f"{i}%": i / 100 for i in range(15, 23)}

best_models = {}
for count, (sizeOfT, testSize) in enumerate(test_sizes.items()):
    print(f"\n---Iteration {count} Test size: {sizeOfT}:")

    # Split the data using the fixed random state
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=testSize, random_state=200)

    # Define the models. Scaling lives inside each pipeline so the scaler is fitted only on
    # the rows being trained on (each CV training fold, then the full training split) and
    # never sees validation-fold or test rows.
    models = {
        model_name: make_pipeline(StandardScaler(), estimator)
        for model_name, estimator in {
            "Logistic Regression": LogisticRegression(max_iter=1000, solver='sag'),
            "Decision Tree": DecisionTreeClassifier(),
            "Random Forest": RandomForestClassifier(),
            "Gradient Boosting": GradientBoostingClassifier(),
            "SVM": SVC()
        }.items()
    }

    # model name -> (mean CV accuracy, test accuracy)
    modelPerformance = {}

    # Perform k-fold cross-validation
    for model_name, model in models.items():
        # 5-fold cross-validation on the training split only
        cv_scores = cross_val_score(model, X_train, y_train, cv=5)
        avg_cv_score = cv_scores.mean()
        print(f"{model_name} with Cross-Validation Accuracy: {avg_cv_score:.3f}")

        # Train on the full training split and evaluate on the held-out test set
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        test_accuracy = accuracy_score(y_test, y_pred)
        print(f"{model_name} with Test Accuracy: {test_accuracy:.3f}")
        modelPerformance[model_name] = (avg_cv_score, test_accuracy)

    # Pick the best model for this test size by cross-validation score alone, so the
    # test accuracy is only reported and never influences the selection.
    best_model_name = max(modelPerformance, key=lambda name: modelPerformance[name][0])
    best_model_cv_accuracy, best_model_test_accuracy = modelPerformance[best_model_name]
    print(f"Best Model: {best_model_name} with Cross-Validation Accuracy: {best_model_cv_accuracy:.3f} and Test Accuracy: {best_model_test_accuracy:.3f}")

    # Store the best model for each test size
    best_models[sizeOfT] = (best_model_name, best_model_cv_accuracy, best_model_test_accuracy)

# Print the best models for each test size
print("\nBest Models for Each Test Size:")
for test_size, (best_model, cv_accuracy, test_accuracy) in best_models.items():
    print(f"Test Size: {test_size}, Best Model: {best_model} with Cross-Validation Accuracy: {cv_accuracy:.3f} and Test Accuracy: {test_accuracy:.3f}")

In [None]:
# Import the required module from scipy
from scipy import stats

# Split the ages into two samples according to the HeartDisease label.
age_with_disease = df.loc[df['HeartDisease'] == 1, 'Age']
age_without_disease = df.loc[df['HeartDisease'] == 0, 'Age']

# Significance level used for the decision below.
alpha = 0.05

# Independent two-sample t-test on the two age samples (SciPy defaults).
t_statistic, p_value = stats.ttest_ind(age_with_disease, age_without_disease)

# Report the raw test results.
print(f'T-statistic: {t_statistic}')
print(f'P-value: {p_value}')

# Turn the p-value into a verdict and print it.
if p_value < alpha:
    conclusion = f"Since {p_value} < {alpha}, we reject the null hypothesis (H₀) and conclude that there is a significant difference in mean age between patients with and without heart disease."
else:
    conclusion = f"Since {p_value} >= {alpha}, we do not reject the null hypothesis (H₀) and conclude that there is no significant difference in mean age between patients with and without heart disease."
print(conclusion)