Data Analytics (CS61061), Project- DA-02,
Name- Avik Pramanick,
Roll No- 23CS60R78.

In [None]:
import pandas as pd
import numpy as np
from scipy.special import erfc
from scipy.stats import mannwhitneyu, chi2_contingency

In [None]:
# Load the dataset
heart_data = pd.read_csv("/content/heart_statlog_cleveland_hungary_final.csv")

In [None]:
# Rename columns
heart_data.columns = ['age', 'gender', 'chest_pain_type', 'resting_bp_s', 'cholesterol','fasting_blood_sugar', 'resting_ecg', 'max_heart_rate','exercise_angina', 'oldpeak', 'st_slope','target']

In [None]:
# Check for missing values
missing_values = heart_data.isnull().sum()

In [None]:
# Display the shape of the dataset
dataset_shape = heart_data.shape

In [None]:
# Define feature and target variables
features = ['age', 'gender', 'chest_pain_type', 'resting_bp_s', 'cholesterol','fasting_blood_sugar', 'resting_ecg', 'max_heart_rate','exercise_angina', 'oldpeak', 'st_slope']
target = ['target']

In [None]:
# Categorize features
nominal_features=['gender', 'chest_pain_type', 'fasting_blood_sugar', 'resting_ecg', 'exercise_angina', 'st_slope', 'target']
nominal_features = ['chest_pain_type', 'resting_ecg', 'st_slope']
categorical_features = ['gender', 'fasting_blood_sugar', 'exercise_angina']
numeric_features = ['age', 'resting_bp_s', 'cholesterol', 'max_heart_rate', 'oldpeak']

In [None]:
# Dictionary to store p-values
p_values_dict = {}

In [None]:

# Function for Mann-Whitney U test
def mann_whitney_u_test(sample1, sample2):
    n1 = len(sample1)
    n2 = len(sample2)

    # Combine the samples and assign ranks
    combined = np.concatenate([sample1, sample2])
    ranks = np.argsort(combined)

    # Calculate U statistic
    u1 = np.sum(ranks[:n1])
    u2 = n1 * (n1 + n2 + 1) - u1
    u = min(u1, u2)

    # Calculate expected U under the null hypothesis
    expected_u = n1 * n2 / 2

    # Calculate variance of U
    var_u = n1 * n2 * (n1 + n2 + 1) / 12

    # Calculate z-score
    z = (u - expected_u) / np.sqrt(var_u)

    # Calculate two-tailed p-value without using norm.cdf
    p_value = 2 * erfc(np.abs(z) / np.sqrt(2))

    return u, p_value

In [None]:
# Function for chi-square test
def chi_square_test(observed, num_simulations=10000):
    observed = np.array(observed)

    # Calculate expected frequencies under independence assumption
    row_totals = np.sum(observed, axis=1)
    col_totals = np.sum(observed, axis=0)
    total = np.sum(observed)

    expected = np.outer(row_totals, col_totals) / total

    # Calculate chi-square statistic
    chi2_statistic = np.sum((observed - expected)**2 / expected)

    # Calculate degrees of freedom
    df = (observed.shape[0] - 1) * (observed.shape[1] - 1)

    # Simulate null hypothesis distribution
    null_distribution = np.zeros(num_simulations)
    for i in range(num_simulations):
        simulated_row_totals = np.random.multinomial(total, row_totals / total)
        simulated_col_totals = np.random.multinomial(total, col_totals / total)

        simulated_data = np.outer(simulated_row_totals, simulated_col_totals) / total

        null_distribution[i] = np.sum((simulated_data - expected)**2 / expected)

    # Calculate empirical p-value
    p_value = 2 * (1 - np.sum(null_distribution > chi2_statistic) / num_simulations)

    return chi2_statistic, df, p_value


In [None]:
# Perform statistical tests for each variable
for feature in features:
    if feature in numeric_features:
        # Perform Mann-Whitney U test for continuous variable
        u_statistic, mw_p_value = mann_whitney_u_test(heart_data[heart_data['target'] == 0][feature], heart_data[heart_data['target'] == 1][feature])
        p_values_dict[feature] = mw_p_value
    elif feature in nominal_features + categorical_features:
        # Perform Chi-square test for nominal and categorical variable
        contingency_table = pd.crosstab(heart_data[feature], heart_data['target'])
        chi2_stat, chi2_df, chi2_p_value = chi_square_test(contingency_table)
        p_values_dict[feature] = chi2_p_value

In [None]:
# Sort p-values in ascending order
sorted_p_values = sorted(p_values_dict.items(), key=lambda x: x[1])

In [None]:




# Display the three most significant independent attributes
for feature, p_value in sorted_p_values[:3]:
    print(f"{feature}: p-value = {p_value}")

resting_bp_s: p-value = 5.2031730724787486e-145
cholesterol: p-value = 2.872275326406404e-131
max_heart_rate: p-value = 2.9746049597922405e-90
