In [None]:
import os

# Enumerate every file under the Kaggle input mount, printing one full path per line.
for folder, _, file_names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, leaf) for leaf in file_names)
    for full_path in full_paths:
        print(full_path)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the cleaned 2020 heart-disease indicators CSV from the Kaggle input mount.
heart_2020_cleaned = pd.read_csv('/kaggle/input/personal-key-indicators-of-heart-disease/2020/heart_2020_cleaned.csv')

# Quick structural overview of the raw frame.
print('Head:\n', heart_2020_cleaned.head())
print('Shape:\n', heart_2020_cleaned.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be passed to print() (that would emit a stray "None").
print('Info:')
heart_2020_cleaned.info()
print('Null:\n', heart_2020_cleaned.isnull().sum())
print('Duplicated:\n', heart_2020_cleaned.duplicated().sum())
print('Describe:\n', heart_2020_cleaned.describe())

In [None]:
# Look at the distinct labels stored in 'Sex' before they are replaced with integer codes.
heart_2020_cleaned['Sex'].unique()

In [None]:
# Look at the distinct labels stored in 'Diabetic'.
# NOTE(review): no explicit mapping for this column appears in the visible cells;
# presumably it is still object-typed when the modelling cell label-encodes the
# remaining object columns — confirm.
heart_2020_cleaned['Diabetic'].unique()

In [None]:
# Integer codes for the two label sets handled in this cell. Each encoding gets
# its own name instead of rebinding one shared `mapping` variable.
sex_codes = {
    'Male': 0,
    'Female': 1
}
yes_no_codes = {
    'No': 0,
    'Yes': 1
}

# Every column that holds plain 'Yes'/'No' answers, including the target.
yes_no_columns = [
    'HeartDisease',
    'Smoking',
    'AlcoholDrinking',
    'Stroke',
    'DiffWalking',
    'PhysicalActivity',
    'Asthma',
    'KidneyDisease',
    'SkinCancer',
]

# Series.map turns every value that is missing from the dict into NaN, so this
# cell must not be re-run on columns that are already encoded: reload the CSV
# first, otherwise the integer codes are silently replaced with NaN.
heart_2020_cleaned['Sex'] = heart_2020_cleaned['Sex'].map(sex_codes)
for column in yes_no_columns:
    heart_2020_cleaned[column] = heart_2020_cleaned[column].map(yes_no_codes)

# Last expression of the cell: show the encoded frame.
heart_2020_cleaned.head()

In [None]:
# Look at the distinct labels stored in 'GenHealth' before they are replaced with integer codes.
heart_2020_cleaned['GenHealth'].unique()

In [None]:
# 'GenHealth' labels listed in the order of their integer codes (first label -> 0).
genhealth_levels = ['Poor', 'Fair', 'Good', 'Very good', 'Excellent']
mapping = {level: code for code, level in enumerate(genhealth_levels)}

# Replace each label with its position in the list above.
heart_2020_cleaned['GenHealth'] = heart_2020_cleaned['GenHealth'].map(mapping)
heart_2020_cleaned.head()

In [None]:
# Look at the distinct labels stored in 'AgeCategory' before they are replaced with integer codes.
heart_2020_cleaned['AgeCategory'].unique()

In [None]:
# Age brackets listed in the order of their integer codes (first bracket -> 0).
age_brackets = (
    '18-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54',
    '55-59', '60-64', '65-69', '70-74', '75-79', '80 or older',
)
mapping = dict(zip(age_brackets, range(len(age_brackets))))

# Replace each bracket label with its position in the tuple above.
heart_2020_cleaned['AgeCategory'] = heart_2020_cleaned['AgeCategory'].map(mapping)
heart_2020_cleaned.head()

In [None]:
# Look at the distinct labels stored in 'Race' before they are replaced with integer codes.
heart_2020_cleaned['Race'].unique()

In [None]:
# Assign consecutive integer codes to the 'Race' labels in the order listed:
# each new label receives the current size of the dict as its code.
# NOTE(review): these labels look nominal, so the integer order is arbitrary —
# confirm that downstream models are meant to see them as ordered numbers.
mapping = {}
for race_label in (
    'White',
    'Black',
    'Hispanic',
    'Asian',
    'American Indian/Alaskan Native',
    'Other',
):
    mapping[race_label] = len(mapping)

heart_2020_cleaned['Race'] = heart_2020_cleaned['Race'].map(mapping)
heart_2020_cleaned.head()

In [None]:
# Re-count missing values per column after the dictionary-based encodings:
# Series.map yields NaN for any label absent from its dict, so new missing
# values here would point at an incomplete mapping.
null_counts = heart_2020_cleaned.isna().sum()
print('Null:\n', null_counts)

In [None]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Split the frame into the feature matrix and the HeartDisease target.
X = heart_2020_cleaned.drop('HeartDisease', axis=1)
y = heart_2020_cleaned['HeartDisease']

# Feature columns that are still object-typed need an integer encoding.
categorical_cols = X.select_dtypes(include=['object']).columns

# `le` is fitted on the target only, so `le.classes_` keeps describing the
# HeartDisease labels. Re-fitting this same encoder on each feature column
# would overwrite `classes_` with the labels of whichever column came last.
le = LabelEncoder()
y = le.fit_transform(y)

# One dedicated encoder per feature column, kept so the codes can be decoded later.
feature_encoders = {}
for col in categorical_cols:
    feature_encoders[col] = LabelEncoder()
    X[col] = feature_encoders[col].fit_transform(X[col])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the tree with the same value as the split so repeated runs give the
# same tree and therefore the same importances.
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(X_train, y_train)

features = X.columns
importances = clf.feature_importances_
feature_importances_df = pd.DataFrame({'Feature': features, 'Importance': importances})
# sort_values returns a new frame; assign it back so the printed table and the
# bar chart are actually ranked by importance.
feature_importances_df = (
    feature_importances_df
    .sort_values(by='Importance', ascending=False)
    .reset_index(drop=True)
)
print(feature_importances_df)
feature_importances_df.plot(kind='bar', x='Feature', y='Importance')

In [None]:
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, Perceptron, SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [None]:
# Candidate classifiers, keyed by class name. Estimators whose fit is
# randomised and unseeded by default are given random_state=42 (the value used
# for the train/test split) so the accuracy table is reproducible across runs.
# Currently disabled candidates: AdaBoostClassifier, BaggingClassifier,
# DummyClassifier, LogisticRegression, LogisticRegressionCV, SGDClassifier.
models = {
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=42),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'MLPClassifier': MLPClassifier(random_state=42),
    'Perceptron': Perceptron(),
    'RandomForestClassifier': RandomForestClassifier(random_state=42),
    'SVC': SVC()
}

In [None]:
# Train every candidate on the training split and record its test-set accuracy.
results = {}
for name, model in models.items():
    # fit() returns the estimator itself, so scoring can be chained onto it.
    results[name] = model.fit(X_train, y_train).score(X_test, y_test)

# Tabulate the scores, one row per model, in insertion order.
results_df = pd.DataFrame({
    'Model': list(results.keys()),
    'Accuracy': list(results.values()),
})
print(results_df)

In [None]:
from mlxtend.frequent_patterns import apriori, association_rules

# Prepare the data for association rule mining: an item counts as "present" in
# a row when the feature value is above that feature's mean.
thresholds = X.mean()
# Keep the comparison result as booleans. mlxtend's apriori is designed for
# boolean input; an integer 0/1 frame triggers a deprecation warning and a
# slower code path while producing the same itemsets.
binarized_data = X > thresholds

# Perform Apriori algorithm (itemsets present in at least 10% of the rows)
frequent_itemsets = apriori(binarized_data, min_support=0.1, use_colnames=True)

# Generate association rules with confidence of at least 0.6
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.6)

# Sort rules by confidence, strongest first
rules = rules.sort_values(by='confidence', ascending=False)

print(rules)

In [None]:
# Keep only the stronger rules: lift above 1.2 and confidence above 0.7.
is_strong = rules['lift'].gt(1.2) & rules['confidence'].gt(0.7)
filtered_rules = rules[is_strong]


def _itemset_to_text(itemset):
    """Render an itemset as a comma-separated string of its items."""
    return ', '.join(list(itemset))


# Flatten both rule sides to text, then left-join the tree's feature
# importances on the antecedent text.
# NOTE(review): the join key is the whole joined antecedent string, so only
# rules whose antecedent is a single feature name can match a 'Feature' row;
# multi-item antecedents end up with a missing Importance — confirm intent.
rules_with_importance = filtered_rules.assign(
    antecedents=filtered_rules['antecedents'].apply(_itemset_to_text),
    consequents=filtered_rules['consequents'].apply(_itemset_to_text),
).merge(feature_importances_df, left_on='antecedents', right_on='Feature', how='left')

report_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift', 'Importance']
print(rules_with_importance[report_columns])

In [None]:
from sklearn.tree import _tree

def get_rules(tree, feature_names, class_names):
    """Flatten a fitted decision tree into one text rule per leaf.

    Args:
        tree: Fitted estimator exposing a ``tree_`` attribute with ``feature``,
            ``threshold``, ``children_left``, ``children_right``, ``value`` and
            ``n_node_samples`` arrays.
        feature_names: Sequence indexed by the feature ids stored in the tree.
        class_names: Sequence indexed by the argmax of a leaf's ``value``.

    Returns:
        List of strings, one per leaf, ordered left branch before right branch.
        Each string joins the split conditions on the way to the leaf and the
        leaf summary with " and ".
    """
    fitted = tree.tree_
    leaf_marker = _tree.TREE_UNDEFINED

    # Human-readable split feature for every node; leaves carry no feature.
    split_labels = []
    for feature_id in fitted.feature:
        if feature_id == leaf_marker:
            split_labels.append("undefined!")
        else:
            split_labels.append(feature_names[feature_id])

    rendered = []
    # Explicit depth-first stack of (node id, conditions collected so far).
    pending = [(0, ())]
    while pending:
        node_id, conditions = pending.pop()

        if fitted.feature[node_id] == leaf_marker:
            counts = fitted.value[node_id]
            verdict = (
                f"then class: {class_names[counts.argmax()]} "
                f"(proba: {counts.max() / counts.sum() * 100:.2f}%) "
                f"based on {fitted.n_node_samples[node_id]} samples"
            )
            rendered.append(" and ".join(conditions + (verdict,)))
            continue

        label = split_labels[node_id]
        cut = fitted.threshold[node_id]
        # Push the right child first so the left child is popped (visited) first.
        pending.append((fitted.children_right[node_id], conditions + (f"({label} > {cut})",)))
        pending.append((fitted.children_left[node_id], conditions + (f"({label} <= {cut})",)))

    return rendered

# Get the rules. Class labels are taken from the fitted tree (clf.classes_):
# the columns of tree_.value follow that order, so the printed label always
# matches the predicted class regardless of which column any LabelEncoder was
# last fitted on.
# NOTE: this rebinds `rules`, which earlier held the association-rules
# DataFrame; re-running the rule-filtering cell after this one will fail.
rules = get_rules(clf, X.columns, clf.classes_)
for rule in rules:
    print(rule)