In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind
import seaborn as sns

In [None]:
# Load the dataset and preview its first rows and its column summary.
data = pd.read_csv('dataset/heart_disease_dataset.csv')
print(data.head())
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would emit an extra "None" line.
data.info()

In [None]:
# Select the numeric columns, preview them, then fill missing values with
# each column's mean.
# 'number' matches every numeric dtype. Restricting the selection to 'int64'
# drops float columns, and an int64 column cannot hold NaN, so the mean
# imputation below would have nothing to fill.
numerical_columns = data.select_dtypes(include='number').columns
numerical_data = data[numerical_columns]
print(numerical_columns)
print(numerical_data.head())
numerical_data = numerical_data.fillna(numerical_data.mean())

In [None]:
# Standardize the data and build a transposed (features x samples) view of it.
scaler = StandardScaler()

# Standardized copy of every numeric column, target included; its first rows
# are printed below as a preview.
scaled_data = scaler.fit_transform(numerical_data)
print(scaled_data[:5])

# Separate the predictors from the 'Heart Disease' target, then re-fit the
# same scaler on the predictors alone.
features = numerical_data.drop('Heart Disease', axis=1)
scaled_features = scaler.fit_transform(features)

# One row per feature, one column per sample.
transposed_features = pd.DataFrame(
    scaled_features,
    index=features.index,
    columns=features.columns,
).T


In [None]:
# PCA on the standardized samples (rows = observations, columns = features).
pca = PCA(n_components=5)
pca_data = pca.fit_transform(scaled_features)

# Component labels are derived from the output width so they always match
# the number of fitted components.
pca_df = pd.DataFrame(
    data=pca_data,
    columns=[f'PC{i+1}' for i in range(pca_data.shape[1])],
)
print(pca_df.head())

# Exploratory PCA on the transposed matrix (rows = features), used for the
# per-feature bar chart and not in the final analysis.
# It gets its own estimator: re-fitting `pca` here would overwrite its
# explained_variance_ and components_, so the eigenvalue/eigenvector cells
# would describe this transposed fit instead of the sample-space PCA above.
pca_transposed = PCA(n_components=5)
pca_transposed_data = pca_transposed.fit_transform(transposed_features)
pca_features_df = pd.DataFrame(
    data=pca_transposed_data,
    columns=[f'PC{i+1}' for i in range(pca_transposed_data.shape[1])],
    index=transposed_features.index,
)
print(pca_features_df.head())


In [None]:
# Bar chart of each feature's value on one component of the transposed PCA.
# The component number is set once so the plotted column, the y-label and the
# title cannot disagree (the title used to say component 1 while PC4 was drawn).
component_number = 4
component_label = f'PC{component_number}'

plt.figure(figsize=(10, 6))
plt.bar(pca_features_df.index, pca_features_df[component_label], color='skyblue')
plt.xlabel('Features')
plt.ylabel(f'Contribution to {component_label}')
plt.title(f'Feature Contributions to Principal Component {component_number}')
plt.xticks(rotation=90)
plt.show()

In [None]:
# PCA restricted to a hand-picked subset of features.
selected_features = numerical_data[['Stress Level', 'Heart Rate', 'Blood Pressure', 'Age']]
# Note: this re-fits the shared `scaler` on the selected columns only.
scaled_selected_features = scaler.fit_transform(selected_features)

pca_selected = PCA(n_components=2)
pca_selected_data = pca_selected.fit_transform(scaled_selected_features)

# A two-component fit produces its own first and second components, so the
# columns are labelled PC1/PC2; they are not PC4/PC5 of the earlier PCA.
pca_selected_df = pd.DataFrame(pca_selected_data, columns=['PC1', 'PC2'])

# Column used to colour the points.
lbl = 'Stress Level'
c = numerical_data[lbl]

# Scatter of the two components, coloured by `lbl`.
plt.figure(figsize=(8, 6))
plt.scatter(pca_selected_df['PC1'], pca_selected_df['PC2'], c=c, cmap='coolwarm', edgecolor='k', alpha=0.7)
plt.colorbar(label=lbl)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('PCA on Selected Features')
plt.show()
# Preview only; printing the whole frame floods the cell output.
print(pca_selected_df.head())

In [None]:
# Read the eigenvalues (variance explained by each component) and the
# eigenvectors (component directions, one row per component) from the
# fitted `pca`, then print both.
eigenvalues = pca.explained_variance_
eigenvectors = pca.components_

print("Eigenvalues (Variance explained by each PC):\n", eigenvalues)
print("Eigenvectors (Principal Component Directions):\n", eigenvectors)

In [None]:
# Scree plot: the eigenvalue of each principal component, in component order
# (components are numbered from 1 on the x axis).
plt.figure(figsize=(8, 6))
plt.plot(
    np.arange(1, len(eigenvalues) + 1),
    eigenvalues,
    linestyle='--',
    marker='o',
)
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Eigenvalue (Variance Explained)')
plt.show()

In [None]:
# Rank the eigenvalues from largest to smallest and reorder the eigenvector
# rows so that each one stays paired with its eigenvalue.
sorted_indices = np.flip(np.argsort(eigenvalues))
sorted_eigenvalues = np.take(eigenvalues, sorted_indices)
sorted_eigenvectors = np.take(eigenvectors, sorted_indices, axis=0)

print("Largest Eigenvalues:\n", sorted_eigenvalues)
print("\nEigenvectors Corresponding to Largest Eigenvalues:\n", sorted_eigenvectors)

In [None]:
# Mean blood pressure for each heart-disease status.
# (The duplicate matplotlib import that trailed this cell was dropped;
# pyplot is already imported with the other imports at the top.)
mean_bp = numerical_data.groupby('Heart Disease')['Blood Pressure'].mean()
print("Mean Blood Pressure by Heart Disease Status:\n", mean_bp)

In [None]:
# Compare the blood pressure distribution between heart-disease groups.
mean_bp = numerical_data.groupby('Heart Disease')['Blood Pressure'].mean()
plt.figure(figsize=(8, 6))
# One box per status along the x axis, matching the x-label below.
# `hue` mirrors `x` only so that `palette` colours the boxes, and
# legend=False is passed to boxplot itself: written as a separate statement
# it merely created an unused variable and left the legend in place.
sns.boxplot(
    x='Heart Disease',
    y='Blood Pressure',
    hue='Heart Disease',
    data=numerical_data,
    palette='coolwarm',
    legend=False,
)
plt.xlabel('Heart Disease (0 = No, 1 = Yes)')
plt.ylabel('Blood Pressure')
plt.title('Blood Pressure Distribution by Heart Disease Status')
plt.show()

In [None]:
# Two-sample t-test on blood pressure: compares the readings of rows with
# 'Heart Disease' == 0 against those with 'Heart Disease' == 1.
bp_no_disease = numerical_data.loc[numerical_data['Heart Disease'] == 0, 'Blood Pressure']
bp_with_disease = numerical_data.loc[numerical_data['Heart Disease'] == 1, 'Blood Pressure']

t_stat, p_value = ttest_ind(bp_no_disease, bp_with_disease)
print("T-Test Results:")
print(f"T-statistic: {t_stat}, P-value: {p_value}")

# Report the outcome against the 0.05 significance threshold.
print(
    "The difference in blood pressure between the groups is statistically significant."
    if p_value < 0.05
    else "The difference in blood pressure between the groups is not statistically significant."
)

In [None]:
# Strip plot of the individual blood pressure values for each heart-disease status.
plt.figure(figsize=(8, 6))
# `hue` mirrors `x` so that `palette` is applied per group; seaborn deprecates
# passing `palette` without `hue`. legend=False hides the legend, which would
# only repeat the x-axis categories.
sns.stripplot(
    x='Heart Disease',
    y='Blood Pressure',
    hue='Heart Disease',
    data=numerical_data,
    palette='coolwarm',
    jitter=True,
    alpha=0.6,
    legend=False,
)
plt.xlabel('Heart Disease (0 = No, 1 = Yes)')
plt.ylabel('Blood Pressure')
plt.title('Blood Pressure Distribution by Heart Disease Status - Strip Plot')
plt.show()


In [None]:
# Strip plot of the individual stress level values for each heart-disease status.
plt.figure(figsize=(8, 6))
# `hue` mirrors `x` so that `palette` is applied per group; seaborn deprecates
# passing `palette` without `hue`. legend=False hides the legend, which would
# only repeat the x-axis categories.
sns.stripplot(
    x='Heart Disease',
    y='Stress Level',
    hue='Heart Disease',
    data=numerical_data,
    palette='coolwarm',
    jitter=True,
    alpha=0.6,
    legend=False,
)
plt.xlabel('Heart Disease (0 = No, 1 = Yes)')
plt.ylabel('Stress Level')
plt.title('Stress Level Distribution by Heart Disease Status - Strip Plot')
plt.show()