In [3]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.decomposition import PCA # As we already explain how to calculate PCA in our report, we will use a shortcut here
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [58]:
df = pd.read_csv('south_africa.csv').drop('row.names', axis=1)
df_clean = pd.read_csv('south_africa.csv').drop('row.names', axis=1)
df_clean['chd'] = df_clean['chd'].replace({1: 'Yes', 0: 'No'})
df['famhist'] = df['famhist'].replace({'Present': 1,'Absent': 0})
print("Dimensions of the csv file:", df.shape)
df_no_famhist_no_chd = df.drop(['chd'], axis=1)
df_clean.head()

Dimensions of the csv file: (462, 10)


Unnamed: 0,sbp,tobacco,ldl,adiposity,famhist,typea,obesity,alcohol,age,chd
0,160,12.0,5.73,23.11,Present,49,25.3,97.2,52,Yes
1,144,0.01,4.41,28.61,Absent,55,28.87,2.06,63,Yes
2,118,0.08,3.48,32.28,Present,52,29.14,3.81,46,No
3,170,7.5,6.41,38.03,Present,51,31.99,24.26,58,Yes
4,134,13.6,3.5,27.78,Present,60,25.99,57.34,49,Yes


In [46]:
# Standardize the data

x = df.loc[:, df.columns != 'chd'].values
x = StandardScaler().fit_transform(x)
feat_cols = ['feature'+str(i) for i in range(x.shape[1])]
normalised_south_africa = pd.DataFrame(x,columns=feat_cols)
normalised_south_africa.head()

Unnamed: 0,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9
0,1.058564,1.823073,0.478412,-0.295503,1.185854,-0.41847,-0.176786,3.277738,0.629336,2.321986
1,0.277089,-0.790237,-0.15968,0.41214,-0.843274,0.193344,0.671373,-0.612745,1.383115,-0.699866
2,-0.992806,-0.77498,-0.609245,0.884332,1.185854,-0.112563,0.735519,-0.541183,0.218184,-0.699866
3,1.546985,0.842264,0.807126,1.624141,1.185854,-0.214532,1.412621,0.295062,1.040488,2.321986
4,-0.211332,2.171805,-0.599577,0.305351,1.185854,0.703189,-0.012856,1.647775,0.42376,0.81106


In [6]:
# PCA, without the final column

n_components = 9
pca_africa = PCA(n_components=n_components)
fit_africa = pca_africa.fit_transform(x)

In [7]:
principal_africa_Df = pd.DataFrame(data = fit_africa, columns = ['principal component 1', 'principal component 2', 'principal component 3', 'principal component 4', 'principal component 5', 'principal component 6', 'principal component 7', 'principal component 8', 'principal component 9'])
principal_sum = [] # Contains the cumulative sum of the explained variance ratio
for i in range(9):
    principal_sum.append(pca_africa.explained_variance_ratio_[0:i+1].sum()) # Adds to the list the cumulative sum of the explained variance ratio

In [8]:
# Scree plot

fig = go.Figure()

# Individual explained variance
fig.add_trace(go.Scatter(x=['PC1','PC2','PC3','PC4','PC5','PC6','PC7','PC8','PC9'], y=pca_africa.explained_variance_ratio_, mode='lines+markers+text', name='Individual explained variance', text=[f'{var:.2f}' for var in pca_africa.explained_variance_ratio_], textposition='top center', textfont=dict(color='blue')))

# Total explained variance
fig.add_trace(go.Scatter(x=['PC1','PC2','PC3','PC4','PC5','PC6','PC7','PC8','PC9'], y=principal_sum, mode='lines+markers+text', name='Total explained variance', text=[f'{var:.2f}' for var in principal_sum], textposition='top center', textfont=dict(color='red')))

# Add title and names to axes
fig.update_layout(title='Scree Plot of Principal Components', xaxis_title='Principal Components', yaxis_title='Explained Variance Ratio')

fig.show()

In [9]:
loadings = pca_africa.components_.T * np.sqrt(pca_africa.explained_variance_)
loadings_df = pd.DataFrame(loadings, columns=[f'PC{i}' for i in range(1, n_components + 1)], index=df_no_famhist_no_chd.columns)
loadings_df

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9
sbp,0.550476,0.261015,-0.129887,0.191058,-0.196693,0.682202,0.218998,0.159872,-0.00592
tobacco,0.513211,0.502215,0.070762,-0.005212,0.570734,-0.142405,-0.157558,0.332097,-0.020002
ldl,0.56777,-0.398602,0.003223,-0.136507,0.221696,-0.187966,0.643902,0.047773,0.030086
adiposity,0.877862,-0.205501,-0.08501,0.131393,-0.108818,-0.114257,-0.159861,-0.117598,-0.317056
famhist,0.331803,0.001469,0.351404,-0.810948,-0.279514,0.074842,-0.104967,0.126725,-0.012058
typea,-0.031152,-0.309079,0.822645,0.204184,0.293781,0.289746,-0.06357,-0.11244,-0.016995
obesity,0.682628,-0.429262,0.04173,0.297222,-0.259366,-0.151593,-0.272325,0.230052,0.21096
alcohol,0.206454,0.594776,0.476722,0.251493,-0.383505,-0.331154,0.226116,-0.084943,0.013852
age,0.782279,0.211444,-0.140481,-0.153035,0.182985,0.077484,-0.118735,-0.469634,0.167943


In [10]:
fig = px.scatter(principal_africa_Df, x='principal component 1', y='principal component 2',color=df_clean['chd'], title='Principal Component Analysis', trendline="ols", labels={'color':'Coronary Heart Disease'})
fig.show()

In [11]:
fig = px.scatter(principal_africa_Df, x='principal component 5', y='principal component 6',color=df_clean['chd'], title='Principal Component Analysis', trendline="ols", labels={'color':'Coronary Heart Disease'})
fig.show()

A retrospective sample of males in a heart-disease high-risk region
of the Western Cape, South Africa. There are roughly two controls per
case of CHD. Many of the CHD positive men have undergone blood
pressure reduction treatment and other programs to reduce their risk
factors after their CHD event. In some cases the measurements were
made after these treatments. These data are taken from a larger
dataset, described in  Rousseauw et al, 1983, South African Medical
Journal. 

| Column | Description |
|---|---|
| sbp	|	systolic blood pressure |
tobacco	|	cumulative tobacco (kg)
ldl	|	low density lipoprotein cholesterol
adiposity |
famhist	|	family history of heart disease (Present, Absent)
typea	|	type-A behavior
obesity |
alcohol	|	current alcohol consumption
age	|	age at onset
chd	|	response, coronary heart disease

In [63]:
# Drop the 'row.names' and 'chd' columns from the df dataframe

# Standardize the data
scaler = StandardScaler()
df_clean_std = scaler.fit_transform(df)
df_clean_std = pd.DataFrame(df_clean_std, columns=df_clean.columns)

# Add the 'famhist' and 'chd' columns back to the df_clean_std dataframe
df_clean_std['famhist'] = df['famhist']
df_clean_std['chd'] = df['chd']

df_clean_std.head()

Unnamed: 0,sbp,tobacco,ldl,adiposity,famhist,typea,obesity,alcohol,age,chd
0,1.058564,1.823073,0.478412,-0.295503,1,-0.41847,-0.176786,3.277738,0.629336,1
1,0.277089,-0.790237,-0.15968,0.41214,0,0.193344,0.671373,-0.612745,1.383115,1
2,-0.992806,-0.77498,-0.609245,0.884332,1,-0.112563,0.735519,-0.541183,0.218184,0
3,1.546985,0.842264,0.807126,1.624141,1,-0.214532,1.412621,0.295062,1.040488,1
4,-0.211332,2.171805,-0.599577,0.305351,1,0.703189,-0.012856,1.647775,0.42376,1


In [64]:
#Boxplot of the standardized data
features = ['sbp', 'tobacco', 'ldl', 'adiposity', 'typea', 'obesity', 'alcohol', 'age']
fig = px.box(df_clean_std[features], title="Boxplot of the standardized data", labels={'variable':'Features', 'value':'Standardized values', 'color':'Coronary Heart Disease', 1:'yes', 0:'no'}, color=df_clean['chd'], width=1200, height=600)

fig.show()


In [14]:
# Basic statistics summary
stats_summary = df.describe().round(4)
stats_summary

Unnamed: 0,sbp,tobacco,ldl,adiposity,famhist,typea,obesity,alcohol,age,chd
count,462.0,462.0,462.0,462.0,462.0,462.0,462.0,462.0,462.0,462.0
mean,138.3268,3.6356,4.7403,25.4067,0.4156,53.1039,26.0441,17.0444,42.816,0.3463
std,20.4963,4.593,2.0709,7.7807,0.4934,9.8175,4.2137,24.4811,14.609,0.4763
min,101.0,0.0,0.98,6.74,0.0,13.0,14.7,0.0,15.0,0.0
25%,124.0,0.0525,3.2825,19.775,0.0,47.0,22.985,0.51,31.0,0.0
50%,134.0,2.0,4.34,26.115,0.0,53.0,25.805,7.51,45.0,0.0
75%,148.0,5.5,5.79,31.2275,1.0,60.0,28.4975,23.8925,55.0,1.0
max,218.0,31.2,15.33,42.49,1.0,78.0,46.58,147.19,64.0,1.0


In [15]:
# split the data into two groups, one with chd, one without
df_chd = df[df['chd'] == 1] 
df_no_chd = df[df['chd'] == 0]


In [16]:
# Basic statistics summary for the chd group
df_chd.describe().round(4)

Unnamed: 0,sbp,tobacco,ldl,adiposity,famhist,typea,obesity,alcohol,age,chd
count,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0
mean,143.7375,5.5249,5.4879,28.1202,0.6,54.4938,26.6229,19.1452,50.2938,1.0
std,23.6775,5.5651,2.2251,7.058,0.4914,10.2466,4.3913,26.1811,10.6491,0.0
min,102.0,0.0,1.55,9.39,0.0,20.0,14.7,0.0,17.0,1.0
25%,127.5,1.5,3.94,23.4625,0.0,47.75,23.635,0.475,42.75,1.0
50%,138.0,4.13,5.065,28.405,1.0,55.0,26.475,8.33,53.0,1.0
75%,158.5,8.2,6.5825,33.5875,1.0,61.0,28.78,24.5825,59.0,1.0
max,218.0,31.2,14.16,42.49,1.0,78.0,45.72,147.19,64.0,1.0


In [17]:
# Basic statistics summary for the no chd group
df_no_chd.describe().round(4)

Unnamed: 0,sbp,tobacco,ldl,adiposity,famhist,typea,obesity,alcohol,age,chd
count,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0
mean,135.4603,2.6347,4.3442,23.9691,0.3179,52.3675,25.7375,15.9314,38.8543,0.0
std,17.985,3.6121,1.8704,7.7729,0.4664,9.5176,4.0907,23.4995,14.8827,0.0
min,101.0,0.0,0.98,6.74,0.0,13.0,17.75,0.0,15.0,0.0
25%,124.0,0.0,3.0575,17.51,0.0,47.0,22.6025,0.51,27.0,0.0
50%,132.0,1.035,3.98,24.625,0.0,52.5,25.57,6.045,40.0,0.0
75%,144.0,4.2,5.2875,29.9575,1.0,59.0,28.065,22.4225,50.75,0.0
max,214.0,20.0,15.33,42.06,1.0,77.0,46.58,145.29,64.0,0.0


In [18]:
# Risk factors

# Define the risk factors
risk_factors = ['sbp', 'tobacco', 'ldl', 'famhist', 'alcohol']

# Create a new column that counts the number of risk factors for each row
df['num_risk_factors'] = df[risk_factors].gt(df[risk_factors].mean()).sum(axis=1)

# Count the number of rows for each number of risk factors
counts = df['num_risk_factors'].value_counts()

one_or_more = 0

# Print the counts
for i in df['num_risk_factors']:
    if i > 0:
        one_or_more += 1

print(f"Number of rows with one or more risk factors: {one_or_more}")

risks = [one_or_more,0,0,0,0,0]

for i in range(1, len(counts)):
    print(f"Number of rows with exactly {i} risk factors: {counts[i]}")
    risks[i] = counts[i]

df.head()
print(risks)

Number of rows with one or more risk factors: 392
Number of rows with exactly 1 risk factors: 127
Number of rows with exactly 2 risk factors: 108
Number of rows with exactly 3 risk factors: 91
Number of rows with exactly 4 risk factors: 56
Number of rows with exactly 5 risk factors: 10
[392, 127, 108, 91, 56, 10]


In [19]:
risks_percent = [round((x/462)*100,2) for x in risks]
factors = ['1 or more factors', '1 factor', '2 factors', '3 factors', '4 factors', '5 factors']

# Create a bar chart
fig = go.Figure(data=[go.Bar(x=factors, y=risks_percent)])

# Update the layout
fig.update_layout(xaxis_title='Number of Risk Factors', yaxis_title='Percentage of the dataset')

# Add number labels to the bars
fig.update_layout(annotations=[go.layout.Annotation(x=i, y=v, text=str(v), xanchor='center', yanchor='bottom', showarrow=False) for i, v in enumerate(risks_percent)])

# Show the plot
fig.show()


In [41]:
# Select the features of interest
features = ['sbp', 'tobacco', 'ldl', 'adiposity', 'typea', 'obesity', 'alcohol', 'age']

# Standardize the data
df_standardized = StandardScaler().fit_transform(df[features])

# Compute the cosine similarity between the standardized features
cosine_sim = cosine_similarity(df_standardized.T).round(2)
cosine_sim_df = pd.DataFrame(cosine_sim, columns=features, index=features)

# Create a heatmap of the cosine similarity matrix
fig = px.imshow(cosine_sim_df, color_continuous_scale='RdBu', text_auto=True)

# Update the layout of the heatmap
fig.update_layout(title='Cosine Similarity Matrix', width=800, height=800, xaxis={'title':'Features'}, yaxis={'title':'Features'})

# Show the plot
fig.show()

In [21]:

fig = px.scatter_matrix(
    df_clean_std,
    dimensions=features,
    color=df_clean['chd'],
    labels={'color':'Coronary Heart Disease','sbp':'Systolic Blood Pressure', 'tobacco':'Tobacco', 'ldl':'low density lipoprotein', 'adiposity':'Adiposity', 'typea':'Type A', 'obesity':'Obesity', 'alcohol':'Alcohol', 'age':'Age'},
    width=1300,
    height=1300,
    opacity=0.3,
)

fig.update_traces(diagonal_visible=False)
fig.update_layout(title='Scatter Matrix of Heart Disease Dataset')

fig.show()