1. Run "pip install kaggle" 

2. Move "kaggle.json" to the following path: ~/.kaggle (for example /Users/linh/.kaggle)

Importing relevant libraries

In [1]:
import pandas as pd 
from cleaning import *  # Import all functions from cleaning
import zipfile
import kaggle
from kaggle.api.kaggle_api_extended import KaggleApi  # Import Kaggle Api
import requests
from bs4 import BeautifulSoup
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Initialize the Kaggle API client and authenticate
# (uses the kaggle.json credentials set up in the steps at the top of the notebook)
api = KaggleApi()
api.authenticate()

# Download the dataset via the API and unzip it.
# No download path is passed, so the API's default location is used; the read_csv
# call below expects the extracted CSV in the current working directory.
api.dataset_download_files(dataset="alexteboul/heart-disease-health-indicators-dataset", unzip=True)

# Load both datasets into DataFrames
usa_df = pd.read_csv("heart_disease_health_indicators_BRFSS2015.csv")
india_df = pd.read_csv("./Data/CVD_india_data_raw.csv")

Scrape reference (normal-range) tables from Wikipedia

In [3]:
# Blood pressure: scrape the classification table from Wikipedia and derive the
# threshold below which a reading counts as "Non-elevated".
url = "https://en.wikipedia.org/wiki/Blood_pressure"
# A timeout keeps the cell from hanging forever; raise_for_status() stops us from
# silently parsing an HTTP error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# First table on the page carrying the "wikitable" class
table = soup.find("table", class_="wikitable")
if table is None:
    raise ValueError(f"No 'wikitable' table found at {url}; the page layout may have changed")

categories = []
office_data = []
relevant_data = False

# Collect only the rows between the "European Society of Cardiology" header row
# and the "European Society of Hypertension" header row.
for row in table.find_all('tr'):
    header_cells = row.find_all('th')
    if header_cells and "European Society of Cardiology" in header_cells[0].get_text():
        relevant_data = True
        continue
    if header_cells and "European Society of Hypertension" in header_cells[0].get_text():
        relevant_data = False
        break
    if relevant_data:
        data_cells = row.find_all('td')
        if len(data_cells) > 1:  # need at least a category cell and an office-value cell
            categories.append(data_cells[0].get_text(strip=True))
            office_data.append(data_cells[1].get_text(strip=True))

bp_df = pd.DataFrame({'Category': categories, 'Office': office_data})

# The threshold is the number following "<" in the 'Non-elevated' row's Office cell
non_elevated = bp_df.loc[bp_df['Category'] == 'Non-elevated', 'Office']
if non_elevated.empty:
    raise ValueError("'Non-elevated' row not found in the scraped blood pressure table")
bp_thresh = int(non_elevated.iloc[0].split("<")[1])
bp_thresh

120

In [4]:
# Cholesterol: scrape the first "wikitable" from Wikipedia and derive the
# total-cholesterol threshold from it.
url = "https://en.wikipedia.org/wiki/Cholesterol"
# A timeout keeps the cell from hanging forever; raise_for_status() stops us from
# silently parsing an HTTP error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

table = soup.find("table", class_="wikitable")
if table is None:
    raise ValueError(f"No 'wikitable' table found at {url}; the page layout may have changed")
rows = table.find("tbody").find_all('tr')

chol_data = []
selected_rows = [1, 2, 3, 4]

for i in selected_rows:
    # Row 1 is read from its first <th> cell, the remaining rows from their first <td> cell
    cell_tag = 'th' if i == 1 else 'td'
    chol_data.append(rows[i].find(cell_tag).get_text(strip=True))

# chol_data[1] is the first cell of table row 2; strip the leading "<" and spaces
# to obtain the numeric threshold
chol_thresh = int(chol_data[1].strip("< "))
chol_thresh

200

Creating new binary columns for blood pressure and cholesterol in India data which match the US data

In [6]:
# Binary flags: 1 when the measurement is at or above the scraped threshold, else 0.
# NOTE(review): bp_thresh is parsed from the 'Non-elevated' row of the scraped table,
# so every reading at or above that bound is flagged as high — confirm this matches
# the definition behind the US HighBP column.
india_df["HighBP"] = (india_df["restingBP"] >= bp_thresh).astype(int)
india_df["HighChol"] = (india_df["serumcholestrol"] >= chol_thresh).astype(int)

[1 0]
[0 1]


Cleaning and Shaping Data

In [None]:
# Run the data-quality helpers on both frames before any reshaping.
# check_nan / check_unique are not defined in this notebook; they presumably come from
# the star import of the local `cleaning` module and report missing values and
# unique values per column — TODO confirm against cleaning.py.
check_nan(india_df)
check_unique(india_df)
check_nan(usa_df)
check_unique(usa_df)

In [7]:
# Lower-case every column label so shared columns carry identical names in both frames
for frame in (usa_df, india_df):
    frame.columns = frame.columns.str.lower()

In [8]:
# Align dtypes: every USA column is cast to int (a failed cast raises)
usa_df = usa_df.astype(int)


def _as_int_where_possible(column):
    """Cast one column to int; errors="ignore" returns the column unchanged if the cast fails."""
    return column.astype(int, errors="ignore")


# India columns are cast one at a time so a single non-convertible column does not
# block the others
india_df = india_df.apply(_as_int_where_possible)

In [9]:
# Consolidate column names so both frames use the same labels for the same concept

# India: the outcome column "target" becomes "cvd"
india_df = india_df.rename(columns={"target": "cvd"})

# USA: "sex" becomes "gender" and "heartdiseaseorattack" becomes "cvd"
usa_df = usa_df.rename(columns={"sex": "gender", "heartdiseaseorattack": "cvd"})

In [10]:
# Assign readable gender labels (int -> object): trades a little performance for readability.
# Values missing from the mapping become NaN, so re-running this cell on already
# mapped data would blank the column.
gender = {1: "m", 0: "f"}
for frame in (india_df, usa_df):
    frame["gender"] = frame["gender"].map(gender)

In [11]:
# Tag every row with its country of origin so the samples stay distinguishable after merging
for frame, country_label in ((india_df, 'india'), (usa_df, 'usa')):
    frame['country'] = country_label

In [1]:
# Concantenate the DataFrames
merged = pd.concat([india_df, usa_df], axis=0, ignore_index=True)
# find common cols
common_columns = india_df.columns.intersection(usa_df.columns)
# drop all columns except those with data in both samples: age, gender, heartdiseaseorattack, highbp, highchol
merged = merged[common_columns]

# cetegorize india age
merged.loc[merged['country'] == 'india', 'age'] = pd.cut(merged.loc[merged['country'] == 'india', 'age'],
                                                               bins=range(18, 85, 5),
                                                               labels=range(1, 14),
                                                               right=True).astype(int)

grouped = (merged.groupby("country")["age"].value_counts(normalize=True).sort_index() * 100).round()

# Print the percentage of each age value for India and USA separately
# print(grouped)
print(merged.head())

# save
merged.to_csv("./Data/merged_data.csv")

NameError: name 'pd' is not defined

Data Analysis

Analysis on combined data

In [13]:
# H1: The prevalence of high blood pressure is higher in the USA compared to India
# H2: The prevalence of high cholesterol is higher in the USA compared to India
# H3: The prevalence of heart disease is higher in the USA compared to India

# The mean of a 0/1 column is the share of positives; times 100 gives percent
conditions = ["highbp", "highchol", "cvd"]
grouped = merged.groupby("country")[conditions].mean() * 100

print("Prevalence by Country:")
print(grouped)
print()

# Each hypothesis compares the USA prevalence against the India prevalence for one column
hypotheses = [
    ("H1: The prevalence of high blood pressure is higher in the US compared to India:", "highbp"),
    ("H2: The prevalence of high cholesterol is higher in the US compared to India:", "highchol"),
    ("H3: The prevalence of CVD is higher in the US compared to India:", "cvd"),
]
for statement, column in hypotheses:
    print(statement, grouped.loc["usa", column] > grouped.loc["india", column])

Prevalence by Country:
           highbp   highchol        cvd
country                                
india    89.40000  82.400000  58.000000
usa      42.90011  42.412094   9.418559

H1: The prevalence of high blood pressure is higher in the US compared to India: False
H2: The prevalence of high cholesterol is higher in the US compared to India: False
H3: The prevalence of CVD is higher in the US compared to India: False


Analysis on US sample

In [14]:
# H4: The prevalence of cvd is higher among smokers than non-smokers

# Split the US sample on the binary smoker flag
smoker = usa_df.loc[usa_df["smoker"].eq(1)]
non_smoker = usa_df.loc[usa_df["smoker"].eq(0)]

# Share of individuals with heart disease in each group, in percent
cvd_smoker = 100 * smoker["cvd"].mean()
cvd_non_smoker = 100 * non_smoker["cvd"].mean()

print(f"CVD prevalence among smokers: {cvd_smoker:.2f}%")
print(f"CVD prevalence among non-smokers: {cvd_non_smoker:.2f}%")
print("H4: The prevalence of CVD is higher among smokers than non-smokers:", cvd_smoker > cvd_non_smoker)

CVD prevalence among smokers: 13.17%
CVD prevalence among non-smokers: 6.44%
H4: The prevalence of CVD is higher among smokers than non-smokers: True


In [15]:
# H5: The prevalence of cvd is higher among those with high BP

# Split the US sample on the binary high-blood-pressure flag
high_bp = usa_df.loc[usa_df["highbp"].eq(1)]
no_high_bp = usa_df.loc[usa_df["highbp"].eq(0)]

# Share of individuals with heart disease in each group, in percent
heart_disease_high_bp = 100 * high_bp["cvd"].mean()
heart_disease_no_high_bp = 100 * no_high_bp["cvd"].mean()

print(f"Percentage with heart disease among those with high BP: {heart_disease_high_bp:.2f}%")
print(f"Percentage with heart disease among those without high BP: {heart_disease_no_high_bp:.2f}%")
print("H5: The prevalence of CVD is higher among individuals with high BP:", heart_disease_high_bp > heart_disease_no_high_bp)

Percentage with heart disease among those with high BP: 16.47%
Percentage with heart disease among those without high BP: 4.12%
H5: The prevalence of CVD is higher among individuals with high BP: True


In [16]:
# H6: The prevalence of cvd is higher among those with high chol

Visualizing the Data

In [17]:
# Defining relevant Plots