<a href="https://colab.research.google.com/github/22053604/Cardiovascular-Health-Analysis/blob/main/Development_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Version Control**

# Pseudocode

....

In [1]:
#import necessary libraries
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ipywidgets as widgets

In [15]:
#Established a connection to the SQLite database
db_path = '/content/cardio_health.db'
conn = sqlite3.connect('cardiohealth.db')
cursor = conn.cursor()

#Retrieve and display table names
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()
print("Tables in the database:", tables)

Tables in the database: [('medical_examination',)]


In [36]:
#Export all tables to CSV files &  Panadas dataframe
for table_name in tables:
    df = pd.read_sql(f"SELECT * FROM {table_name[0]}", conn)
    csv_file_name = f"{table_name[0]}.csv"
    df.to_csv(csv_file_name, index=False)
    print(f"Exported {table_name[0]} to {csv_file_name}")

#Load data from medical_examnination table
query = "SELECT * FROM medical_examination"
df = pd.read_sql(query, conn)

#Display tables
df = pd.read_csv('/content/medical_examination.csv')
print(df.head())

Exported medical_examination to medical_examination.csv
   id    age  gender  height  weight  ap_hi  ap_lo  cholesterol  gluc  smoke  \
0   0  18393       2     168    62.0    110     80            1     1      0   
1   1  20228       1     156    85.0    140     90            3     1      0   
2   2  18857       1     165    64.0    130     70            3     1      0   
3   3  17623       2     169    82.0    150    100            1     1      0   
4   4  17474       1     156    56.0    100     60            1     1      0   

   alco  active  cardio  
0     0       1       0  
1     0       1       1  
2     0       0       1  
3     0       1       1  
4     0       0       0  


In [48]:
#Show the missing values
print("Missing values in dataset: ")
print(df.isnull().sum())

# Remove duplicates and handle missing values
df_cleaned = df.drop_duplicates()

#Print table again after removing duplicates
print("After removing duplicates:")
print(df_cleaned.head())

#Converting age from days to years
df['age_years'] = df['age'] / 365



Missing values in dataset: 
id              0
age             0
gender         11
height          0
weight          0
ap_hi           0
ap_lo           0
cholesterol     0
gluc            0
smoke           0
alco            0
active          0
cardio          0
age_years       0
age_group       0
dtype: int64
After removing duplicates:
   id    age  gender  height  weight  ap_hi  ap_lo        cholesterol  gluc  \
0   0  18393    male     168    62.0    110     80             Normal     1   
1   1  20228  female     156    85.0    140     90  Well Above Normal     1   
2   2  18857  female     165    64.0    130     70  Well Above Normal     1   
3   3  17623    male     169    82.0    150    100             Normal     1   
4   4  17474  female     156    56.0    100     60             Normal     1   

   smoke  alco  active  cardio  age_years age_group  
0      0     0       1       0  50.391781     50-59  
1      0     0       1       1  55.419178     50-59  
2      0     0       0   

In [73]:
#Grouping into category groups

#Define Age into bins and labels
age_bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
age_labels = ['0-9', '10-19','20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-89', '90-100']
#Create age columns
df['age_group'] = pd.cut(df['age_years'], bins=age_bins, labels=age_labels, right=False)


#Change data into understandable format

#Transform Gender codes into understandable format (1 = female, 2 = male)
df['gender'] = df['gender'].map({1: 'female', 2: 'male'})

#Transform Smoker codes into understandable format (0 = non-smoker, 1 = smoker)
df['smoking_status'] = df['smoke'].map({0: 'non-smoker', 1: 'smoker'})

#Transform Drinker codes into understandable format (0 = non-drinker, 1 = drinker)
df['alcohol_consumption'] = df['alco'].map({0: 'non-drinker', 1: 'drinker'})

#Transform Cholesterol levels into readable categories
df['cholesterol'] = df['cholesterol'].map({1: 'Normal', 2: 'Above Normal', 3: 'Well Above Normal'})

# Data Functions & Graph Visualisations

In [71]:
# Data analysis functions
def analyse_combination(df, factors):
    query_str = " & ".join([f"{key} {value}" for key, value in factors.items()])
    filtered_data = df.query(query_str)
    return filtered_data

# Example analysis: Demographic and Lifestyle Factors
factors_combination_1 = {'gender': "== 'female'", 'smoke': '== 1'}
combination_1_result = analyse_combination(df, factors_combination_1)
print("Female smokers:")
print(combination_1_result.head())

Female smokers:
Empty DataFrame
Columns: [id, age, gender, height, weight, ap_hi, ap_lo, cholesterol, gluc, smoke, alco, active, cardio, age_years, age_group, smoking_status]
Index: []


Note: 0 blue & 1 orange = should be smoker/non-smoker
Risk in percentage


# Shanto's data combinations functions

*   Used Gen AI prompt & response to fix and combine



In [72]:
#Create combination 1 (demographic: female, and lifestyle: smoker)
demographic_lifestyle_combination = df_cleaned[(df_cleaned['gender'] == 1) & (df_cleaned['smoke'] == 1)]
print(demographic_lifestyle_combination.head())

#Create combination 2 (demographic: female, and lifestyle: smoker)
age_cholesterol_combination = df_cleaned[(df_cleaned['age_years'] > 50) & (df_cleaned['cholesterol'] != 'Normal')]
print(age_cholesterol_combination.head())

age_bp_activity = df_cleaned[(df_cleaned['age_years'] > 40) & (df_cleaned['ap_hi'] > 120) & (df_cleaned['active'] == 1)]
print(age_bp_activity.head())

Empty DataFrame
Columns: [id, age, gender, height, weight, ap_hi, ap_lo, cholesterol, gluc, smoke, alco, active, cardio, age_years, age_group]
Index: []
   id    age  gender  height  weight  ap_hi  ap_lo        cholesterol  gluc  \
1   1  20228  female     156    85.0    140     90  Well Above Normal     1   
2   2  18857  female     165    64.0    130     70  Well Above Normal     1   
5   8  21914  female     151    67.0    120     80       Above Normal     2   
6   9  22113  female     157    93.0    130     80  Well Above Normal     1   
7  12  22584    male     178    95.0    130     90  Well Above Normal     3   

   smoke  alco  active  cardio  age_years age_group  
1      0     0       1       1  55.419178     50-59  
2      0     0       0       1  51.663014     50-59  
5      0     0       0       0  60.038356     60-69  
6      0     0       1       0  60.583562     60-69  
7      0     0       1       1  61.873973     60-69  
    id    age  gender  height  weight  ap_hi  ap