## Exploratory Data Analysis

### Import Libraries

In [12]:
import sys
import polars as pl
import polars.selectors as pol_sel

import pandas as pd

### Show Python & Library Versions

In [13]:
l = 8
r = 12

print("Python".rjust(l), ":", sys.version[0:6].ljust(r))
print("Polars".rjust(l), ":", pl.__version__.ljust(r))

  Python : 3.11.4      
  Polars : 1.12.0      


## Exploratory Data Analysis with Polars 

### Load CSV File into Polars DataFrame

In [14]:
df = pl.read_csv("data/dataset.csv",
                 ignore_errors=True)

df

Age,Gender,Smoking,Alcohol_Consumption,Physical_Activity_Level,BMI,Diabetes,Hypertension,Cholesterol_Level,Resting_BP,Heart_Rate,Family_History,Stress_Level,Chest_Pain_Type,Thalassemia,Fasting_Blood_Sugar,ECG_Results,Exercise_Induced_Angina,Max_Heart_Rate_Achieved,Heart_Attack_Risk
i64,str,i64,i64,str,f64,i64,i64,f64,i64,i64,i64,str,str,str,i64,str,i64,i64,str
69,"""Female""",1,0,"""Moderate""",34.61,1,0,152.1,171,85,0,"""Moderate""","""Non-anginal""","""Reversible defect""",0,"""Normal""",0,114,"""Low"""
32,"""Male""",0,0,"""Moderate""",22.75,0,0,166.8,126,103,0,"""Low""","""Asymptomatic""","""Normal""",0,"""ST-T abnormality""",0,173,"""Moderate"""
89,"""Male""",0,1,"""Moderate""",35.32,0,0,272.3,123,127,0,"""Low""","""Typical""","""Reversible defect""",0,"""ST-T abnormality""",0,109,"""Low"""
78,"""Male""",0,1,"""Moderate""",18.23,1,0,237.7,144,125,0,"""Low""","""Typical""","""Fixed defect""",1,"""Left Ventricular Hypertrophy""",0,129,"""Low"""
38,"""Female""",1,0,"""Moderate""",19.82,0,0,207.7,123,107,0,"""High""","""Asymptomatic""","""Reversible defect""",0,"""ST-T abnormality""",0,124,"""Moderate"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
21,"""Male""",0,0,"""Low""",39.93,0,0,269.9,171,113,0,"""High""","""Typical""","""Reversible defect""",0,"""ST-T abnormality""",0,122,"""High"""
35,"""Female""",0,0,"""Low""",18.1,0,0,235.8,146,71,0,"""Moderate""","""Non-anginal""","""Fixed defect""",1,"""ST-T abnormality""",0,121,"""Moderate"""
46,"""Male""",0,1,"""High""",21.42,0,0,172.8,146,85,1,"""Low""","""Typical""","""Fixed defect""",0,"""Left Ventricular Hypertrophy""",0,125,"""Low"""
56,"""Male""",0,1,"""Low""",29.93,0,0,244.1,151,110,0,"""Low""","""Asymptomatic""","""Reversible defect""",0,"""Normal""",0,149,"""Moderate"""


### Display First Few Rows to Understand Structure of Data

In [15]:
print(df.head())

shape: (5, 20)
┌─────┬────────┬─────────┬─────────────┬───┬─────────────┬─────────────┬─────────────┬─────────────┐
│ Age ┆ Gender ┆ Smoking ┆ Alcohol_Con ┆ … ┆ ECG_Results ┆ Exercise_In ┆ Max_Heart_R ┆ Heart_Attac │
│ --- ┆ ---    ┆ ---     ┆ sumption    ┆   ┆ ---         ┆ duced_Angin ┆ ate_Achieve ┆ k_Risk      │
│ i64 ┆ str    ┆ i64     ┆ ---         ┆   ┆ str         ┆ a           ┆ d           ┆ ---         │
│     ┆        ┆         ┆ i64         ┆   ┆             ┆ ---         ┆ ---         ┆ str         │
│     ┆        ┆         ┆             ┆   ┆             ┆ i64         ┆ i64         ┆             │
╞═════╪════════╪═════════╪═════════════╪═══╪═════════════╪═════════════╪═════════════╪═════════════╡
│ 69  ┆ Female ┆ 1       ┆ 0           ┆ … ┆ Normal      ┆ 0           ┆ 114         ┆ Low         │
│ 32  ┆ Male   ┆ 0       ┆ 0           ┆ … ┆ ST-T        ┆ 0           ┆ 173         ┆ Moderate    │
│     ┆        ┆         ┆             ┆   ┆ abnormality ┆             ┆    

### Retrieve Basic Information About DataFrame

In [16]:
print(df.shape)
print(df.dtypes)

(50000, 20)
[Int64, String, Int64, Int64, String, Float64, Int64, Int64, Float64, Int64, Int64, Int64, String, String, String, Int64, String, Int64, Int64, String]


### Display Summary Statistics for All Columns

In [17]:
summary = df.describe()
print(summary)

shape: (9, 21)
┌────────────┬───────────┬────────┬──────────┬───┬────────────┬────────────┬───────────┬───────────┐
│ statistic  ┆ Age       ┆ Gender ┆ Smoking  ┆ … ┆ ECG_Result ┆ Exercise_I ┆ Max_Heart ┆ Heart_Att │
│ ---        ┆ ---       ┆ ---    ┆ ---      ┆   ┆ s          ┆ nduced_Ang ┆ _Rate_Ach ┆ ack_Risk  │
│ str        ┆ f64       ┆ str    ┆ f64      ┆   ┆ ---        ┆ ina        ┆ ieved     ┆ ---       │
│            ┆           ┆        ┆          ┆   ┆ str        ┆ ---        ┆ ---       ┆ str       │
│            ┆           ┆        ┆          ┆   ┆            ┆ f64        ┆ f64       ┆           │
╞════════════╪═══════════╪════════╪══════════╪═══╪════════════╪════════════╪═══════════╪═══════════╡
│ count      ┆ 50000.0   ┆ 50000  ┆ 50000.0  ┆ … ┆ 50000      ┆ 50000.0    ┆ 50000.0   ┆ 50000     │
│ null_count ┆ 0.0       ┆ 0      ┆ 0.0      ┆ … ┆ 0          ┆ 0.0        ┆ 0.0       ┆ 0         │
│ mean       ┆ 53.3987   ┆ null   ┆ 0.29842  ┆ … ┆ null       ┆ 0.20164    ┆

### Find Longest Text Length in Each Column

In [18]:
# Create an empty list to store max lengths for each string column
longest_text_lengths = []

# Loop through the columns to check for string columns
string_columns = [col for col in df.columns if df[col].dtype == pl.Utf8]

max_lengths = {}
for col in string_columns:
    max_length = df.select(pl.col(col).str.len_chars().max()).to_numpy()[0, 0]
    max_lengths[col] = max_length

df_max_lengths = pl.DataFrame(max_lengths)

df_max_lengths

Gender,Physical_Activity_Level,Stress_Level,Chest_Pain_Type,Thalassemia,ECG_Results,Heart_Attack_Risk
u32,u32,u32,u32,u32,u32,u32
6,8,8,12,17,28,8


### Retrieve Data Types of All Columns

In [19]:
print("Column data types:\n", df.dtypes)

Column data types:
 [Int64, String, Int64, Int64, String, Float64, Int64, Int64, Float64, Int64, Int64, Int64, String, String, String, Int64, String, Int64, Int64, String]


### Count Unique Values in Each Column

In [20]:
all_columns = [col for col in df.columns]

for col in all_columns:
    unique_counts = df[col].n_unique()
    print(f"Unique values in {col} :".rjust(48), f"{unique_counts}".ljust(6))

                          Unique values in Age : 72    
                       Unique values in Gender : 2     
                      Unique values in Smoking : 2     
          Unique values in Alcohol_Consumption : 2     
      Unique values in Physical_Activity_Level : 3     
                          Unique values in BMI : 2501  
                     Unique values in Diabetes : 2     
                 Unique values in Hypertension : 2     
            Unique values in Cholesterol_Level : 1501  
                   Unique values in Resting_BP : 90    
                   Unique values in Heart_Rate : 70    
               Unique values in Family_History : 2     
                 Unique values in Stress_Level : 3     
              Unique values in Chest_Pain_Type : 4     
                  Unique values in Thalassemia : 3     
          Unique values in Fasting_Blood_Sugar : 2     
                  Unique values in ECG_Results : 3     
      Unique values in Exercise_Induced_Angina :

### Check Distribution of Numerical Columns

In [21]:
numerical_cols = [item for item in all_columns if item not in string_columns]
numerical_cols = [item for item in numerical_cols if item not in ['id']]

numerical_cols

for col in numerical_cols:
    distribution = df.select(col).describe()
    print(col)
    print(distribution, '\n\n')

Age
shape: (9, 2)
┌────────────┬───────────┐
│ statistic  ┆ Age       │
│ ---        ┆ ---       │
│ str        ┆ f64       │
╞════════════╪═══════════╡
│ count      ┆ 50000.0   │
│ null_count ┆ 0.0       │
│ mean       ┆ 53.3987   │
│ std        ┆ 20.799006 │
│ min        ┆ 18.0      │
│ 25%        ┆ 35.0      │
│ 50%        ┆ 53.0      │
│ 75%        ┆ 71.0      │
│ max        ┆ 89.0      │
└────────────┴───────────┘ 


Smoking
shape: (9, 2)
┌────────────┬──────────┐
│ statistic  ┆ Smoking  │
│ ---        ┆ ---      │
│ str        ┆ f64      │
╞════════════╪══════════╡
│ count      ┆ 50000.0  │
│ null_count ┆ 0.0      │
│ mean       ┆ 0.29842  │
│ std        ┆ 0.457569 │
│ min        ┆ 0.0      │
│ 25%        ┆ 0.0      │
│ 50%        ┆ 0.0      │
│ 75%        ┆ 1.0      │
│ max        ┆ 1.0      │
└────────────┴──────────┘ 


Alcohol_Consumption
shape: (9, 2)
┌────────────┬─────────────────────┐
│ statistic  ┆ Alcohol_Consumption │
│ ---        ┆ ---                 │
│ str        ┆

## Time for the Pandas Work

### Ingest Dataset into Pandas Dataframe

In [22]:
df = pd.read_csv("data/dataset.csv")

df

Unnamed: 0,Age,Gender,Smoking,Alcohol_Consumption,Physical_Activity_Level,BMI,Diabetes,Hypertension,Cholesterol_Level,Resting_BP,Heart_Rate,Family_History,Stress_Level,Chest_Pain_Type,Thalassemia,Fasting_Blood_Sugar,ECG_Results,Exercise_Induced_Angina,Max_Heart_Rate_Achieved,Heart_Attack_Risk
0,69,Female,1,0,Moderate,34.61,1,0,152.1,171,85,0,Moderate,Non-anginal,Reversible defect,0,Normal,0,114,Low
1,32,Male,0,0,Moderate,22.75,0,0,166.8,126,103,0,Low,Asymptomatic,Normal,0,ST-T abnormality,0,173,Moderate
2,89,Male,0,1,Moderate,35.32,0,0,272.3,123,127,0,Low,Typical,Reversible defect,0,ST-T abnormality,0,109,Low
3,78,Male,0,1,Moderate,18.23,1,0,237.7,144,125,0,Low,Typical,Fixed defect,1,Left Ventricular Hypertrophy,0,129,Low
4,38,Female,1,0,Moderate,19.82,0,0,207.7,123,107,0,High,Asymptomatic,Reversible defect,0,ST-T abnormality,0,124,Moderate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,21,Male,0,0,Low,39.93,0,0,269.9,171,113,0,High,Typical,Reversible defect,0,ST-T abnormality,0,122,High
49996,35,Female,0,0,Low,18.10,0,0,235.8,146,71,0,Moderate,Non-anginal,Fixed defect,1,ST-T abnormality,0,121,Moderate
49997,46,Male,0,1,High,21.42,0,0,172.8,146,85,1,Low,Typical,Fixed defect,0,Left Ventricular Hypertrophy,0,125,Low
49998,56,Male,0,1,Low,29.93,0,0,244.1,151,110,0,Low,Asymptomatic,Reversible defect,0,Normal,0,149,Moderate


### Retrieve Dictionary of Column Names & Data Types

In [23]:
df.dtypes.apply(str).to_dict()

{'Age': 'int64',
 'Gender': 'object',
 'Smoking': 'int64',
 'Alcohol_Consumption': 'int64',
 'Physical_Activity_Level': 'object',
 'BMI': 'float64',
 'Diabetes': 'int64',
 'Hypertension': 'int64',
 'Cholesterol_Level': 'float64',
 'Resting_BP': 'int64',
 'Heart_Rate': 'int64',
 'Family_History': 'int64',
 'Stress_Level': 'object',
 'Chest_Pain_Type': 'object',
 'Thalassemia': 'object',
 'Fasting_Blood_Sugar': 'int64',
 'ECG_Results': 'object',
 'Exercise_Induced_Angina': 'int64',
 'Max_Heart_Rate_Achieved': 'int64',
 'Heart_Attack_Risk': 'object'}

### Retrieve Longest String Value For Non-Numeric Columns

In [24]:
def longest_string_length(df):
    # Select non-numeric columns
    non_numeric_cols = df.select_dtypes(exclude=['number', 'bool'])
    # Calculate the length of the longest string for each column
    return non_numeric_cols.apply(lambda col: col.dropna().astype(str).map(len).max())

# Get the result
longest_lengths = longest_string_length(df)

print(longest_lengths)

Gender                      6
Physical_Activity_Level     8
Stress_Level                8
Chest_Pain_Type            12
Thalassemia                17
ECG_Results                28
Heart_Attack_Risk           8
dtype: int64


### Function To Convert Pandas Dataframe to Postgres INSERT Statements

In [25]:
def convert_df_to_postgres_insert(df, table_name):
    """
    Converts a pandas DataFrame into PostgreSQL INSERT statements.

    Args:
        df (pd.DataFrame): The DataFrame to convert.
        table_name (str): The name of the Postgres table.

    Returns:
        str: The generated INSERT statements as a single string.
    """
    # Escape single quotes in string values
    def escape_value(value):
        if isinstance(value, str):
            return value.replace("'", "''")
        return value

    # Generate INSERT statements
    insert_statements = []
    for _, row in df.iterrows():
        values = []
        for value in row:
            if pd.isnull(value):  # Handle NULL values
                values.append("NULL")
            elif isinstance(value, str):
                values.append(f"'{escape_value(value)}'")
            elif isinstance(value, (int, float)):
                values.append(str(value))
            elif isinstance(value, bool):
                values.append('TRUE' if value else 'FALSE')
            else:
                raise ValueError(f"Unsupported data type: {type(value)}")
        insert_statements.append(f"INSERT INTO {table_name} VALUES ({', '.join(values)});")
    
    return "\n".join(insert_statements)

### Function To Save INSERT Statements to File

In [26]:
def write_insert_statements_to_file(insert_sql, file_path):
    """
    Writes the given INSERT statements to a specified file.

    Args:
        insert_sql (str): The INSERT statements as a string.
        file_path (str): The path of the file to write to.
    """
    try:
        with open(file_path, 'w') as file:
            file.write(insert_sql)
        print(f"INSERT statements written to {file_path}")
    except Exception as e:
        print(f"An error occurred while writing to the file: {e}")

### Convert DataFrame to INSERT statements

In [27]:
table_name = "ha_risk_pg_table"
insert_sql = convert_df_to_postgres_insert(df, table_name)
print(insert_sql)

INSERT INTO ha_risk_pg_table VALUES (69, 'Female', 1, 0, 'Moderate', 34.61, 1, 0, 152.1, 171, 85, 0, 'Moderate', 'Non-anginal', 'Reversible defect', 0, 'Normal', 0, 114, 'Low');
INSERT INTO ha_risk_pg_table VALUES (32, 'Male', 0, 0, 'Moderate', 22.75, 0, 0, 166.8, 126, 103, 0, 'Low', 'Asymptomatic', 'Normal', 0, 'ST-T abnormality', 0, 173, 'Moderate');
INSERT INTO ha_risk_pg_table VALUES (89, 'Male', 0, 1, 'Moderate', 35.32, 0, 0, 272.3, 123, 127, 0, 'Low', 'Typical', 'Reversible defect', 0, 'ST-T abnormality', 0, 109, 'Low');
INSERT INTO ha_risk_pg_table VALUES (78, 'Male', 0, 1, 'Moderate', 18.23, 1, 0, 237.7, 144, 125, 0, 'Low', 'Typical', 'Fixed defect', 1, 'Left Ventricular Hypertrophy', 0, 129, 'Low');
INSERT INTO ha_risk_pg_table VALUES (38, 'Female', 1, 0, 'Moderate', 19.82, 0, 0, 207.7, 123, 107, 0, 'High', 'Asymptomatic', 'Reversible defect', 0, 'ST-T abnormality', 0, 124, 'Moderate');
INSERT INTO ha_risk_pg_table VALUES (41, 'Male', 0, 1, 'Moderate', 36.11, 0, 0, 271.2, 141,

### Write INSERT Statements to File

In [28]:
file_path = "./postgres/3_insert_statements.sql"
write_insert_statements_to_file(insert_sql, file_path)

INSERT statements written to ./postgres/3_insert_statements.sql
