In [158]:
import importlib

import myutils
importlib.reload(myutils)
import myutils as myutils
import evaluation
importlib.reload(evaluation)

import mypytable
importlib.reload(mypytable)
from mypytable import MyPyTable 

import myclassifiers
importlib.reload(myclassifiers)
from myclassifiers import MyNaiveBayesClassifier,\
    MyDummyClassifier

import numpy as np

# Analyzing Health Risk Factors 
### Research Leaders: Sofia Verdie, Fiona Callahan 
##### **CPSC 322, Fall 2025**

### Introduction 

* describes dataset and classification task implemented 
* briefly describe findings 

### Exploratory Data Analysis 
* info about data 
* summary stats 
* data visualizations 

### Data Preprocessing 

**Our current dataset: full of 30,000 records**
    * We will be pairing this down to 5,000 random samples during classification 

In [159]:
# load the data into file 
our_data = MyPyTable().load_from_file("input_data/healthData.csv")

length, width = our_data.get_shape()
print("Length: ", length)
print("Width: ", width)

# cleaning data: remove rows with missing values in the "Gender" column, as well as the CLass label column 
our_data.remove_rows_with_missing_values("Gender")
our_data.remove_rows_with_missing_values("Medical Condition")

# replacing other missing values with column avergae: need to do for other columns? 
our_data.replace_missing_values_with_column_average("Glucose")
our_data.replace_missing_values_with_column_average("Blood Pressure")
our_data.replace_missing_values_with_column_average("Cholesterol")


length, width = our_data.get_shape()
print("Length: ", length)
print("Width: ", width)


Length:  30000
Width:  20
Length:  21706
Width:  20


In [160]:
# summary stats
# can compute based on certain columns: EX: age, blood pressure, etc... 
# need to add more 
summary_stats = our_data.compute_summary_statistics(["Age", "LengthOfStay", "Glucose", "Cholesterol", "Blood Pressure"])
summary_stats.pretty_print()

attribute         min     max      mid        avg    median
--------------  -----  ------  -------  ---------  --------
Age             10      89      49.5     54.6841     55
LengthOfStay     1      19      10        4.41173     4
Glucose         20.32  318.51  169.415  123.735     116.78
Cholesterol     95.73  355.27  225.5    213.064     211.76
Blood Pressure  74.24  226.38  150.31   140.539     140.539


In [161]:
# reducing size of instances to 5000 for classification 

np.random.seed(0)
scaled_down_indexes = np.random.choice(21706, size=5000, replace=False)
table = []
for i in range(len(our_data.data)):
    if i in scaled_down_indexes:
        table.append(our_data.data[i])

# data set is now full of 5000 instances [scaled down] - to classify on 
print(len(table))

5000


### Classification Results 

* using dummy, naive bayes, random forest 

Keeping Attributes: Age, Gender, Blood Pressure, BMI, Length of Stay, Family History, Cholesterol, Oxygen Saturation, Physical Activity 

In [None]:
# seems to be most important attributes: but produces horrible accuracy, so scale down ! 

keep = ["Age", "Gender", "Glucose", "Blood Pressure", "BMI", "Oxygen Saturation", "LengthOfStay", "Cholesterol", "Physical Activity", "Medical Condition"]
# keep = ["Blood Pressure", "LengthOfStay", "Cholesterol", "Medical Condition"]
keep_indexes = []
for i in keep:
    column_index = our_data.column_names.index(i)
    keep_indexes.append(column_index)

# make new data table of just the indexes we want 
# new structure: the keep list [use in future as header]
new_table = [[row[j] for j in keep_indexes] for row in table]


bmi_index = keep.index("BMI")
bp_index = keep.index("Blood Pressure")
glucose_index = keep.index("Glucose")

# now need to discretize: look at utils 
for row in new_table:
    row[bmi_index] = myutils.bin_bmi(float(row[bmi_index]))
    row[bp_index]  = myutils.bin_bp(float(row[bp_index]))
    row[glucose_index] = myutils.bin_glucose(float(row[glucose_index]))

# do i need to normalize? 





# train/test split 
# making y_train and y_test 
y = myutils.make_y_col_lists(keep, "Medical Condition", new_table)
X = [row[:-1] for row in new_table]
X_train, X_test, y_train, y_test = evaluation.train_test_split(X, y)


# dummy classifier: 
dummy_clf = MyDummyClassifier()
dummy_clf.fit(X_train, y_train)


# naive bayes:
naive_bayes = MyNaiveBayesClassifier()
naive_bayes.fit(X_train, y_train)

# now need to do random forest 
#random_forest = MyRandomForestClassifier()
#random_forest.fit(X_train, y_train)

    

ValueError: 'BMI' is not in list

In [None]:
# now time to predict: 
d_accuracy, d_error_rate, y_true_dummy, y_pred_dummy = myutils.cross_val_predict(X_train, y_train, dummy_clf)
print("Dummy Classifier: accuracy = ", d_accuracy, "error rate = ", d_error_rate)

n_accuracy, n_error_rate, n_true, n_pred = myutils.cross_val_predict(X_train, y_train, naive_bayes)
print("Naive Bayes Classifier: accuracy = ", n_accuracy, "error rate = ", n_error_rate)

#r_accuracy, r_error_rate, r_true, r_pred = myutils.cross_val_predict(X_train, y_train, random_forest)
#print("Random Forest Classifier: accuracy = ", r_accuracy, "error rate = ", r_error_rate)

Dummy Classifier: accuracy =  0.2883582089552239 error rate =  0.7116417910447761
Naive Bayes Classifier: accuracy =  0.014925373134328358 error rate =  0.9850746268656716


### Conclusion 

### Acknowledgements 