In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')

### Loading the data

In [3]:
df = pd.read_csv("FraminghamHeartStudy.csv")

df.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


### Data Processing (Handling missing values, feature engineering, etc.)

In [5]:
# Create 'sex' column from 'male'
df['gender'] = df['male'].apply(lambda x: 'Male' if x == 1 else 'Female')

# Optional: Drop the original 'male' column 
#df.drop('male', axis=1, inplace=True)

df.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD,gender
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0,Male
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0,Female
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0,Male
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1,Female
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0,Female


In [6]:
# Initial EDA for data quality
print("Basic Data Profile:")
print(df.info())

# Summary statistics
print(df.describe())

# Identifying duplicate records and removing them if found
duplicates = df.duplicated().sum()
print(f"\nNumber of duplicate records: {duplicates}")
if duplicates > 0:
    df = df.drop_duplicates()
    print("Duplicate records removed.")

print("\nMissing Values:")
print(df.isnull().sum())


Basic Data Profile:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4240 entries, 0 to 4239
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   male             4240 non-null   int64  
 1   age              4240 non-null   int64  
 2   education        4135 non-null   float64
 3   currentSmoker    4240 non-null   int64  
 4   cigsPerDay       4211 non-null   float64
 5   BPMeds           4187 non-null   float64
 6   prevalentStroke  4240 non-null   int64  
 7   prevalentHyp     4240 non-null   int64  
 8   diabetes         4240 non-null   int64  
 9   totChol          4190 non-null   float64
 10  sysBP            4240 non-null   float64
 11  diaBP            4240 non-null   float64
 12  BMI              4221 non-null   float64
 13  heartRate        4239 non-null   float64
 14  glucose          3852 non-null   float64
 15  TenYearCHD       4240 non-null   int64  
 16  gender           4240 non-null   object 

In [7]:
# Impute 'education' with its mode
# The .mode() method returns a Series, so we select the first item with [0]
education_mode = df['education'].mode()[0]
df['education'].fillna(education_mode, inplace=True)
print(f"Filled missing 'education' values with mode: {education_mode}")

# Impute 'BPMeds' with its mode
bp_meds_mode = df['BPMeds'].mode()[0]
df['BPMeds'].fillna(bp_meds_mode, inplace=True)
print(f"Filled missing 'BPMeds' values with mode: {bp_meds_mode}")

# Create a list of the continuous columns that have missing values
continuous_vars_with_na = ['cigsPerDay', 'totChol', 'BMI', 'heartRate', 'glucose']

# Loop through the columns and fill missing values with the median of that column
for col in continuous_vars_with_na:
    col_median = df[col].median()
    df[col].fillna(col_median, inplace=True)
    print(f"Filled missing '{col}' values with median: {col_median}")

print("\nMissing Values:")
print(df.isnull().sum())

Filled missing 'education' values with mode: 1.0
Filled missing 'BPMeds' values with mode: 0.0
Filled missing 'cigsPerDay' values with median: 0.0
Filled missing 'totChol' values with median: 234.0
Filled missing 'BMI' values with median: 25.4
Filled missing 'heartRate' values with median: 75.0
Filled missing 'glucose' values with median: 78.0

Missing Values:
male               0
age                0
education          0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
gender             0
dtype: int64


In [8]:
# Define a function to classify blood pressure
def classify_bp(row):
    # Using standard clinical guidelines
    if row['sysBP'] < 120 and row['diaBP'] < 80:
        return 'Normal'
    elif 120 <= row['sysBP'] < 130 and row['diaBP'] < 80:
        return 'Elevated'
    elif 130 <= row['sysBP'] < 140 or 80 <= row['diaBP'] < 90:
        return 'Hypertension Stage 1'
    elif row['sysBP'] >= 140 or row['diaBP'] >= 90:
        return 'Hypertension Stage 2'
    else:
        return 'Other' # Catch any edge cases

# Apply the function to each row of the dataframe to create the new column
df['BP_Category'] = df.apply(classify_bp, axis=1)

print("\n--- Value Counts for New 'BP_Category' Feature ---")
print(df['BP_Category'].value_counts())


--- Value Counts for New 'BP_Category' Feature ---
BP_Category
Hypertension Stage 1    1805
Normal                  1033
Hypertension Stage 2     992
Elevated                 410
Name: count, dtype: int64


### Saving cleaning data

In [10]:
# Save the cleaned dataframe to a new file
df.to_csv('framingham_cleaned.csv', index=False)

print("\nSuccessfully cleaned the dataset and saved it to 'framingham_cleaned.csv'")


Successfully cleaned the dataset and saved it to 'framingham_cleaned.csv'


### Predictive Analysis: Baseline Logistic Regression Model

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, make_scorer
from sklearn.model_selection import cross_val_score, StratifiedKFold


features = ['age', 'cigsPerDay', 'totChol', 'BMI', 'male', 'currentSmoker', 'sysBP', 'diaBP']
X = df[features]
y = df['TenYearCHD']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize and train the Logistic Regression model
# use class_weight='balanced' because the dataset is imbalanced
# Define model
log_reg = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)

# Define Stratified K-Fold 
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Accuracy
acc_scores = cross_val_score(log_reg, X, y, cv=cv, scoring='accuracy')
print("Cross-validated Accuracy Scores:", acc_scores)
print("Mean Accuracy:", acc_scores.mean())

# AUC 
auc_scores = cross_val_score(log_reg, X, y, cv=cv, scoring='roc_auc')
print("Cross-validated AUC Scores:", auc_scores)
print("Mean AUC:", auc_scores.mean())


Cross-validated Accuracy Scores: [0.6745283  0.6509434  0.66745283 0.68042453 0.64504717]
Mean Accuracy: 0.6636792452830189
Cross-validated AUC Scores: [0.70602214 0.72494097 0.75039622 0.74569546 0.67493612]
Mean AUC: 0.7203981824067262


### Predictive Analysis: Random Forest Model

In [14]:
from sklearn.ensemble import RandomForestClassifier

# Initialize Random Forest
rf_clf = RandomForestClassifier(
    n_estimators=100, 
    class_weight='balanced', 
    random_state=42
)

# Fit
rf_clf.fit(X_train, y_train)

# Predict
y_pred_rf = rf_clf.predict(X_test)

# Evaluate
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))

# Feature importance
importances = pd.Series(rf_clf.feature_importances_, index=features)
print("\nFeature Importances:\n", importances.sort_values(ascending=False))


Random Forest Accuracy: 0.8455188679245284

Confusion Matrix:
 [[710   9]
 [122   7]]

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.99      0.92       719
           1       0.44      0.05      0.10       129

    accuracy                           0.85       848
   macro avg       0.65      0.52      0.51       848
weighted avg       0.79      0.85      0.79       848


Feature Importances:
 age              0.202729
sysBP            0.189440
BMI              0.176088
totChol          0.173931
diaBP            0.157330
cigsPerDay       0.063535
male             0.022856
currentSmoker    0.014091
dtype: float64
