CSC6621 Mini Lab 2
Teacher: Dr. 
Student: Arsalon Amini
4/1/2024

In [81]:
# Utils 
import os
import pandas as pd
import numpy as np

# Sci-Kit Learn Libraries - Pre Processing 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Sci-Kit Learn Libraries - Model Fitting 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Pipelines 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [82]:
# Import Data Set 


# Data put into the root directory
data_folder = "data"

# Data set consists of multiple files 
file_names = ["processed.cleveland.data", "processed.hungarian.data", "processed.switzerland.data", "processed.longbeach.data"]

# List to store DataFrames for each file
dfs = []

# Iterate through each file
for file_name in file_names:
    # Construct the full file path
    file_path = os.path.join(data_folder, file_name)
    
    # Read the data into a DataFrame
    df = pd.read_csv(file_path, header=None)
    
    # Display head and info
    print(f"Head of {file_name}:")
    print(df.head())
    print("\nInfo of {file_name}:")
    print(df.info())
    print("\n")
    
    # Append the DataFrame to the list
    dfs.append(df)

# Combine the DataFrames into a single DataFrame
combined_df = pd.concat(dfs, ignore_index=True)

# Display head and info of the combined DataFrame
print("Head of combined DataFrame:")
print(combined_df.head())
print("\nInfo of combined DataFrame:")
print(combined_df.info())




Head of processed.cleveland.data:
     0    1    2      3      4    5    6      7    8    9    10   11   12  13
0  63.0  1.0  1.0  145.0  233.0  1.0  2.0  150.0  0.0  2.3  3.0  0.0  6.0   0
1  67.0  1.0  4.0  160.0  286.0  0.0  2.0  108.0  1.0  1.5  2.0  3.0  3.0   2
2  67.0  1.0  4.0  120.0  229.0  0.0  2.0  129.0  1.0  2.6  2.0  2.0  7.0   1
3  37.0  1.0  3.0  130.0  250.0  0.0  0.0  187.0  0.0  3.5  3.0  0.0  3.0   0
4  41.0  0.0  2.0  130.0  204.0  0.0  2.0  172.0  0.0  1.4  1.0  0.0  3.0   0

Info of {file_name}:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       303 non-null    float64
 1   1       303 non-null    float64
 2   2       303 non-null    float64
 3   3       303 non-null    float64
 4   4       303 non-null    float64
 5   5       303 non-null    float64
 6   6       303 non-null    float64
 7   7       303 non-null    float64
 

In [83]:
# Add human readible labels (Provided by supporting documentation)

column_names = {
    0: 'age',
    1: 'sex',
    2: 'chest_pain',
    3: 'rest_bps',
    4: 'cholesterol',
    5: 'fast_blood_sugar',
    6: 'rest_ecg',
    7: 'max_hr_achieved',
    8: 'excercise_induced_angina',
    9: 'excercise_induced_ecg_ST_val',
    10: 'peak_excercise_ST_slope',
    11: 'major_vessles_colored',
    12: 'thal',
    13: 'heart_disease_diagnosis'
}

# Rename columns using human readible labels
combined_df.rename(columns=column_names, inplace=True)

print(combined_df.head())
print(combined_df.info())

    age  sex  chest_pain rest_bps cholesterol fast_blood_sugar rest_ecg  \
0  63.0  1.0         1.0    145.0       233.0              1.0      2.0   
1  67.0  1.0         4.0    160.0       286.0              0.0      2.0   
2  67.0  1.0         4.0    120.0       229.0              0.0      2.0   
3  37.0  1.0         3.0    130.0       250.0              0.0      0.0   
4  41.0  0.0         2.0    130.0       204.0              0.0      2.0   

  max_hr_achieved excercise_induced_angina excercise_induced_ecg_ST_val  \
0           150.0                      0.0                          2.3   
1           108.0                      1.0                          1.5   
2           129.0                      1.0                          2.6   
3           187.0                      0.0                          3.5   
4           172.0                      0.0                          1.4   

  peak_excercise_ST_slope major_vessles_colored thal  heart_disease_diagnosis  
0                 

In [84]:
# Handle Missing Values

for column in combined_df.columns:
    # Count '?' values in the current column
    question_mark_count = (combined_df[column] == '?').sum()
    # Print the column name and its count of '?' values
    print(f"Column '{column}': {question_mark_count} '?' values")

cleaned_df = combined_df.copy()

cleaned_df.replace('?', np.nan, inplace=True) # replaces all occurances of '?' with NaN across entire

print('********************************')
print('Counts after removing ?')
for column in cleaned_df.columns:
    # Post Removal of "?"
    question_mark_count = (cleaned_df[column] == '?').sum()
    # Print the column name and its count of '?' values
    print(f"Column '{column}': {question_mark_count} '?' values")
    
    
print(cleaned_df.info())

Column 'age': 0 '?' values
Column 'sex': 0 '?' values
Column 'chest_pain': 0 '?' values
Column 'rest_bps': 59 '?' values
Column 'cholesterol': 30 '?' values
Column 'fast_blood_sugar': 90 '?' values
Column 'rest_ecg': 2 '?' values
Column 'max_hr_achieved': 55 '?' values
Column 'excercise_induced_angina': 55 '?' values
Column 'excercise_induced_ecg_ST_val': 62 '?' values
Column 'peak_excercise_ST_slope': 309 '?' values
Column 'major_vessles_colored': 611 '?' values
Column 'thal': 486 '?' values
Column 'heart_disease_diagnosis': 0 '?' values
********************************
Counts after removing ?
Column 'age': 0 '?' values
Column 'sex': 0 '?' values
Column 'chest_pain': 0 '?' values
Column 'rest_bps': 0 '?' values
Column 'cholesterol': 0 '?' values
Column 'fast_blood_sugar': 0 '?' values
Column 'rest_ecg': 0 '?' values
Column 'max_hr_achieved': 0 '?' values
Column 'excercise_induced_angina': 0 '?' values
Column 'excercise_induced_ecg_ST_val': 0 '?' values
Column 'peak_excercise_ST_slope'

In [85]:
# Exploring the target variable 

mean_heart_disease_diagnosis = cleaned_df['heart_disease_diagnosis'].mean()
mode_heart_disease_diagnosis = cleaned_df['heart_disease_diagnosis'].mode()
counts_heart_disease_diagnosis = cleaned_df['heart_disease_diagnosis'].value_counts()
print("mean of HD", mean_heart_disease_diagnosis)
print("mode of HD", mode_heart_disease_diagnosis)
print("counts of HD", counts_heart_disease_diagnosis)

# first_25_entries = cleaned_df[cleaned_df['heart_disease_diagnosis'].isin(range(5))].head(25)

# Display the first 25 entries
# print(first_25_entries)



mean of HD 0.9956521739130435
mode of HD 0    0
Name: heart_disease_diagnosis, dtype: int64
counts of HD heart_disease_diagnosis
0    411
1    265
2    109
3    107
4     28
Name: count, dtype: int64


In [86]:
# Converting target into a binary classification problem

cleaned_df['heart_disease_binary'] = cleaned_df['heart_disease_diagnosis'].apply(lambda x: 1 if x > 0 else 0)
cleaned_df.drop(columns=['heart_disease_diagnosis'], inplace=True)


counts_heart_disease_binary = cleaned_df['heart_disease_binary'].value_counts()
print("counts of HDB", counts_heart_disease_binary)

print(cleaned_df.info())

counts of HDB heart_disease_binary
1    509
0    411
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   age                           920 non-null    float64
 1   sex                           920 non-null    float64
 2   chest_pain                    920 non-null    float64
 3   rest_bps                      861 non-null    object 
 4   cholesterol                   890 non-null    object 
 5   fast_blood_sugar              830 non-null    object 
 6   rest_ecg                      918 non-null    object 
 7   max_hr_achieved               865 non-null    object 
 8   excercise_induced_angina      865 non-null    object 
 9   excercise_induced_ecg_ST_val  858 non-null    object 
 10  peak_excercise_ST_slope       611 non-null    object 
 11  major_vessles_colored         309 non-null  

In [89]:
# Data Cleaning - Handling Missing Data - Imputing NaN 

# Define numeric and categorical columns
numeric_features = ['age', 'rest_bps', 'cholesterol', 'max_hr_achieved', 'excercise_induced_ecg_ST_val']
categorical_features = ['sex','rest_ecg', 'fast_blood_sugar', 'excercise_induced_angina', 'chest_pain','excercise_induced_ecg_ST_val', 'peak_excercise_ST_slope', 'major_vessles_colored', 'thal', 'heart_disease_binary']


# Handle NaN in Numeric Columns by imputing the column mean into missing values
numeric_imputer = SimpleImputer(strategy='mean')
cleaned_df[numeric_features] = numeric_imputer.fit_transform(cleaned_df[numeric_features])


# Handle NaN in Categorical Columns by imputing the column mode 
categorical_imputer = SimpleImputer(strategy='most_frequent')
cleaned_df[categorical_features] = categorical_imputer.fit_transform(cleaned_df[categorical_features])

print(cleaned_df.info())
print(cleaned_df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   age                           920 non-null    float64
 1   sex                           920 non-null    object 
 2   chest_pain                    920 non-null    object 
 3   rest_bps                      920 non-null    float64
 4   cholesterol                   920 non-null    float64
 5   fast_blood_sugar              920 non-null    object 
 6   rest_ecg                      920 non-null    object 
 7   max_hr_achieved               920 non-null    float64
 8   excercise_induced_angina      920 non-null    object 
 9   excercise_induced_ecg_ST_val  920 non-null    object 
 10  peak_excercise_ST_slope       920 non-null    object 
 11  major_vessles_colored         920 non-null    object 
 12  thal                          920 non-null    object 
 13  heart

In [93]:
# Encoding Categorical Variables before Model Fitting 

cleaned_df = pd.get_dummies(cleaned_df, columns=['sex', 'chest_pain', 'fast_blood_sugar', 'rest_ecg', 'excercise_induced_angina', 'excercise_induced_ecg_ST_val', 'peak_excercise_ST_slope', 'major_vessles_colored', 'thal'])

# Convert heart_disease_binary to numerical values
cleaned_df['heart_disease_binary'] = cleaned_df['heart_disease_binary'].astype(int)

print(cleaned_df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 98 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   age                                              920 non-null    float64
 1   rest_bps                                         920 non-null    float64
 2   cholesterol                                      920 non-null    float64
 3   max_hr_achieved                                  920 non-null    float64
 4   heart_disease_binary                             920 non-null    int64  
 5   sex_0.0                                          920 non-null    bool   
 6   sex_1.0                                          920 non-null    bool   
 7   chest_pain_1.0                                   920 non-null    bool   
 8   chest_pain_2.0                                   920 non-null    bool   
 9   chest_pain_3.0                  

In [94]:
# Baseline Model 

# Splitting the data into features and target
X = cleaned_df.drop(columns=['heart_disease_binary'])
y = cleaned_df['heart_disease_binary']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Init and train the logistic regression classifier
classifier = LogisticRegression(max_iter=1000)  
classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test)

# Calculating accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)



Accuracy: 0.8097826086956522


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [61]:
# Selecting the columns to scale
columns_to_scale = ['rest_bps', 'cholesterol', 'excercise_induced_ecg_ST_val']

# Applying StandardScaler only to selected columns
scaler = StandardScaler()
X_scaled = encoded_df.copy()
X_scaled[columns_to_scale] = scaler.fit_transform(X_scaled[columns_to_scale])

# Splitting the data into features and target
X = X_scaled.drop(columns=['heart_disease_diagnosis'])
y = X_scaled['heart_disease_diagnosis']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the logistic regression classifier
classifier = LogisticRegression(max_iter=10000000)  
classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test)

# Calculating accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.5380434782608695


In [100]:
# Feature Engineering

df_fe = combined_df.copy()
print(df_fe.info())

# Feature 1: Interaction Term between Cholesterol and Max Heart Rate Achieved
df_fe['cholesterol_max_hr_interaction'] = df_fe['cholesterol'] * df_fe['max_hr_achieved']

# Feature 2: Ratio of Major Vessels Colored to Cholesterol
df_fe['vessels_to_cholesterol_ratio'] = df_fe['major_vessles_colored'] / df_fe['cholesterol']

# Display the engineered DataFrame
print(df_fe.head())
print(df_fe.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   age                           920 non-null    float64
 1   sex                           920 non-null    float64
 2   chest_pain                    920 non-null    float64
 3   rest_bps                      861 non-null    object 
 4   cholesterol                   890 non-null    object 
 5   fast_blood_sugar              830 non-null    object 
 6   rest_ecg                      918 non-null    object 
 7   max_hr_achieved               865 non-null    object 
 8   excercise_induced_angina      865 non-null    object 
 9   excercise_induced_ecg_ST_val  858 non-null    object 
 10  peak_excercise_ST_slope       611 non-null    object 
 11  major_vessles_colored         309 non-null    object 
 12  thal                          434 non-null    object 
 13  heart

TypeError: can't multiply sequence by non-int of type 'str'

In [80]:
# Split the data into features (X) and target (y)
X = df_fe.drop(columns=['heart_disease_diagnosis'])
y = df_fe['heart_disease_diagnosis']

# Define categorical and numerical features
numerical_features = ['cholesterol_max_hr_interaction', 'chest_pain_rest_ecg_interaction', 'vessels_to_cholesterol_ratio']

# Define preprocessing steps for numerical and categorical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Impute missing values with median
    ('scaler', StandardScaler())  # Scale numerical features
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical features
])

# Combine preprocessing steps for both numerical and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Define the logistic regression model
classifier = Pipeline(steps=[
    ('preprocessor', preprocessor),  # Preprocessing
    ('classifier', LogisticRegression(max_iter=1000))  # Logistic regression model
])

# Train the logistic regression model
classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

ValueError: Input X contains infinity or a value too large for dtype('float64').