In [1]:
#Import library
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib 

In [2]:
# Locations of the three raw CSV files (relative paths).
stroke_csv = "data/stroke_data.csv"
diabetes_csv = "data/diabetes_data.csv"
heart_attack_csv = "data/heart_attack_data.csv"

# Load each raw dataset into its own DataFrame.
data_stroke = pd.read_csv(stroke_csv)
data_diabete = pd.read_csv(diabetes_csv)
data_heart_attack = pd.read_csv(heart_attack_csv)

In [3]:
# Keep only the columns used for modelling. In every frame the target is the
# LAST column, which split_data() relies on.
# .copy() makes each subset an independent DataFrame: the later in-place steps
# (BMI imputation, label encoding) then write to these frames directly instead
# of to something pandas regards as a slice of the raw data, which can trigger
# SettingWithCopyWarning on pandas versions without Copy-on-Write.

# Stroke columns selection (target: 'stroke')
data_stroke = data_stroke[[
    'age', 'gender', 'hypertension', 'heart_disease', 'ever_married',
    'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
    'smoking_status', 'stroke',
]].copy()

# Diabetes columns selection (target: 'Outcome')
data_diabete = data_diabete[[
    'Age', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI',
    'Outcome',
]].copy()

# Heart attack columns selection (target: 'Heart Attack Risk')
data_heart_attack = data_heart_attack[[
    'Age', 'Sex', 'Cholesterol', 'Heart Rate', 'Diabetes', 'Smoking',
    'Obesity', 'Alcohol Consumption', 'Diet', 'Heart Attack Risk',
]].copy()

In [4]:
# Print out the information for each dataset
# `datasets` pairs a display name with each DataFrame; later cells iterate it.
datasets = [
    ('Stroke Data', data_stroke),
    ('Diabetes Data', data_diabete),
    ('Heart Attack Data', data_heart_attack)
]

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() emitted a stray "None" line after
# every report.
for name, data in datasets:
    print("Info for", name)
    data.info()
    print()

Info for Stroke Data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   age                5110 non-null   float64
 1   gender             5110 non-null   object 
 2   hypertension       5110 non-null   int64  
 3   heart_disease      5110 non-null   int64  
 4   ever_married       5110 non-null   object 
 5   work_type          5110 non-null   object 
 6   Residence_type     5110 non-null   object 
 7   avg_glucose_level  5110 non-null   float64
 8   bmi                4909 non-null   float64
 9   smoking_status     5110 non-null   object 
 10  stroke             5110 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 439.3+ KB
None

Info for Diabetes Data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  --

In [5]:
# Report how many NaN values each column of each dataset contains
def print_nan_counts(datasets):
    """Print the per-column NaN count of every dataset.

    Args:
        datasets: sequence of ``(name, DataFrame)`` pairs.

    For each pair a header line with the name is printed, followed by the
    result of ``DataFrame.isna().sum()`` and a blank line.
    """
    for dataset_name, dataset in datasets:
        print(f"NaN counts for {dataset_name}:")
        nan_per_column = dataset.isna().sum()
        print(nan_per_column)
        print()
        
# Run the missing-value report over every (name, DataFrame) pair in `datasets`.
print_nan_counts(datasets)

NaN counts for Stroke Data:
age                    0
gender                 0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

NaN counts for Diabetes Data:
Age              0
Glucose          0
BloodPressure    0
SkinThickness    0
Insulin          0
BMI              0
Outcome          0
dtype: int64

NaN counts for Heart Attack Data:
Age                    0
Sex                    0
Cholesterol            0
Heart Rate             0
Diabetes               0
Smoking                0
Obesity                0
Alcohol Consumption    0
Diet                   0
Heart Attack Risk      0
dtype: int64



In [6]:
# Fill the NaN values in the bmi column of the stroke data using mean imputation
print("Before filling:")
print(data_stroke['bmi'].isna().sum())

# Assign the filled Series back to the column rather than calling
# fillna(..., inplace=True) on data_stroke['bmi']: that chained in-place call
# operates on an intermediate Series and, under pandas Copy-on-Write, leaves
# data_stroke unchanged. Column assignment mutates this same DataFrame object,
# so the reference stored in `datasets` sees the filled values too.
# NOTE: the mean is computed over all rows, i.e. before the train/test split.
data_stroke['bmi'] = data_stroke['bmi'].fillna(data_stroke['bmi'].mean())

# Print NaN counts after filling
print("\nAfter filling:")
print(data_stroke['bmi'].isna().sum())


Before filling:
201

After filling:
0


In [7]:
# Label-encode the categorical (object-dtype) columns of a dataset.
def label_encode_dataset(data):
    """Label-encode every object-dtype column of ``data`` in place.

    Each object-dtype column is replaced by the integer codes produced by its
    own ``LabelEncoder``. The fitted encoders are returned so that the
    category-to-code mapping (``encoder.classes_``) stays available for
    decoding values or for encoding new inputs the same way.

    Args:
        data: DataFrame to encode; its object-dtype columns are overwritten.

    Returns:
        dict mapping column name -> fitted ``LabelEncoder``; empty when the
        frame has no object-dtype columns. Callers may ignore it. In a
        notebook, assign the result (or end the line with ``;``) if the call
        is the last expression of a cell and the echo is unwanted.
    """
    encoders = {}
    categorical_columns = data.select_dtypes(include=['object']).columns
    if len(categorical_columns) == 0:
        print("No categorical variables found in the dataset.")
        return encoders

    print("Label encoding the following columns:", categorical_columns)
    for column in categorical_columns:
        # One encoder per column, so each keeps its own classes_ mapping.
        encoder = LabelEncoder()
        data[column] = encoder.fit_transform(data[column])
        encoders[column] = encoder
    return encoders

# Label encode each dataset, announcing which one is being processed
encoding_jobs = (
    ("Label encoding Stroke Data:", data_stroke),
    ("\nLabel encoding Diabetes Data:", data_diabete),
    ("\nLabel encoding Heart Attack Data:", data_heart_attack),
)
for banner, frame in encoding_jobs:
    print(banner)
    label_encode_dataset(frame)

Label encoding Stroke Data:
Label encoding the following columns: Index(['gender', 'ever_married', 'work_type', 'Residence_type',
       'smoking_status'],
      dtype='object')

Label encoding Diabetes Data:
No categorical variables found in the dataset.

Label encoding Heart Attack Data:
Label encoding the following columns: Index(['Sex', 'Diet'], dtype='object')


In [8]:
# Inspect the 'Diet' column: its first ten values and its distinct values.
diet_codes = data_heart_attack['Diet']
print(diet_codes.head(10))
print(diet_codes.unique())

0    0
1    2
2    1
3    0
4    2
5    2
6    1
7    0
8    0
9    2
Name: Diet, dtype: int64
[0 2 1]


In [9]:
# In every dataset the target variable is the last column.
def split_data(data, test_size=0.2, random_state=42, stratify=False):
    """Split a dataset into training and testing features/targets.

    All columns except the last are used as features; the last column is the
    target.

    Args:
        data: DataFrame whose last column is the target.
        test_size: passed to ``train_test_split`` (default 0.2).
        random_state: seed passed to ``train_test_split`` (default 42).
        stratify: when True, the target is passed as ``train_test_split``'s
            ``stratify`` argument; the default (False) passes ``None``,
            matching a call without that argument.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    X = data.iloc[:, :-1]  # every column but the last -> features
    y = data.iloc[:, -1]   # last column -> target
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )
    return X_train, X_test, y_train, y_test

def perform_model(X_train, X_test, y_train, y_test):
    """Fit a Random Forest classifier and score it on the test split.

    A ``RandomForestClassifier`` with 100 estimators and ``random_state=42``
    is fitted on the training data, used to predict the test features, and
    scored with ``accuracy_score`` against ``y_test``.

    Returns:
        Tuple ``(accuracy, model)``: the test accuracy and the fitted model.
    """
    classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    return accuracy_score(y_test, predictions), classifier

def save_model(model, name):
    """Save ``model`` with joblib under a filename derived from ``name``.

    The name is lower-cased, each space becomes an underscore, and
    ``_model.joblib`` is appended (e.g. "Stroke Data" ->
    "stroke_data_model.joblib"). A confirmation line is printed afterwards.
    """
    slug = "_".join(name.lower().split(' '))
    filename = f"{slug}_model.joblib"
    joblib.dump(model, filename)
    print(f"Model saved as {filename}")
    
def train_models_and_save(datasets):
    """Train, evaluate and save one Random Forest model per dataset.

    Args:
        datasets: iterable of ``(name, DataFrame)`` pairs.

    For each pair the frame is split with ``split_data``, a model is fitted
    and scored with ``perform_model``, the accuracy is printed, and the model
    is written out with ``save_model`` using the dataset name.
    """
    for entry in datasets:
        name, data = entry
        print("Random Forest for", name)
        splits = split_data(data)
        accuracy, model = perform_model(*splits)
        print("Accuracy:", accuracy)
        save_model(model, name)
        print()

# Train a model for each (name, DataFrame) pair in `datasets`, print its
# accuracy, and save it to a .joblib file named after the dataset.
train_models_and_save(datasets)


Random Forest for Stroke Data
Accuracy: 0.9393346379647749
Model saved as stroke_data_model.joblib

Random Forest for Diabetes Data
Accuracy: 0.6948051948051948
Model saved as diabetes_data_model.joblib

Random Forest for Heart Attack Data
Accuracy: 0.6183685111237878
Model saved as heart_attack_data_model.joblib

