In [3]:
import pandas as pd
import numpy as np

In [14]:
class Preprocessor:
    """Load the raw train/test CSV files and convert them to numeric arrays.

    On construction both files are read with ``pandas.read_csv`` and
    preprocessed immediately.  All statistics used for imputation and
    standardization (medians, means, standard deviations) and the final
    one-hot column layout are learned from the training data only and are
    then re-applied unchanged to the test data.

    Attributes set by ``__init__``:
        traind_raw, testd_raw: the raw DataFrames as read from disk.
        traind_prep, trainl_prep: preprocessed training features / labels.
        testd_prep, testl_prep: preprocessed test features / labels
            (``testl_prep`` is ``None`` when the test file has no label column).
    """

    def __init__(self, traind_raw_filename, testd_raw_filename):
        """Read both CSV files and preprocess them.

        Args:
            traind_raw_filename: path (or file-like object) of the training CSV.
            testd_raw_filename: path (or file-like object) of the test CSV.
        """
        self.traind_raw = pd.read_csv(traind_raw_filename)
        self.testd_raw = pd.read_csv(testd_raw_filename)

        # Split the attributes into different categories
        self.numeric_cols = ["Participant ID", "Age", "Income", "Systolic", "Diastolic",
                             "Pulse", "BMI", "HDL", "Trig", "LDL", "TCHOL", "eGFR"]
        self.categorical_cols = ["Sex", "Race", "CurrentSmoker", "Diabetes", "Insurance"]
        self.ordinal_cols = ["Edu"]
        self.label_col = "stroke"

        # Define attribute interactions: {new column name: [columns to multiply]}
        self.interactions = {
        }

        # Drop specific columns from the data set
        # Part ID: Useless attribute
        # LDL: 56% data missing
        # Trig: 56% data missing
        # eGFR: Strong negative correlation with Age
        self.dropped_cols = ["Participant ID", "LDL", "Trig", "eGFR"]

        # Initialize empty stat dicts (replaced by pandas Series once the
        # training data has been preprocessed)
        self.means = {}
        self.stds = {}

        # Preprocess the data.  The training set must go first because it
        # establishes the statistics and column layout; the test set is then
        # processed with test_data=True so that it is neither resampled nor
        # allowed to overwrite those training statistics.
        self.traind_prep, self.trainl_prep = self._preprocess(self.traind_raw)
        self.testd_prep, self.testl_prep = self._preprocess(self.testd_raw, test_data=True)

    def _preprocess(self, data, test_data=False):
        """Clean, impute, encode and standardize one data set.

        Args:
            data: raw DataFrame containing the configured columns.
            test_data: when False (training mode) the rows are re-balanced and
                the imputation/standardization statistics and the one-hot
                column layout are (re)computed and stored on ``self``.  When
                True the stored training values are applied without change.

        Returns:
            A tuple ``(x, y)``: ``x`` is a float ndarray of features and ``y``
            is an ndarray of labels, or ``None`` if ``data`` has no label
            column.
        """
        data_c = data.copy()
        has_label = self.label_col in data_c.columns

        # Drop any row missing the target label
        if has_label:
            data_c = data_c.dropna(subset=[self.label_col])

        # Resample (training only): keep every row labelled 1 (treated as the
        # minority class) and an equally sized random sample of rows labelled 2.
        if not test_data:
            labels = data_c[self.label_col]
            minority = data_c[labels == 1]
            majority_pool = data_c[labels == 2]
            # Never request more rows than exist, which would make sample() raise.
            n_majority = min(len(minority), len(majority_pool))
            majority = majority_pool.sample(n_majority, random_state=20)
            data_c = pd.concat([minority, majority])

        # Interaction columns are derived later from imputed values, so the
        # "base" numeric columns are the ones actually present in the raw data.
        base_numeric = [c for c in self.numeric_cols if c not in self.interactions]

        # Calculate stats of training data (before imputation, NaNs are skipped)
        if not test_data:
            self.training_data_numeric_medians = data_c[base_numeric].median()
            self.training_data_ordinal_medians = data_c[self.ordinal_cols].median()
            self.means = data_c[base_numeric].mean()
            self.stds = data_c[base_numeric].std(ddof=0)

        # Impute missing data with the training medians
        data_c[base_numeric] = data_c[base_numeric].fillna(self.training_data_numeric_medians)
        data_c[self.ordinal_cols] = data_c[self.ordinal_cols].fillna(self.training_data_ordinal_medians)

        # Create interaction columns from the imputed data.  Doing this after
        # imputation, for training and test data alike, keeps both sets
        # consistent and removes the need to compute the products twice.
        for new_col, cols in self.interactions.items():
            data_c[new_col] = data_c[cols].prod(axis=1)
            if not test_data:
                # Guard against duplicates if training preprocessing is re-run.
                if new_col not in self.numeric_cols:
                    self.numeric_cols.append(new_col)
                self.means[new_col] = data_c[new_col].mean()
                self.stds[new_col] = data_c[new_col].std(ddof=0)
                self.training_data_numeric_medians[new_col] = data_c[new_col].median()

        # One-hot encoded vars
        data_c = pd.get_dummies(data_c, columns=self.categorical_cols)
        if not test_data:
            self.used_columns = data_c.columns
        else:
            # Align to the training layout; categories unseen in this data set
            # become all-zero columns.  If this data set has no label column,
            # do not let reindex fabricate one filled with zeros.
            target_columns = self.used_columns
            if not has_label:
                target_columns = target_columns.drop(self.label_col, errors="ignore")
            data_c = data_c.reindex(columns=target_columns, fill_value=0)

        # Standardize numeric vars.  A constant training column has std 0;
        # divide by 1 instead so it becomes 0 rather than NaN/inf.
        safe_stds = self.stds.replace(0, 1)
        data_c[self.numeric_cols] = (data_c[self.numeric_cols] - self.means) / safe_stds

        data_c = data_c.drop(columns=self.dropped_cols)
        if self.label_col in data_c.columns:
            x = data_c.drop(columns=[self.label_col]).values
            y = data_c[self.label_col].values
        else:
            x = data_c.values
            y = None
        return x.astype(float), y

In [22]:
# File names of the raw training and testing CSVs (relative paths).
trainingDataFileName = "P2_data_stroke_train.csv"
testingDataFileName = "P2_data_stroke_test.csv"

# Constructing the Preprocessor reads both CSV files and preprocesses them
# right away; the results are stored on the instance as
# pp.traind_prep / pp.trainl_prep and pp.testd_prep / pp.testl_prep.
pp = Preprocessor(trainingDataFileName, testingDataFileName)
