In [None]:
def initialize_df(path="MIT-BIH Arrhythmia Database.csv"):
    """Load the dataset CSV and prepare it for modelling.

    Steps performed:
      1. Read the CSV at ``path``.
      2. Drop the ``record`` column when it is present.
      3. Discard every row whose ``type`` value is ``'Q'``.
      4. Replace the ``type`` column by the integer codes produced by
         ``LabelEncoder``.

    Args:
        path: Location of the CSV file. Defaults to the file name the
            notebook originally hard-coded.

    Returns:
        The cleaned DataFrame with an integer-encoded ``type`` column.
    """
    df = pd.read_csv(path)

    # errors='ignore' makes the drop a no-op when the column is absent.
    df = df.drop(columns=['record'], errors='ignore')

    # Take an explicit copy of the filtered rows so that the column assignment
    # below writes to an independent frame (avoids SettingWithCopyWarning).
    df = df[df['type'] != 'Q'].copy()

    label_encoder = LabelEncoder()
    df['type'] = label_encoder.fit_transform(df['type'])
    return df

def initialize_data(df):
    """Split ``df`` into a feature frame and the ``type`` target column.

    Returns:
        A ``(X, y)`` tuple: ``X`` is ``df`` without the ``type`` column and
        ``y`` is the ``type`` column itself.
    """
    target = df['type']
    features = df.drop('type', axis=1)
    return features, target

def clean_data(X, y, test_size=0.9, random_state=42):
    """Split, standardise and class-balance the data.

    The data is split into train and test parts, a ``StandardScaler`` is
    fitted on the training part only and applied to both parts, and SMOTE
    is then applied to the scaled training part. The test part is never
    resampled.

    Args:
        X: Feature matrix.
        y: Target labels.
        test_size: Forwarded to ``train_test_split``. The default of 0.9
            reproduces the original hard-coded value.
        random_state: Seed used for both the split and SMOTE. The default
            of 42 reproduces the original hard-coded value.

    Returns:
        ``(X_train_resampled, y_train_resampled, X_test_scaled, y_test)``.
    """
    # NOTE(review): the default test_size=0.9 trains on only 10% of the rows
    # and evaluates on the remaining 90%. Confirm this is intentional (e.g. to
    # keep repeated fits cheap) and not a typo for 0.1.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit the scaler on the training data only, then apply it to both parts,
    # so no statistics from the test part are used for scaling.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Apply SMOTE to the training data only, to balance the classes.
    smote = SMOTE(random_state=random_state)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)
    return X_train_resampled, y_train_resampled, X_test_scaled, y_test

def get_accuracy(y_pred, y_test):
    """Return the fraction of predictions that exactly match the true labels.

    The previous formula, ``1 - sum(|y_pred - y_test|) / n``, is only an
    accuracy for 0/1 labels. With integer-encoded multi-class labels it
    weights each miss by the numeric distance between the class codes, which
    is meaningless and can even produce negative values.

    Args:
        y_pred: Predicted labels (any array-like).
        y_test: True labels (any array-like of the same shape).

    Returns:
        Accuracy as a float in ``[0.0, 1.0]``.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    # Convert to plain arrays so the comparison is positional and does not
    # depend on a pandas index carried by either argument.
    predicted = np.asarray(y_pred)
    actual = np.asarray(y_test)
    if predicted.shape != actual.shape:
        raise ValueError(
            f"shape mismatch: y_pred {predicted.shape} vs y_test {actual.shape}"
        )
    return float(np.mean(predicted == actual))

def test_prediction(df):
    """Train a small random forest on ``df`` and return its test accuracy.

    ``df`` is split into features and target with ``initialize_data`` and
    then passed through ``clean_data``. A 10-tree ``RandomForestClassifier``
    is fitted on the resulting training part, and its predictions on the
    test part are scored with ``get_accuracy``.

    Returns:
        The value returned by ``get_accuracy`` (a score, not the predictions).
    """
    features, target = initialize_data(df)
    train_X, train_y, test_X, test_y = clean_data(features, target)

    # Weak learner: a deliberately small forest
    weak_learner = sklearn.ensemble.RandomForestClassifier(
        n_estimators=10,
        random_state=42,
        n_jobs=-1,
    )
    weak_learner.fit(train_X, train_y)

    return get_accuracy(weak_learner.predict(test_X), test_y)

def print_statistics(y_pred, y_test):
    """Print the confusion matrix, accuracy, precision, recall and F1 score.

    Args:
        y_pred: Predicted labels.
        y_test: True labels.

    Returns:
        None. All results are written to standard output.
    """
    #Visualize the Confusion Matrix
    print("The Confusion Matrix of the Model is:\n", confusion_matrix(y_test, y_pred))
    #Calculate the Accuracy
    accuracy = get_accuracy(y_pred, y_test)
    print("The Accuracy of the Model is:", accuracy, "/ 1.0")
    #Calculate the Precision, Recall, and F1 Score
    # The scikit-learn signature is (y_true, y_pred). Passing the arguments
    # the other way round swaps the reported precision and recall.
    prf = precision_recall_fscore_support(y_test, y_pred, average='weighted')
    print("The Precision of the Model is:", prf[0], "/ 1.0")
    print("The Recall of the Model is:", prf[1], "/ 1.0")
    print("The F1 Score is:", prf[2])
    print()

Feature Selection: Filter Method

In [None]:
# Filter method: score every column by its Pearson correlation with the
# encoded "type" label. The "type" column itself is included in the scan.
df = initialize_df()
res = [(col, stats.pearsonr(df[col], df["type"]).statistic) for col in df.columns]

# Report, then discard, the columns whose |r| falls below the cut-off.
# All features have a coefficient < 0.5, so the cut-off is lowered to 0.1.
print("Uncorrelated Features:")
weak_features = [(name, coeff) for name, coeff in res if np.abs(coeff) < 0.1]
for name, coeff in weak_features:
    print(name, coeff)
df = df.drop(columns=[name for name, _ in weak_features])

# Score the weak learner on the reduced feature set. Despite its name,
# y_pred_filter holds the accuracy returned by test_prediction.
y_pred_filter = test_prediction(df)

Feature Selection: Wrapper Method

In [None]:
# Wrapper method (backward elimination): start from the full feature set and,
# on each pass, drop the single feature whose removal raises the score the
# most. Stop when no single removal beats the current score.
df = initialize_df()
while True:
    wrap_accuracy = test_prediction(df)
    print("Initial Performance:", wrap_accuracy)

    # Best candidate so far: no feature chosen, score to beat is the baseline.
    best_feature = None
    best_score = wrap_accuracy
    for feature in df.columns.drop('type'):
        score = test_prediction(df.drop(columns=[feature]))
        print("Remove", feature, "Performance", score)
        # Strict comparison: on ties the earliest candidate is kept.
        if score > best_score:
            best_feature, best_score = feature, score

    if best_feature is None:
        print("Current Set Best Performance")
        break

    print("Dropped Feature:", best_feature)
    df = df.drop(columns=[best_feature])

# Score the final feature set. Despite its name, y_pred_wrap holds the
# accuracy returned by test_prediction.
y_pred_wrap = test_prediction(df)