In [None]:
# CS677 Assign 11
# by Zuowen Tang

In [None]:
# ------------------------
# SVM
# ------------------------

In [None]:
import pandas as pd
from sklearn import svm
from sklearn.metrics import confusion_matrix


def getData(year):
    """Load one year of weekly data from ``TMO_weekly_label.csv``.

    Parameters
    ----------
    year : value compared against the ``Year`` column to select rows.

    Returns
    -------
    (X, Y) : the ``Mean Return`` / ``Volatility`` columns of the selected rows
        as a 2-column array, and the matching ``Label`` column values.
    """
    weekly = pd.read_csv('TMO_weekly_label.csv')
    in_year = weekly.loc[weekly['Year'] == year]
    features = in_year[['Mean Return', 'Volatility']].values
    labels = in_year['Label'].values
    return features, labels


def getTable(cm, i):
    """Summarize the i-th 2x2 confusion matrix as accuracy, TPR and TNR.

    The matrix is read with rows = true class and columns = predicted class
    (the layout produced by scikit-learn's ``confusion_matrix``), and the
    class at index 0 is treated as the positive class:

        [[TP, FN],
         [FP, TN]]

    NOTE(review): with string labels sorted alphabetically this makes 'g' the
    positive class when the labels are 'g'/'r' -- confirm against the data.

    Parameters
    ----------
    cm : sequence of 2x2 confusion matrices (arrays or nested lists).
    i : index of the matrix in ``cm`` to summarize.

    Returns
    -------
    pandas.DataFrame with a single row and the columns ``Accuracy``,
    ``True positive rate`` and ``True negative rate``.  A rate whose
    denominator is zero is reported as NaN.
    """
    matrix = cm[i]
    TP = matrix[0][0]
    FN = matrix[0][1]  # truly positive, predicted negative
    FP = matrix[1][0]  # truly negative, predicted positive
    TN = matrix[1][1]

    def rate(hits, total):
        # NaN instead of a ZeroDivisionError / warning when a class has no samples.
        return hits / total if total else float('nan')

    TPR = rate(TP, TP + FN)
    TNR = rate(TN, TN + FP)
    ACC = rate(TP + TN, TP + TN + FP + FN)
    d = {'Accuracy': [ACC], 'True positive rate': [TPR], 'True negative rate': [TNR]}
    return pd.DataFrame(data=d)

In [None]:
def SVM():
    """Fit three SVM kernels on the 2021 weeks, score them on 2022, and compare trading results.

    For each kernel (linear, Gaussian/RBF, degree-2 polynomial) a classifier is
    trained on ``getData(2021)`` and evaluated on ``getData(2022)``; the
    accuracy, the confusion matrix and the ``getTable`` summary are printed.
    Afterwards a "buy only when the predicted label is green" strategy is
    evaluated per kernel by adding up the ``Mean Return`` of the 2022 weeks
    whose predicted label is ``'g'``.

    Returns nothing; every result is printed.
    """
    x_train, y_train = getData(2021)
    x_test, y_test = getData(2022)
    cm, prediction = [], []

    # (display name, estimator) in reporting order:
    #   1-3. linear SVM, 4. Gaussian SVM, 5. polynomial SVM of degree 2.
    kernels = [
        ('linear', svm.SVC(kernel='linear')),
        ('Gaussian', svm.SVC(kernel='rbf')),
        ('polynomial', svm.SVC(kernel='poly', degree=2)),
    ]

    # Task 1-5
    for i, (kern, svm_classifier) in enumerate(kernels):
        svm_classifier.fit(x_train, y_train)
        predicted = svm_classifier.predict(x_test)
        accuracy = svm_classifier.score(x_test, y_test)
        prediction.append(predicted)
        cm.append(confusion_matrix(y_test, predicted))
        dfx = getTable(cm, i)

        print('\nTask', i+1)
        print('Implement a', kern, 'SVM:')
        print('The accuracy is', accuracy)
        print('The confusion matrix is:')
        print(cm[i])
        print(dfx)

    # 6. implement a trading strategy based on the predicted labels for year 2
    # and compare the performance with the "buy-and-hold" strategy.
    # Which strategy results in a larger amount at the end of the year?
    year2_rows = pd.read_csv("TMO_weekly_label.csv")
    year2_rows = year2_rows[year2_rows['Year'] == 2022]
    # Loop-invariant, so it is built once rather than once per kernel.
    meanReturn = list(year2_rows['Mean Return'])

    print("\nTask 4:")
    print('Money earned based on buy-and-hold strategy for Year2:')
    # NOTE(review): this figure is hard-coded, not recomputed from the CSV --
    # confirm it still matches the data file in use.
    print("-2.2672499999999984")

    for (kern, _), predicted in zip(kernels, prediction):
        moneyEarned = 0
        # Pair every predicted label with its week's return instead of assuming
        # exactly 52 weeks; a plain running sum keeps the addition order explicit.
        for label, weekly_return in zip(predicted, meanReturn):
            if label == 'g':
                moneyEarned = moneyEarned + weekly_return
        print('\nNew strategy: only buy when the predicted label is green.')
        print('Money earned based on', kern, 'SVM strategy for Year2:')
        print(moneyEarned)

    # NOTE(review): the conclusion below is a fixed string; it is not derived
    # from the amounts computed above.
    print('\nStrategy based on linear SVM has the largest amount at the end of the year.')

In [None]:
# Run the SVM experiment (tasks 1-6); all results are printed by the function.
SVM()

In [None]:
# ------------------------
# NB, Trees & RF
# ------------------------

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn import svm
import warnings
warnings.filterwarnings('ignore')


def getData(*, train_size=0.5, random_state=None):
    """Q1 data preparation: load the cardiotocography data and split it.

    Reads the "Raw Data" sheet of ``cardiotocography_data_set.xls``, recodes
    ``NSP`` to 1 where the original value is below 2 and to 0 otherwise, and
    splits the ``ASTV``, ``MLTV``, ``Max`` and ``Median`` features together
    with the recoded target.

    Parameters (keyword-only, so a positional call still fails loudly)
    ----------
    train_size : forwarded to ``train_test_split``; defaults to 0.5.
    random_state : forwarded to ``train_test_split``; pass an int to get the
        same split on every run.  The default ``None`` leaves the split
        unseeded.

    Returns
    -------
    (x_train, x_test, y_train, y_test); the targets are kept as single-column
    2-D arrays because they are selected with ``df[["NSP"]]``.
    """
    df = pd.read_excel('cardiotocography_data_set.xls', sheet_name="Raw Data")
    df["NSP"] = np.where(df["NSP"] < 2, 1, 0)
    X = df[["ASTV", "MLTV", "Max", "Median"]].values
    Y = df[["NSP"]].values
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, train_size=train_size, random_state=random_state)
    return x_train, x_test, y_train, y_test


def getTable(cm, i, all=False):
    """Summarize the i-th 2x2 confusion matrix.

    The matrix is read with rows = true class and columns = predicted class
    (the layout produced by scikit-learn's ``confusion_matrix``), and the
    class at index 0 is treated as the positive class:

        [[TP, FN],
         [FP, TN]]

    NOTE(review): with 0/1 targets this makes class 0 the positive class; if
    class 1 should be positive, TP/TN and FP/FN trade places -- confirm.

    Parameters
    ----------
    cm : sequence of 2x2 confusion matrices (arrays or nested lists).
    i : index of the matrix in ``cm`` to summarize.
    all : when true, return the raw counts and rates as a tuple instead of a
        DataFrame.

    Returns
    -------
    ``all=False``: one-row pandas.DataFrame with the columns ``Accuracy``,
    ``True positive rate`` and ``True negative rate``.
    ``all=True``: the tuple ``(TP, FP, FN, TN, TPR, TNR, ACC)`` -- note that
    accuracy comes last.
    A rate whose denominator is zero is reported as NaN.
    """
    matrix = cm[i]
    TP = matrix[0][0]
    FN = matrix[0][1]  # truly positive, predicted negative
    FP = matrix[1][0]  # truly negative, predicted positive
    TN = matrix[1][1]

    def rate(hits, total):
        # NaN instead of a ZeroDivisionError / warning when a class has no samples.
        return hits / total if total else float('nan')

    TPR = rate(TP, TP + FN)
    TNR = rate(TN, TN + FP)
    ACC = rate(TP + TN, TP + TN + FP + FN)
    if all:
        return TP, FP, FN, TN, TPR, TNR, ACC
    d = {'Accuracy': [ACC], 'True positive rate': [TPR], 'True negative rate': [TNR]}
    return pd.DataFrame(data=d)


In [None]:
def Q2_Q7():
    """Run questions 2-7: fit seven classifiers on the cardiotocography split and summarize them.

    Uses the train/test split from ``getData()`` and, in order, fits Naive
    Bayes, logistic regression, a decision tree (entropy criterion), a random
    forest (grid over n_estimators 1..10 and max_depth 1..5, error-rate curves
    plotted per depth) and three SVMs (linear, Gaussian, polynomial degree 2).
    Each model's test accuracy, confusion matrix and ``getTable`` summary are
    printed, followed by a combined Q7 table.

    The random forests are created without ``random_state``, so the selected
    hyper-parameters and the refit accuracy can differ between runs.

    Returns nothing; results are printed and the error-rate curves are drawn on
    the current matplotlib figure.
    """
    x_train, x_test, y_train, y_test = getData()
    # scikit-learn estimators expect a 1-D target; np.ravel flattens a
    # single-column 2-D target and leaves an already 1-D one unchanged.
    y_train, y_test = np.ravel(y_train), np.ravel(y_test)

    cm = []
    # Row labels for the Q7 table, appended together with each confusion matrix
    # so a label can never drift away from the model it describes.
    method = []

    # 2. Use Naive Bayesian NB classifier
    NB_classifier = GaussianNB().fit(x_train, y_train)
    prediction = NB_classifier.predict(x_test)
    accuracy = accuracy_score(y_test, prediction)
    cm.append(confusion_matrix(y_test, prediction))
    method.append('Naive Bayesian')
    dfx = getTable(cm, 0)

    print("\nQ2:")
    print('Implement a Naive Bayesian classifier:')
    print('The accuracy is', accuracy)
    print(cm[0])
    print(dfx)

    # 3. Use Logistic regression classifier
    log_reg_classifier = LogisticRegression()
    log_reg_classifier.fit(x_train, y_train)
    prediction = log_reg_classifier.predict(x_test)
    # Score on the held-out half, like every other model in this function.
    accuracy = log_reg_classifier.score(x_test, y_test)
    cm.append(confusion_matrix(y_test, prediction))
    method.append('Logistic Regression')
    dfx = getTable(cm, 1)

    print("\nQ3:")
    print('Implement a Logistic regression classifier:')
    print('The accuracy is', accuracy)
    print(cm[1])
    print(dfx)

    # 4. Use Decision Tree
    clf = tree.DecisionTreeClassifier(criterion='entropy')
    clf = clf.fit(x_train, y_train)
    prediction = clf.predict(x_test)
    accuracy = accuracy_score(y_test, prediction)
    cm.append(confusion_matrix(y_test, prediction))
    method.append('Decision Tree')
    dfx = getTable(cm, 2)

    print("\nQ4:")
    print('Implement a Decision Tree:')
    print('The accuracy is', accuracy)
    print('Confusion matrix:')
    print(cm[2])
    print(dfx)

    # 5. Use Random Forest classifier
    n_values = range(1, 11)
    depth_values = range(1, 6)
    # (n_estimators, max_depth) -> test error.  Keying by the pair keeps the
    # plot and the arg-min independent of the order the grid is walked in.
    error_rate = {}
    for n in n_values:
        for depth in depth_values:
            rf = RandomForestClassifier(n_estimators=n, max_depth=depth)
            rf.fit(x_train, y_train)
            error_rate[(n, depth)] = 1 - accuracy_score(y_test, rf.predict(x_test))

    # plot the error rate: one curve per max_depth, n_estimators on the x axis
    for depth in depth_values:
        plt.plot(n_values, [error_rate[(n, depth)] for n in n_values],
                 label="max_depth=" + str(depth))
    plt.legend()
    plt.xlabel("n_estimators")
    plt.ylabel("error rate")

    # min() returns the first key with the smallest error in insertion order.
    best_n, best_max = min(error_rate, key=error_rate.get)

    print("\nQ5:")
    print('Implement a Random Forest classifier :')
    print("The best n_estimators and max_depth are", best_n, "and", best_max)

    rf = RandomForestClassifier(n_estimators=best_n, max_depth=best_max)
    rf.fit(x_train, y_train)
    prediction = rf.predict(x_test)
    cm.append(confusion_matrix(y_test, prediction))
    method.append('Random Forest')
    dfx = getTable(cm, 3)
    accuracy = accuracy_score(y_test, prediction)

    print('The accuracy is', accuracy)
    print('Confusion matrix:')
    print(cm[3])
    print(dfx)

    # 6. Use SVM classifier (linear, Gaussian and poly degree 2)
    svm_variants = [
        ('linear SVM', 'linear', svm.SVC(kernel='linear')),
        ('Gaussian SVM', 'Gaussian', svm.SVC(kernel='rbf')),
        ('degree 2 SVM', 'polynomial', svm.SVC(kernel='poly', degree=2)),
    ]
    for i, (label, kern, svm_classifier) in enumerate(svm_variants, start=4):
        svm_classifier.fit(x_train, y_train)
        predicted = svm_classifier.predict(x_test)
        accuracy = svm_classifier.score(x_test, y_test)
        cm.append(confusion_matrix(y_test, predicted))
        method.append(label)
        dfx = getTable(cm, i)

        print('\nQ6 - Task', i-3, ':')
        print('Implement a', kern, 'SVM:')
        print('The accuracy is', accuracy)
        print('Confusion matrix is:')
        print(cm[i])
        print(dfx)

    # 7. Summarize the results of all classifiers in one table.
    rows = []
    for i, label in enumerate(method):
        # getTable(..., True) returns accuracy LAST: (TP, FP, FN, TN, TPR, TNR, ACC).
        TP, FP, FN, TN, TPR, TNR, ACC = getTable(cm, i, True)
        rows.append([label, TP, FP, FN, TN, ACC, TPR, TNR])
    # Built in one go: DataFrame.append was removed in pandas 2.0.
    result_table = pd.DataFrame(
        rows, columns=['Method', 'TP', 'FP', 'FN', 'TN', 'Accuracy', 'TPR', 'TNR'])

    print('\nQ7:')
    print(result_table)

In [None]:
# Run questions 2-7 on the cardiotocography data; all results are printed by the function.
Q2_Q7()