In [7]:
# Import definition
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelBinarizer
from sklearn.neighbors import KNeighborsClassifier
from scipy.io import arff
from scipy.stats import ttest_ind
import pandas as pd
from sklearn.naive_bayes import MultinomialNB

GROUP_NUMBER = 16  # Our group number; also passed as random_state to seed the cross-validation shuffle

In [8]:
# Reads the ARFF file; the first element of the returned pair holds the records
data = arff.loadarff('breast.w.arff')

# Builds the working DataFrame and discards every row that has at least one
# missing value (chained call instead of an in-place mutation)
df = pd.DataFrame(data[0]).dropna(axis=0, how="any")

In [9]:
# Gets X (data matrix): every column except the target
X = df.drop("Class", axis=1).to_numpy()

# Maps the raw class values to "yes" (benign) / "no" (anything else).
# A fresh list is built on purpose: the array returned by to_numpy() may share
# memory with df, so assigning into it would overwrite df["Class"] and a second
# run of this cell would then label every row as "no".
labels = ["yes" if value == b"benign" else "no" for value in df["Class"]]

# Turns the labels into y, a binary column matrix (benign is 1)
lb = LabelBinarizer()
y = lb.fit_transform(labels)

# Holds the per-fold accuracy of each model, later used as input of the t-test
knn_acc = []
mnb_acc = []

In [10]:
# Creates a stratified 10-fold cross validator; seeding the shuffle with the
# group number keeps the folds reproducible
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=GROUP_NUMBER)

# Creates KNN classifier for 3 neighbours (uniform weights, Minkowski p=2,
# i.e. Euclidean distance)
knn = KNeighborsClassifier(3, weights="uniform", p=2, metric="minkowski")

# Starts from an empty list so that re-running this cell does not stack a
# second batch of fold accuracies on top of the previous one
knn_acc = []

# For each train/test split, fits the KNN classifier and records its accuracy
for train_index, test_index in skf.split(X, y):

    # Uses indexes to fetch which values are going to be used to train and test
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Trains knn classifier (ravel flattens the column matrix into a 1-D vector)
    knn.fit(X_train, y_train.ravel())

    # Scores the model on the held-out fold and keeps the accuracy, which is
    # later used as input of the t-test that compares knn against mnb
    knn_acc.append(knn.score(X_test, y_test))

# Mean accuracy over the folds that were actually evaluated
print(sum(knn_acc) / len(knn_acc))

0.9707374254049445


In [11]:
# Creates a Multinomial Naive Bayes classifier (since the question tells us to use "multinomial assumption")
mnb = MultinomialNB()

# Starts from an empty list so that re-running this cell does not stack a
# second batch of fold accuracies on top of the previous one
mnb_acc = []

# For each train/test split, trains a Naive Bayes classifier and records its
# accuracy. skf is the cross validator created for the KNN run, reused here.
for train_index, test_index in skf.split(X, y):

    # Uses indexes to fetch which values are going to be used to train and test
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Trains mnb classifier (ravel flattens the column matrix into a 1-D vector)
    mnb.fit(X_train, y_train.ravel())

    # Scores the model on the held-out fold and keeps the accuracy, which is
    # later used as input of the t-test that compares mnb against knn
    mnb_acc.append(mnb.score(X_test, y_test))

# Mean accuracy over the folds that were actually evaluated
print(sum(mnb_acc) / len(mnb_acc))

0.9033887468030691


In [12]:
# Compares the per-fold accuracies of both models with an independent
# two-sample, two-sided t-test (NaN entries, if any, are left out).
# NOTE(review): both lists are measured on folds produced by the same seeded
# splitter, so a paired test (scipy.stats.ttest_rel) may be the more
# appropriate choice — confirm against the assignment statement.
ttest_result = ttest_ind(
    knn_acc,
    mnb_acc,
    nan_policy="omit",
    alternative="two-sided",
)
statistic, p_value = ttest_result.statistic, ttest_result.pvalue

print(f"statistic: {statistic} | p_value: {p_value}")


statistic: 6.152729145887098 | p_value: 8.255367260782866e-06
