In [1]:
import sys
assert sys.version_info >= (3, 5)
# Python ≥3.5 is required

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import os
import tarfile
import urllib
import urllib.request  # `import urllib` alone does not guarantee the `request` submodule is loaded

import numpy as np
import pandas as pd

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

# import train_test_split
from sklearn.model_selection import train_test_split

# naive bayes and metrics import 
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


In [2]:
# Base URL the dataset CSVs are downloaded from.
DOWNLOAD_ROOT = "http://www.macs.hw.ac.uk/%7Eek19/data/"
# Local directory (under the current working directory) where the CSVs are stored.
CURRENT_PATH = os.path.join(os.getcwd(), "datasets")
# File names of the feature and label CSVs, used both remotely and locally.
X_FILE = "x_train_gr_smpl.csv"
Y_FILE = "y_train_smpl.csv"

def fetch_data(download_root=DOWNLOAD_ROOT, current_path=CURRENT_PATH):
    """Download the feature and label CSVs into ``current_path`` when missing.

    Each file is checked individually, so a directory that holds only one of
    the two files is completed instead of being treated as fully populated.

    Parameters
    ----------
    download_root : str
        URL prefix; the file name is appended to it verbatim.
    current_path : str
        Destination directory; created if it does not exist.

    Returns
    -------
    None
    """
    os.makedirs(current_path, exist_ok=True)

    for file_name in (X_FILE, Y_FILE):
        destination = os.path.join(current_path, file_name)
        # Skip only the file that is already present, never its sibling.
        if os.path.isfile(destination):
            continue
        urllib.request.urlretrieve(download_root + file_name, destination)

In [3]:
# Make sure the dataset CSVs are available locally (downloads on first run).
fetch_data()

In [4]:
def load_features_data(current_path=CURRENT_PATH):
    """Read the feature CSV (``X_FILE``) located in ``current_path``.

    Returns the ``pandas.DataFrame`` produced by ``pd.read_csv`` with its
    default settings.
    """
    features_csv = os.path.join(current_path, X_FILE)
    return pd.read_csv(features_csv)

def load_labels_data(current_path=CURRENT_PATH):
    """Read the label CSV (``Y_FILE``) located in ``current_path``.

    Returns the ``pandas.DataFrame`` produced by ``pd.read_csv`` with its
    default settings.
    """
    labels_csv = os.path.join(current_path, Y_FILE)
    return pd.read_csv(labels_csv)

In [5]:
# Load the feature CSV from the default dataset directory into a DataFrame.
features = load_features_data()
# features.head() # Display the top five rows of the dataframe

In [6]:
# Load the label CSV from the default dataset directory into a DataFrame.
labels = load_labels_data()
# labels.head() # Display the top five rows of the dataframe

In [7]:
def display_prediction(expected, predicted):
    """Print accuracy, classification report and confusion matrix.

    ``expected`` holds the true labels and ``predicted`` the model output;
    both are forwarded unchanged to the scikit-learn metric functions.
    Each section of the printout is framed by a divider line.
    """
    divider = "====================================================="

    print(divider)
    accuracy = accuracy_score(expected, predicted)
    print("Accuracy:", accuracy)

    print(divider)
    report = classification_report(expected, predicted)
    print(report)

    print(divider)
    matrix = confusion_matrix(expected, predicted)
    print(matrix)

    print(divider)

In [10]:
def run_naive_bayes(X, Y, test_size=0.3, random_state=1):
    """Train Gaussian Naive Bayes on a hold-out split and print its metrics.

    Parameters
    ----------
    X : features, forwarded to ``train_test_split`` and ``GaussianNB.fit``.
    Y : labels aligned with ``X``.
    test_size : forwarded to ``train_test_split``; the default 0.3 keeps the
        original 70/30 training/test split.
    random_state : seed forwarded to ``train_test_split``; the default 1 is
        the value the original code used.

    Returns
    -------
    None. Results are printed through ``display_prediction``.
    """
    # Split into training and test sets (70/30 and seed = 1 by default).
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )

    # Fit the classifier on the training portion only.
    model = GaussianNB()
    model.fit(X_train, Y_train)

    # Predict the held-out samples and report the metrics.
    prediction = model.predict(X_test)

    display_prediction(Y_test, prediction)

In [11]:
# Flatten the labels DataFrame to a 1-D array before handing it to the classifier.
run_naive_bayes(features, labels.values.ravel())

Accuracy: 0.23976608187134502
              precision    recall  f1-score   support

           0       0.14      0.75      0.23        55
           1       0.43      0.22      0.29       669
           2       0.41      0.19      0.26       669
           3       0.43      0.23      0.30       418
           4       0.46      0.14      0.21       613
           5       0.14      0.24      0.17        59
           6       0.09      0.71      0.17       111
           7       0.09      0.55      0.16        80
           8       0.81      0.26      0.39       150
           9       0.27      0.31      0.29        83

    accuracy                           0.24      2907
   macro avg       0.33      0.36      0.25      2907
weighted avg       0.41      0.24      0.26      2907

[[ 41   1   7   2   1   0   0   3   0   0]
 [179 150  82  22  35  14  89  89   1   8]
 [ 30 105 125  57  53  35 119 132   0  13]
 [ 12  15   6  96  10  12 228  26   3  10]
 [ 25  59  68  37  83  12 262  57   0  