In [9]:
import scipy.io
import random
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import importlib
import sys 
import os
sys.path.append(".")
# reload module in case that module changes
import text_preprocessing
importlib.reload(text_preprocessing)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jsjhf\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\jsjhf\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\jsjhf\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


<module 'text_preprocessing' from 'C:\\Users\\jsjhf\\Desktop\\course\\coursework\\NLP\\hw\\hw2\\text_preprocessing.py'>

In [10]:
# load txt files
# Every sub-directory of base_path is treated as one class, and its position in
# class_paths becomes the integer label of the files inside it.
# os.listdir returns entries in arbitrary order, so both listings are sorted to
# keep the label ids and the file order (and therefore the seeded shuffle and
# split further down) reproducible across machines and runs.
base_path = "20_newsgroups"
class_paths = sorted(
    name for name in os.listdir(base_path)
    # skip stray files (e.g. archives, OS metadata) that are not class folders
    if os.path.isdir(os.path.join(base_path, name))
)
txt_paths = []
txt_labels = []
for i, class_name in enumerate(class_paths):
    class_dir = os.path.join(base_path, class_name)
    for file in sorted(os.listdir(class_dir)):
        txt_paths.append(os.path.join(class_dir, file))
        txt_labels.append(i)

In [11]:
# text preprocessing
# Runs the project-local text_preprocessing.text_preprocessing helper over every
# collected file path; the result is kept in `data` and saved by the next cell.
# NOTE(review): later cells slice the reloaded array as data[:n, :], so this
# presumably yields one numeric feature row per document, in the same order as
# txt_paths / txt_labels - confirm against text_preprocessing.py.
data = text_preprocessing.text_preprocessing(txt_paths)

Start preprocessing...
current: 2000
current: 4000
current: 6000
current: 8000
current: 10000
current: 12000
current: 14000
current: 16000
current: 18000
Finish preprocessing !


In [12]:
# save preprocessed data
# The labels are stored as an (n, 1) column next to the preprocessed features so
# both can be reloaded from a single .mat file without re-running preprocessing.
# savemat does not create missing parent directories, so make sure data/ exists
# first (otherwise this cell fails on a fresh checkout).
os.makedirs('data', exist_ok=True)
news_data = {"data": data, "labels": np.array(txt_labels).reshape((len(txt_labels), 1))}
scipy.io.savemat('data/news_data.mat', news_data)

In [13]:
# load data
# Read back the feature rows and label column written by the save cell.
load_data = scipy.io.loadmat('data/news_data.mat')

# shuffle
# Pair every row with its label so both move together, shuffle the pairs with a
# fixed seed (reproducible split), then unzip back into two arrays.
zipped_data = list(zip(load_data['data'], load_data['labels']))
random.seed(0)
random.shuffle(zipped_data)
new_zipped_data = [list(column) for column in zip(*zipped_data)]
news_data = np.array(new_zipped_data[0])
news_labels = np.array(new_zipped_data[1])

# split data into training, validation and test sets
# Row boundaries: [0, train_end) -> training, [train_end, validation_end) ->
# validation, everything after validation_end -> test.
train_end, validation_end = 15000, 17500
training_data, training_labels = news_data[:train_end, :], news_labels[:train_end]
validation_data, validation_labels = (
    news_data[train_end:validation_end, :],
    news_labels[train_end:validation_end],
)
test_data, test_labels = news_data[validation_end:, :], news_labels[validation_end:]

In [14]:
# Naive Bayes Classifier
class NBC:
    """Naive Bayes classifier over non-negative per-feature counts.

    Training estimates, in log space, the class probabilities P(Y=c) and the
    smoothed per-class feature probabilities P(F=f | Y=c). Prediction picks the
    class with the highest score
    ``log P(Y=c) + sum_f log(1 + x_f) * log P(F=f | Y=c)``.

    Class labels are assumed to be the integers ``0 .. class_size - 1``
    (``class_size`` is taken as the number of distinct labels seen in ``fit``).
    """

    def __init__(self):
        # log P(Y=c), shape (class_size,); allocated by fit()
        self.log_Pc = 0

        # log P(F=f | Y=c), shape (class_size, d); allocated by fit().
        # The attribute is called "prior" for historical reasons, but it holds
        # the per-class feature log-likelihoods.
        self.log_prior = 0

        # training size
        self.N = 0

        # dimension (number of features) and number of classes
        self.d = 0
        self.class_size = 0

    def compute_log_Pc(self, data, labels):
        """Fill ``self.log_Pc[c]`` with log(#samples of class c / N).

        Expects ``self.N``, ``self.class_size`` and ``self.log_Pc`` to have been
        set up already (``fit`` does this).
        """
        flat_labels = np.asarray(labels).ravel()
        log_training_size = np.log(self.N)
        for c in range(self.class_size):
            class_count = np.count_nonzero(flat_labels == c)
            self.log_Pc[c] = np.log(class_count) - log_training_size

    def compute_log_prior(self, data, labels, k):
        """Fill ``self.log_prior[c, f]`` with the smoothed log P(F=f | Y=c).

        For every class the estimate is
        ``(count of f in class c + k) / (total count in class c + k * d)``,
        i.e. add-k smoothing over the ``d`` features. ``k`` should be positive
        so that features unseen in a class keep a finite log-probability.
        Expects ``self.d``, ``self.class_size`` and ``self.log_prior`` to have
        been set up already (``fit`` does this).
        """
        data = np.asarray(data)
        flat_labels = np.asarray(labels).ravel()
        for c in range(self.class_size):
            c_data = data[flat_labels == c, :]
            # per-feature totals for this class, computed in one reduction
            feature_counts = c_data.sum(axis=0)
            all_cnt = c_data.sum()
            # the smoothing denominator uses the model's own dimension, not a
            # notebook-level global
            self.log_prior[c] = np.log(feature_counts + k) - np.log(all_cnt + k * self.d)

    def fit(self, data, labels, k):
        """Estimate the model from ``data`` (n x d) and ``labels`` (n or n x 1).

        ``k`` is the additive smoothing constant used for the feature
        probabilities. Prints a start and a finish message.
        """
        print("Start fitting...")
        data = np.asarray(data)
        labels = np.asarray(labels)
        self.N = data.shape[0]
        self.d = data.shape[1]
        self.class_size = len(set(labels.ravel()))
        self.log_Pc = np.zeros((self.class_size, ))
        self.log_prior = np.zeros((self.class_size, self.d))
        self.compute_log_Pc(data, labels)
        # fit on the data that was passed in, not on whatever happens to be
        # bound to a global name in the notebook
        self.compute_log_prior(data, labels, k)
        print("Finish fitting !")

    # Input: n×d
    # Output: list of n predicted class indices
    def predict(self, data):
        """Return the predicted class index for every row of ``data``.

        The result is a plain Python list with one int per row; an empty input
        yields an empty list. Ties go to the lowest class index.
        """
        data = np.asarray(data)
        if data.size == 0:
            return []
        # Each feature's log-probability is weighted by log(1 + count) rather
        # than by the raw count, which damps the effect of very frequent terms.
        log_tf = np.log(1 + data)
        # (n, d) @ (d, class_size) + (class_size,) -> one score per row and class
        scores = log_tf @ self.log_prior.T + self.log_Pc
        return np.argmax(scores, axis=1).tolist()

    def accuracy(self, data, labels):
        """Return the fraction of rows of ``data`` whose prediction equals ``labels``.

        Prints a start and a finish message around the prediction step.
        """
        print("Start computing accuracy...")
        predictions = np.asarray(self.predict(data))
        truth = np.asarray(labels).ravel()
        n_accuracy = (predictions == truth).sum()
        print("Finish computing accuracy !")
        return n_accuracy / len(truth)

In [15]:
# Fit the classifier on the training split with smoothing constant k=1, then
# report accuracy on the first 1000 validation rows (the cell's displayed value).
nbc = NBC()
nbc.fit(training_data, training_labels, k=1)
nbc.accuracy(validation_data[:1000], validation_labels[:1000])

Start fitting...
Finish fitting !
Start computing accuracy...
Finish computing accuracy !


0.837