In [1]:
# load data

import numpy as np

# Training set: after the transpose, column 0 is read as text and column 1 as
# the label by the cells below.
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects, which can
# execute code while loading -- only point this at files you trust.
train_data = np.array(np.load('data_train.pkl', allow_pickle=True)).T

# Test set: loaded the same way but not transposed; later cells iterate over it
# one document at a time.
test_data = np.array(np.load('data_test.pkl', allow_pickle=True))

In [2]:
# text preprocessing

import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

# Download every NLTK resource used in this notebook, not only the stopword
# list: nltk.word_tokenize needs the punkt models, nltk.pos_tag needs the
# averaged-perceptron tagger and WordNetLemmatizer needs the wordnet corpus.
# Without them a fresh environment fails with LookupError during cleaning.
# Both the legacy and the newer ("_tab" / "_eng") resource names are requested
# because which one is looked up depends on the installed NLTK version;
# nltk.download skips packages that are already up to date.
for resource in (
    'stopwords',
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
):
    nltk.download(resource)

# Splits text into runs of word characters (letters, digits, underscore).
tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# A set makes the per-token stopword membership test constant time.
STOPWORDS = set(stopwords.words('english'))

# function to convert nltk tag to wordnet tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    Only the leading letter of the tag decides the result:
    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb.

    Returns:
        The corresponding ``wordnet`` constant, or None when the tag starts
        with none of those letters.
    """
    prefix_to_constant = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, constant_name in prefix_to_constant:
        if nltk_tag.startswith(prefix):
            # Resolve the attribute only for the branch that matched.
            return getattr(wordnet, constant_name)
    return None
    
# function to lemmatize sentences
def lemmatize_sentence(sentence):
    """Lemmatize every token of ``sentence`` using its part-of-speech tag.

    The sentence is tokenized with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``. Tokens whose tag maps to a WordNet POS are lemmatized
    with that POS; all other tokens are kept unchanged.

    Returns:
        The resulting tokens joined by single spaces.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))

    lemmas = []
    for token, pos_tag in tagged_tokens:
        wordnet_pos = nltk_tag_to_wordnet_tag(pos_tag)
        # No WordNet equivalent for this tag: keep the token as it is.
        lemma = token if wordnet_pos is None else lemmatizer.lemmatize(token, wordnet_pos)
        lemmas.append(lemma)

    return " ".join(lemmas)


def clean_text(text):
    """Normalize one raw document into a space-separated string of stems.

    Steps, in order:
      1. split into word-character tokens with the module-level ``tokenizer``;
      2. lowercase every token;
      3. drop tokens that are not purely alphabetic;
      4. drop tokens found in ``STOPWORDS``;
      5. lemmatize the remaining text with ``lemmatize_sentence``;
      6. re-tokenize and stem each token with the module-level ``stemmer``.

    Args:
        text: the raw document string.

    Returns:
        The cleaned document; an empty string when no token survives.
    """
    # Lowercase with str.lower instead of np.char.lower: NumPy turns an empty
    # token list into a float array and then raises, so a document without any
    # word characters used to crash the whole preprocessing pass.
    lowered = [token.lower() for token in tokenizer.tokenize(text)]

    # keep alphabetic, non-stopword tokens only
    kept = [word for word in lowered if word.isalpha() and word not in STOPWORDS]

    # lemmatize words
    lemmatized = lemmatize_sentence(' '.join(kept))

    # stemming
    stems = [stemmer.stem(word) for word in tokenizer.tokenize(lemmatized)]
    return ' '.join(stems)


# Clean the text column of the training array in place, then every test document.
# NOTE: both statements overwrite their own inputs, so re-running this cell
# cleans text that has already been cleaned.
train_data[:, 0] = np.array([clean_text(document) for document in train_data[:, 0]])
test_data = np.array([clean_text(document) for document in test_data])


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\draby\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# BAG OF WORDS

# create a dictionary of word frequencies
# Maps each token of the cleaned training texts to its total number of
# occurrences, in order of first appearance.
word_dict = {}
for text in train_data[:, 0]:
    # split() without an argument yields no tokens for an empty document,
    # whereas split(' ') yields [''] and would count the empty string as a word.
    for token in text.split():
        word_dict[token] = word_dict.get(token, 0) + 1

In [84]:
# use only the words that have a frequency greater than 1
# Vocabulary: every token counted more than once, in word_dict's order.
word_dict_reduced = {term: count for term, count in word_dict.items() if count > 1}

In [85]:
# Vocabulary size after dropping the words that were seen only once.
len(word_dict_reduced)

24833

In [7]:
# create a matrix of word frequencies for each example
def vectorize(text, vocabulary=None):
    """Turn one cleaned document into a max-normalized bag-of-words vector.

    Args:
        text: whitespace-separated tokens of one document.
        vocabulary: iterable of terms defining the vector's components and
            their order. Defaults to the module-level ``word_dict_reduced``.

    Returns:
        A 1-D float array with one entry per vocabulary term: the term's count
        in ``text`` divided by the largest count in the vector. The vector is
        all zeros when the document contains no vocabulary term.
    """
    from collections import Counter

    if vocabulary is None:
        vocabulary = word_dict_reduced

    # Count the document's tokens once, then look each vocabulary term up,
    # instead of scanning the whole token array once per vocabulary term.
    counts = Counter(text.split())
    sentence_vector = np.array([counts[term] for term in vocabulary], dtype=float)

    # normalize the frequencies; the size check keeps an empty vocabulary from
    # crashing in max(), and the > 0 check avoids dividing by zero
    if sentence_vector.size and sentence_vector.max() > 0:
        sentence_vector = sentence_vector / sentence_vector.max()

    return sentence_vector
    
    
# Feature matrix: one bag-of-words row per cleaned training document.
X = np.array(list(map(vectorize, train_data[:, 0])))
# Labels are taken from the second column of the training array.
y = train_data[:, 1]
# NOTE: this replaces the cleaned test texts with their feature vectors, so the
# cell cannot be re-run on its own.
test_data = np.array(list(map(vectorize, test_data)))


In [None]:
# ______________NAIVE BAYES____________

In [8]:
# split our training and validation data
import random

def data_split(X, y, ratio, seed=3395):
    """Shuffle the rows of ``X``/``y`` and split them into train and validation sets.

    Args:
        X: 2-D array with one row per example.
        y: 1-D array of labels aligned with the rows of ``X``.
        ratio: fraction of the rows assigned to the training set; the number
            of training rows is ``int(n_rows * ratio)``.
        seed: seed of the shuffle. The default reproduces the permutation the
            previous hard-coded seed produced.

    Returns:
        ``(train_X, validation_X, train_y, validation_y)``.
    """
    n_samples = X.shape[0]

    # A private generator gives the same permutation as seeding the module-level
    # one, without resetting the global random state for every other caller.
    inds = list(range(n_samples))
    random.Random(seed).shuffle(inds)

    n_train = int(n_samples * ratio)
    train_inds = inds[:n_train]
    validation_inds = inds[n_train:]

    # split the data into both sets
    train_X = X[train_inds, :]
    validation_X = X[validation_inds, :]
    train_y = y[train_inds]
    validation_y = y[validation_inds]

    return train_X, validation_X, train_y, validation_y

In [None]:
class LaplaceMaxLikelihood:
    """Per-class word model with additive smoothing.

    ``train`` sums the feature columns of the class's examples, adds ``alpha``
    to every column and stores the log of the resulting normalized
    distribution. ``loglikelihood`` scores examples as the dot product of
    their feature vector with those log-probabilities.
    """

    def __init__(self, n_dims, alpha=0.1):
        """
        Args:
            n_dims: number of features (vocabulary size).
            alpha: constant added to every column sum before normalizing.
                The default keeps the previously hard-coded value.

        Raises:
            ValueError: if ``alpha`` is negative.
        """
        if alpha < 0:
            raise ValueError('alpha must be non-negative, got {}'.format(alpha))
        self.alpha = alpha
        # Placeholder until train() is called.
        self.log_likelihood = np.zeros(n_dims)

    def train(self, train_data):
        """Estimate the smoothed log word distribution from a 2-D feature array.

        ``train_data`` has one row per example and one column per feature.
        """
        # Smoothing keeps features absent from this class away from log(0).
        word_frequencies = np.sum(train_data, axis=0) + self.alpha
        self.log_likelihood = np.log(word_frequencies / np.sum(word_frequencies))

    def loglikelihood(self, test_data):
        """Return one score per row of ``test_data``."""
        return np.dot(test_data, self.log_likelihood)

In [None]:
class BayesClassifier:
    """Combines per-class likelihood models with class priors."""

    def __init__(self, models_ml, priors):
        """
        Args:
            models_ml: one fitted model per class, each exposing
                ``loglikelihood(test_data)``.
            priors: prior probability of each class, in the same order.

        Raises:
            ValueError: if the two sequences differ in length.
        """
        # Fail fast: printing and carrying on either raised IndexError later
        # (too few priors) or silently ignored the extra priors.
        if len(models_ml) != len(priors):
            raise ValueError('The number of ML models must be equal to the number of priors!')
        self.models_ml = models_ml
        self.priors = priors
        self.n_classes = len(self.models_ml)

    def loglikelihood(self, test_data, eval_by_group=False):
        """Return the unnormalized log-posterior of every example for every class.

        Args:
            test_data: 2-D array with one example per row.
            eval_by_group: accepted for interface compatibility; not used.

        Returns:
            Array of shape ``(n_examples, n_classes)``; entry ``[j, i]`` is the
            log-likelihood of example ``j`` under model ``i`` plus the log of
            prior ``i``.
        """
        log_pred = np.empty((test_data.shape[0], self.n_classes))

        for i, (model, prior) in enumerate(zip(self.models_ml, self.priors)):
            log_pred[:, i] = model.loglikelihood(test_data) + np.log(prior)

        return log_pred

In [78]:
# create a model for each class using its maximum likelihood and train the corresponding data
labels = np.unique(y)

models = []
priors = []

# Per-class pieces of the split, concatenated once after the loop. Collecting
# them in lists replaces the "is the accumulator still empty?" check, which
# overwrote the accumulated arrays (dropping validation rows) whenever the
# first class ended up with an empty training split.
train_X_parts, train_y_parts = [], []
validation_X_parts, validation_y_parts = [], []

for label in labels:
    class_mask = y == label
    X_class = X[class_mask]
    y_class = y[class_mask]

    # Splitting inside each class keeps every class present in both sets.
    train_X_class, validation_X_class, train_y_class, validation_y_class = data_split(X_class, y_class, 0.7)
    train_X_parts.append(train_X_class)
    train_y_parts.append(train_y_class)
    validation_X_parts.append(validation_X_class)
    validation_y_parts.append(validation_y_class)

    class_model = LaplaceMaxLikelihood(X_class.shape[1])  # choose Estimator
    class_model.train(train_X_class)
    models.append(class_model)
    # NOTE: the prior is the class's share of ALL labelled examples
    # (training and validation rows together), as before.
    priors.append(X_class.shape[0] / X.shape[0])

# With no labels at all the four names stay empty lists, as before.
train_X = np.vstack(train_X_parts) if train_X_parts else []
train_y = np.hstack(train_y_parts) if train_y_parts else []
validation_X = np.vstack(validation_X_parts) if validation_X_parts else []
validation_y = np.hstack(validation_y_parts) if validation_y_parts else []

# create our classifier using our class models and priors
classifier = BayesClassifier(models, priors)

In [79]:
def get_accuracy(test_inputs, test_labels):
    """Return the fraction of ``test_inputs`` rows classified as ``test_labels``.

    Uses the module-level ``classifier`` to score every row and the
    module-level ``labels`` array to turn the index of the best-scoring class
    into a label.
    """
    class_scores = classifier.loglikelihood(test_inputs)
    best_class = np.argmax(class_scores, axis=1)
    predicted_labels = labels[best_class]
    return np.mean(predicted_labels == test_labels)


# Accuracy on the rows the class models were fitted on.
train_accuracy = get_accuracy(train_X, train_y)
print("The training accuracy is : {:.1f} % ".format(100 * train_accuracy))
# Accuracy on the held-out rows of the split.
validation_accuracy = get_accuracy(validation_X, validation_y)
print("The validation accuracy is : {:.1f} % ".format(100 * validation_accuracy))


The training accuracy is : 75.9 % 
The validation accuracy is : 55.5 % 


In [80]:
# generate predictions for the test data

# One row of per-class scores for every test document.
log_prob = classifier.loglikelihood(test_data)
# Predicted label = class with the highest score in each row.
classes_pred = labels[np.argmax(log_prob, axis=1)]

In [81]:
# export predictions to csv file

import csv

# newline='' lets the csv module control line endings itself; the explicit
# encoding keeps the file independent of the platform's default locale encoding.
with open('test.csv', 'w', newline='', encoding='utf-8') as myfile:
    wr = csv.writer(myfile)
    wr.writerow(['Id', 'Category'])
    # enumerate yields (row id, predicted label) pairs, with ids starting at 0.
    wr.writerows(enumerate(classes_pred))