In [44]:
#importing the modules we need
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import BernoulliNB
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from scipy.sparse import csr_matrix
import numpy as np

In [45]:
#bag of words class, made during ml lab
class BagOfWords:
    """Minimal bag-of-words vectoriser over pre-tokenised documents.

    A document is any iterable of hashable tokens (here: lists of strings).
    Columns are assigned in the order tokens are first seen by
    ``build_vocabulary``.
    """

    def __init__(self):
        # token -> column index
        self.vocabulary = dict()
        # column index -> token (inverse lookup of ``vocabulary``)
        self.words = []

    def build_vocabulary(self, data):
        """Register every token of ``data`` that is not yet in the vocabulary.

        Args:
            data: iterable of documents, each an iterable of tokens.

        Calling this again only appends new tokens; indices already
        assigned are never changed.
        """
        for sentence in data:
            for word in sentence:
                if word not in self.vocabulary:
                    self.vocabulary[word] = len(self.vocabulary)
                    self.words.append(word)

    def get_features(self, data):
        """Count vocabulary tokens per document.

        Args:
            data: iterable of documents, each an iterable of tokens.

        Returns:
            ``scipy.sparse.csr_matrix`` of dtype float64 with one row per
            document and one column per vocabulary entry; cell ``(i, j)`` is
            the number of times token ``j`` occurs in document ``i``. Tokens
            that are not in the vocabulary are ignored.
        """
        # Build the CSR arrays directly instead of filling a dense
        # (n_documents x vocabulary_size) array first: the dense intermediate
        # is almost entirely zeros and its size grows with the vocabulary.
        counts = []
        columns = []
        row_starts = [0]

        for document in data:
            row_counts = {}
            for word in document:
                column = self.vocabulary.get(word)
                if column is not None:
                    row_counts[column] = row_counts.get(column, 0) + 1

            # Sorted column order keeps the matrix in canonical CSR form.
            for column in sorted(row_counts):
                columns.append(column)
                counts.append(row_counts[column])
            row_starts.append(len(columns))

        return csr_matrix(
            (
                np.asarray(counts, dtype=np.float64),
                np.asarray(columns, dtype=np.int64),
                np.asarray(row_starts, dtype=np.int64),
            ),
            shape=(len(row_starts) - 1, len(self.vocabulary)),
        )

In [46]:
#getting the data
def load_sample(file_name):
    """Read a samples file into parallel lists of ids and token lists.

    Each non-blank line is parsed by fixed position: characters 0-5 are the
    integer sample id, character 6 is skipped (presumably a separator —
    confirm against the data files), and the rest is split on whitespace
    into tokens.

    Args:
        file_name: path to a UTF-8 text file.

    Returns:
        ``(indexes, sentences)`` where ``indexes[i]`` is the int id of the
        i-th sample and ``sentences[i]`` is its list of tokens.

    Raises:
        ValueError: if the first six characters of a non-blank line are not
            an integer.
    """
    indexes = []
    sentences = []

    # ``with`` guarantees the handle is closed, even if parsing fails.
    with open(file_name, 'r', encoding='utf8') as f:
        for line in f:
            # Skip whitespace-only lines (e.g. a trailing newline at EOF)
            # rather than failing in int().
            if not line.strip():
                continue
            indexes.append(int(line[:6]))
            # split() without arguments already drops the trailing newline.
            sentences.append(line[7:].split())

    return indexes, sentences


def load_label(file_name):
    """Read a labels file into a list of integer labels.

    Each non-blank line is parsed by fixed position: only the single
    character at index 7 is read and converted to ``int``, so labels are
    assumed to be one digit — TODO confirm against the data files.

    Args:
        file_name: path to a UTF-8 text file.

    Returns:
        List of int labels, one per non-blank line, in file order.

    Raises:
        ValueError: if the character at index 7 is not a digit.
        IndexError: if a non-blank line has fewer than eight characters.
    """
    labels = []

    # ``with`` guarantees the handle is closed, even if parsing fails.
    with open(file_name, 'r', encoding='utf8') as f:
        for line in f:
            # Skip whitespace-only lines (e.g. a trailing newline at EOF).
            if not line.strip():
                continue
            labels.append(int(line[7]))

    return labels

In [47]:
# Training split: sample ids with their token lists, plus one label per sample.
train_indexes, train_samples = load_sample(file_name="data/train_samples.txt")
train_labels = load_label(file_name="data/train_labels.txt")

# Validation split, same layout as the training split.
validation_indexes, validation_samples = load_sample(file_name="data/validation_samples.txt")
validation_labels = load_label(file_name="data/validation_labels.txt")

# Test split: samples only; its labels are what the notebook predicts.
test_indexes, test_samples = load_sample(file_name="data/test_samples.txt")

In [48]:
# Fit the vocabulary on the train + validation text and vectorise every split
# with it.
# NOTE(review): the validation rows become part of the training matrix here,
# so any score later computed on validation_features is measured on data the
# models were fit on, not on held-out data.
all_labelled_samples = train_samples + validation_samples

bow = BagOfWords()
bow.build_vocabulary(all_labelled_samples)

train_features = bow.get_features(all_labelled_samples)
# Extend the labels only once. An unconditional
# `train_labels = train_labels + validation_labels` appends the validation
# labels again each time this cell is re-run, leaving more labels than
# feature rows and breaking every later fit().
if len(train_labels) == len(train_samples):
    train_labels = train_labels + validation_labels
validation_features = bow.get_features(validation_samples)
test_features = bow.get_features(test_samples)

In [49]:
# Multinomial naive Bayes on the bag-of-words counts (alpha = smoothing).
multinomial_model = MultinomialNB(alpha=1.045)
multinomial_model.fit(X=train_features, y=train_labels)

In [50]:
# Test-set predictions, kept for the weighted vote later in the notebook.
predicted_multinomial_nb = multinomial_model.predict(test_features)

# Share of validation samples predicted correctly; reused later as this
# model's vote weight.
# NOTE(review): the model was fit on train + validation rows, so this is not
# a held-out estimate.
predicted = multinomial_model.predict(validation_features)
validation_hits = predicted == validation_labels
percentege_multinomial_nb = np.mean(validation_hits)
print(percentege_multinomial_nb)

0.9286


In [51]:
# Complement naive Bayes on the same count features (alpha = smoothing).
complement_model = ComplementNB(alpha=1.635)
complement_model.fit(X=train_features, y=train_labels)

In [52]:
# Test-set predictions, kept for the weighted vote later in the notebook.
predicted_complement_nb = complement_model.predict(test_features)

# Share of validation samples predicted correctly; reused later as this
# model's vote weight.
# NOTE(review): the model was fit on train + validation rows, so this is not
# a held-out estimate.
predicted = complement_model.predict(validation_features)
validation_hits = predicted == validation_labels
percentege_complement_nb = np.mean(validation_hits)
print(percentege_complement_nb)

0.9308


In [53]:
# Bernoulli naive Bayes on the same count features (alpha = smoothing).
bernoulli_model = BernoulliNB(alpha=0.86)
bernoulli_model.fit(X=train_features, y=train_labels)

In [54]:
# Test-set predictions, kept for the weighted vote later in the notebook.
predicted_bernoulli_nb = bernoulli_model.predict(test_features)

# Share of validation samples predicted correctly; reused later as this
# model's vote weight.
# NOTE(review): the model was fit on train + validation rows, so this is not
# a held-out estimate.
predicted = bernoulli_model.predict(validation_features)
validation_hits = predicted == validation_labels
percentege_bernoulli_nb = np.mean(validation_hits)
print(percentege_bernoulli_nb)

0.9174


In [55]:
# Gradient-boosted trees on the same bag-of-words features.
xgboost_model = XGBClassifier(learning_rate=0.15, gamma=0.5, reg_alpha=0.5, max_depth=10, subsample=0.8, colsample_bytree=0.75, n_estimators=475)

# The labels are shifted down by one for XGBoost (presumably because it
# expects class ids starting at 0 — see the XGBClassifier docs).
# Build a shifted COPY: `train_labels_xgboost = train_labels` only aliases the
# list, so decrementing it in place also corrupts `train_labels` for every
# other model and shifts the labels again each time the cell is re-run.
train_labels_xgboost = [label - 1 for label in train_labels]

xgboost_model.fit(train_features, train_labels_xgboost)

In [56]:
# The XGBoost model was trained on labels shifted down by one, so add one to
# its output to return to the original label values.
predicted_xgboost = xgboost_model.predict(test_features) + 1

# Share of validation samples predicted correctly; reused later as this
# model's vote weight.
# NOTE(review): the model was fit on train + validation rows, so this is not
# a held-out estimate.
predicted = xgboost_model.predict(validation_features) + 1

percentege_xgboost = np.mean(predicted == validation_labels)
print(percentege_xgboost)

0.9826


In [57]:
# Shift the XGBoost label list back up by one, in place (slice assignment
# keeps the same list object, so any alias of it sees the change too).
# Not idempotent: run once per downward shift.
train_labels_xgboost[:] = [label + 1 for label in train_labels_xgboost]

In [58]:
# Weighted vote over the four models: each model adds its validation accuracy
# to the score of the label it predicted, and the highest-scoring label wins
# (np.argmax picks the lowest label on ties).
weighted_votes = [
    (predicted_multinomial_nb, percentege_multinomial_nb),
    (predicted_complement_nb, percentege_complement_nb),
    (predicted_bernoulli_nb, percentege_bernoulli_nb),
    (predicted_xgboost, percentege_xgboost),
]

predicted = []

for sample_pos in range(len(predicted_multinomial_nb)):
    # One score slot per label value; assumes labels lie in 0..3 — TODO confirm.
    label_scores = np.zeros(4)

    for model_predictions, weight in weighted_votes:
        label_scores[model_predictions[sample_pos]] += weight

    predicted.append(np.argmax(label_scores))

In [61]:
# Quick sanity check: show the first ten ensemble predictions.
print(predicted[:10])

[2, 2, 1, 1, 2, 1, 1, 2, 2, 2]


In [60]:
# Write the submission file: a header line, then one "id,label" row per test
# sample. The `with` block closes (and therefore flushes) the file; with a
# bare open() the last buffered rows may never reach disk.
with open("data/test_labels.txt", 'w') as g:
    g.write("id,label\n")

    for sample_id, label in zip(test_indexes, predicted):
        g.write(f"{sample_id},{label}\n")