In [1]:
#importing the modules we need
from sklearn.naive_bayes import MultinomialNB
from sklearn import preprocessing
import numpy as np

In [2]:
# the normalize function
def normalize_data(train_data, test_data, type1=None):
    scaler = None

    if type1 == 'standard':
        scaler = preprocessing.StandardScaler()
        scaler.fit(train_data)
        
        scaled_train_data = scaler.transform(train_data)
        scaled_test_data = scaler.transform(test_data)
        
        return scaled_train_data, scaled_test_data
    
    elif type1 == 'l1':
        scaler = preprocessing.Normalizer(norm='l1')
        scaler.fit(train_data)
        
        scaled_train_data = scaler.transform(train_data)
        scaled_test_data = scaler.transform(test_data)
        
        return scaled_train_data, scaled_test_data
    
    elif type1 == 'l2':
        scaler = preprocessing.Normalizer(norm='l2')
        scaler.fit(train_data)
        
        scaled_train_data = scaler.transform(train_data)
        scaled_test_data = scaler.transform(test_data)
        
        return scaled_train_data, scaled_test_data

In [3]:
#bag of words class, made during ml lab
class BagOfWords:
    def __init__(self):
        self.vocabulary = dict()
        self.words = []  
        
        
    def build_vocabulary(self, data):
        for sentence in data:
            for word in sentence:
                if word not in self.vocabulary:
                    self.vocabulary[word] = len(self.vocabulary)
                    self.words.append(word)
                    
            
    def get_features(self, data):
        features = np.zeros((len(data), len(self.vocabulary)))
        
        for id_sen, document in enumerate(data):
            for word in document:
                if word in self.vocabulary:
                    features[id_sen, self.vocabulary[word]] += 1
                    
        return features

In [4]:
#getting the data
def load_file(file_name, r_idx = False, label = False):
    f = open(file_name, 'r', encoding='utf8')
    
    indexes = []
    sentences = []
    
    for line in f.readlines():
        indexes.append(int("".join(line[:6])))
        sentences.append(line[7:].strip('\n').split())
    
        
    if r_idx == True:
        return indexes, sentences
    else: 
        return sentences


#train data
train_indexes, train_samples = load_file("data/train_samples.txt", True)
train_labels = load_file("data/train_labels.txt")

#validation data
validation_indexes, validation_samples = load_file("data/validation_samples.txt", True)
validation_indexes2, validation_labels = load_file("data/validation_labels.txt", True)

#test data
test_indexes, test_samples = load_file("data/test_samples.txt", True)

In [5]:
bow = BagOfWords()
bow.build_vocabulary(train_samples)

train_features = bow.get_features(train_samples)
validation_features = bow.get_features(validation_samples)
test_features = bow.get_features(test_samples)

train_features_norm, validation_features_norm = normalize_data(train_features, validation_features, type1='l2')
# train_features_norm, test_features_norm = normalize_data(train_features, test_features, type1='l2')

In [6]:
#train the model
naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(train_features_norm, train_labels)

  return f(*args, **kwargs)


MultinomialNB()

In [44]:
#get the prediction on the validation data
predicted = naive_bayes_model.predict(validation_features_norm)

print(np.mean(predicted == validation_labels))

0.393


In [7]:
print(train_labels)

[['1'], ['1'], ['2'], ['1'], ['2'], ['3'], ['1'], ['2'], ['1'], ['1'], ['3'], ['3'], ['2'], ['1'], ['1'], ['3'], ['3'], ['1'], ['1'], ['1'], ['3'], ['2'], ['2'], ['1'], ['3'], ['3'], ['2'], ['1'], ['1'], ['3'], ['2'], ['2'], ['2'], ['1'], ['2'], ['3'], ['3'], ['1'], ['1'], ['1'], ['1'], ['3'], ['3'], ['3'], ['2'], ['2'], ['3'], ['1'], ['2'], ['3'], ['1'], ['3'], ['1'], ['1'], ['3'], ['3'], ['1'], ['3'], ['3'], ['3'], ['2'], ['1'], ['3'], ['1'], ['3'], ['3'], ['2'], ['1'], ['2'], ['1'], ['1'], ['3'], ['1'], ['1'], ['1'], ['3'], ['3'], ['3'], ['1'], ['3'], ['1'], ['1'], ['2'], ['3'], ['2'], ['3'], ['3'], ['1'], ['1'], ['3'], ['1'], ['1'], ['3'], ['1'], ['3'], ['2'], ['2'], ['3'], ['2'], ['2'], ['2'], ['1'], ['1'], ['1'], ['1'], ['1'], ['3'], ['1'], ['3'], ['1'], ['1'], ['1'], ['1'], ['3'], ['3'], ['3'], ['1'], ['1'], ['2'], ['2'], ['1'], ['3'], ['3'], ['1'], ['2'], ['3'], ['2'], ['3'], ['1'], ['3'], ['3'], ['1'], ['3'], ['1'], ['1'], ['1'], ['3'], ['3'], ['1'], ['2'], ['1'], ['2'], ['1']

In [None]:
#get the prediction on the test data
predicted = naive_bayes_model.predict(test_features_norm)

#and write it in the csv
g = open("data/test_labels.txt", 'w')
g.write("id,label\n")

for idx in range(len(predicted)):
    g.write(f"{test_indexes[idx]},{predicted[idx]}\n")