In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

In [7]:
# Bag-of-words feature extractor, written during an ML lab
class BagOfWords:
    """Count-based featurizer over a vocabulary learned from tokenised documents.

    A "document" is any iterable of hashable tokens; a dataset is a sequence
    of such documents.
    """

    def __init__(self):
        # token -> column index, assigned in order of first appearance
        self.vocabulary = {}
        # tokens in column order (the inverse lookup of ``vocabulary``)
        self.words = []

    def build_vocabulary(self, data):
        """Register every token of ``data`` that is not yet in the vocabulary.

        Known tokens keep their existing column index, so the method can be
        called several times to extend the vocabulary incrementally.
        """
        for document in data:
            for token in document:
                if token in self.vocabulary:
                    continue
                self.vocabulary[token] = len(self.vocabulary)
                self.words.append(token)

    def get_features(self, data):
        """Return a dense ``(len(data), len(vocabulary))`` array of token counts.

        Row ``i`` holds, for each vocabulary column, how many times that token
        occurs in ``data[i]``. Tokens missing from the vocabulary are ignored.
        """
        n_docs = len(data)
        n_terms = len(self.vocabulary)
        counts = np.zeros((n_docs, n_terms))

        for row, document in enumerate(data):
            for token in document:
                column = self.vocabulary.get(token)
                if column is not None:
                    counts[row, column] += 1

        return counts

In [None]:
#getting the data
def load_sample(file_name):
    """Read a samples file into parallel lists of ids and tokenised sentences.

    Every non-blank line is parsed positionally: characters 0-5 are the
    integer id, character 6 is skipped, and the text from character 7 onward
    is split on whitespace into tokens.

    Args:
        file_name: Path to a UTF-8 encoded text file.

    Returns:
        A tuple ``(indexes, sentences)`` where ``indexes[i]`` is the integer id
        and ``sentences[i]`` the list of tokens of the i-th non-blank line.

    Raises:
        ValueError: If the first six characters of a non-blank line do not
            parse as an integer.
    """
    indexes = []
    sentences = []

    # The context manager closes the handle even when parsing fails part-way.
    with open(file_name, 'r', encoding='utf8') as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at end of file) carry no sample.
            if not line.strip():
                continue
            indexes.append(int(line[:6]))
            # split() without arguments already discards the trailing newline.
            sentences.append(line[7:].split())

    return indexes, sentences


def load_label(file_name):
    """Read a labels file and return the integer label of every non-blank line.

    Lines are parsed positionally: the label is the single character at
    index 7 (the preceding characters are ignored here).

    Args:
        file_name: Path to a UTF-8 encoded text file.

    Returns:
        A list with one ``int`` label per non-blank line, in file order.

    Raises:
        IndexError: If a non-blank line is shorter than eight characters.
        ValueError: If the character at index 7 is not a digit.
    """
    labels = []

    # The context manager closes the handle even when parsing fails part-way.
    with open(file_name, 'r', encoding='utf8') as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at end of file) carry no label.
            if not line.strip():
                continue
            # Only one character is read, so labels are limited to a single digit.
            labels.append(int(line[7]))

    return labels

In [None]:
#train data
train_indexes, train_samples = load_sample("data/train_samples.txt")
train_labels = load_label("data/train_labels.txt")
# Fail fast if the two files do not describe the same number of examples,
# instead of surfacing the mismatch later during model fitting.
assert len(train_samples) == len(train_labels), (
    f"train: {len(train_samples)} samples but {len(train_labels)} labels"
)

#validation data
validation_indexes, validation_samples = load_sample("data/validation_samples.txt")
validation_labels = load_label("data/validation_labels.txt")
assert len(validation_samples) == len(validation_labels), (
    f"validation: {len(validation_samples)} samples but {len(validation_labels)} labels"
)

#test data (no labels available)
test_indexes, test_samples = load_sample("data/test_samples.txt")

In [23]:
# Validation sweep: gradient boosting on bag-of-words counts.

def _as_text(documents):
    """Return every document as one string, joining token lists with spaces.

    Documents that already are strings are passed through unchanged.
    """
    return [doc if isinstance(doc, str) else " ".join(doc) for doc in documents]


#initialize the count vector for our words
count_vector = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
# NOTE(review): the vocabulary is built from train + test text (no labels are
# involved); kept as in the original cell.
vocabulary = train_samples + test_samples
# load_sample() yields token lists, but the 'word' analyzer lower-cases and
# tokenises raw strings, so passing the lists directly raises AttributeError.
count_vector.fit(_as_text(vocabulary))

train_samples_count = count_vector.transform(_as_text(train_samples))
validation_samples_count = count_vector.transform(_as_text(validation_samples))
test_samples_count = count_vector.transform(_as_text(test_samples))

learning_rate = [0.1]
estimators = [50, 100, 150]

for lr in learning_rate:
    for est in estimators:
        # Fixed seed so that re-running the sweep reproduces the printed scores.
        gradient_model = GradientBoostingClassifier(
            n_estimators=est, learning_rate=lr, random_state=42
        )
        gradient_model.fit(train_samples_count, train_labels)

        predicted = gradient_model.predict(validation_samples_count)

        # accuracy_score expects (y_true, y_pred); the value is the same either
        # way, but the conventional order avoids surprises with other metrics.
        print(lr, est, accuracy_score(validation_labels, predicted))

0.1 50 0.6042
0.1 100 0.6352
0.1 150 0.6476


In [21]:
# Fit the final model on the training counts and write the test predictions.
# NOTE(review): learning_rate=0.5 is not among the values scored in the
# validation sweep visible in this notebook (only 0.1) — confirm the choice.
gradient_model = GradientBoostingClassifier(
    n_estimators=100, learning_rate=0.5, random_state=42  # seed for reproducibility
)
gradient_model.fit(train_samples_count, train_labels)

#get the prediction on the test data
predicted = gradient_model.predict(test_samples_count)

#and write it in the csv
# The context manager flushes and closes the file, so every row reaches disk.
with open("data/test_labels.txt", 'w', encoding='utf8') as g:
    g.write("id,label\n")

    for test_id, label in zip(test_indexes, predicted):
        g.write(f"{test_id},{label}\n")