In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

In [2]:
#getting the data
def load_sample(file_name):
    """Read a samples file and split every line into an id and a sentence.

    Each line is expected to carry an integer id in its first six
    characters, one character that is skipped (index 6), and the sentence
    text from index 7 onwards.

    Parameters
    ----------
    file_name : str or path-like
        Path of the UTF-8 encoded samples file.

    Returns
    -------
    tuple[list[int], list[str]]
        ``(indexes, sentences)`` in file order; both lists have one entry
        per line of the file.

    Raises
    ------
    ValueError
        If the first six characters of a line cannot be parsed as an int
        (this includes empty lines).
    """
    indexes = []
    sentences = []

    # The context manager closes the handle even if a line fails to parse.
    with open(file_name, 'r', encoding='utf8') as f:
        for line in f:
            indexes.append(int(line[:6]))
            # Only the line terminator is removed; other whitespace is kept.
            sentences.append(line[7:].strip('\n'))

    return indexes, sentences


def load_label(file_name):
    """Read a labels file and return the label of every line.

    The label is taken from the single character at index 7 of each line,
    so only one-digit labels are supported by this format.

    Parameters
    ----------
    file_name : str or path-like
        Path of the UTF-8 encoded labels file.

    Returns
    -------
    list[int]
        One integer label per line, in file order.

    Raises
    ------
    IndexError
        If a line has fewer than eight characters.
    ValueError
        If the character at index 7 is not a digit.
    """
    # The context manager closes the handle even if a line fails to parse.
    with open(file_name, 'r', encoding='utf8') as f:
        labels = [int(line[7]) for line in f]

    return labels

In [3]:
# Training split: load_sample returns (ids, sentences); load_label returns
# the integer labels read from the matching labels file.
train_indexes, train_samples = load_sample("data/train_samples.txt")
train_labels = load_label("data/train_labels.txt")

# Validation split, loaded the same way; used below to score candidate models.
validation_indexes, validation_samples = load_sample("data/validation_samples.txt")
validation_labels = load_label("data/validation_labels.txt")

# Test split: only ids and sentences are loaded here (no labels file is read);
# the ids are reused later when the predictions are written out.
test_indexes, test_samples = load_sample("data/test_samples.txt")

In [23]:
# Bag-of-words features: every run of one or more word characters is a token.
count_vector = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
# NOTE(review): the vocabulary is fit on train + test sentences, so validation-only
# tokens are ignored when the validation split is transformed -- confirm this is intended.
vocabulary = train_samples + test_samples
count_vector.fit(vocabulary)

# Sparse count matrices, all produced by the same fitted vectorizer.
train_samples_count = count_vector.transform(train_samples)
validation_samples_count = count_vector.transform(validation_samples)
test_samples_count = count_vector.transform(test_samples)

# Hyper-parameter grid: every (learning rate, number of estimators) pair is tried.
learning_rate = [0.1]
estimators = [50, 100, 150]

# NOTE(review): no random_state is passed, so repeated runs may not reproduce
# the printed scores exactly -- consider pinning a seed.
for lr in learning_rate:
    for est in estimators:
        # A fresh model per configuration; the last one fitted stays bound to
        # `gradient_model` after the loop.
        gradient_model = GradientBoostingClassifier(n_estimators=est, learning_rate=lr)
        gradient_model.fit(train_samples_count, train_labels)

        predicted = gradient_model.predict(validation_samples_count)

        # accuracy_score is documented as (y_true, y_pred).
        print(lr, est, accuracy_score(validation_labels, predicted))

0.1 50 0.6042
0.1 100 0.6352
0.1 150 0.6476


In [7]:
# Fresh bag-of-words vectorizer; a token is any run of one or more word characters.
count_vector = CountVectorizer(token_pattern=r'\w{1,}', analyzer='word')

# Corpus for the vocabulary: the train, validation and test sentences, in that order.
vocabulary = [*train_samples, *validation_samples, *test_samples]

# Peek at the first entry of the corpus.
# NOTE(review): the recorded output of this cell shows a list of tokens, whereas
# load_sample as defined in this notebook yields plain strings -- re-run to confirm.
print(vocabulary[0])

['Uq$%y', 'gkuKDuZ*KmH', 'quf&', 'qKf', 'Du*&', 'Du*ZX;', 'Kf', 'DY*fHm', 'ZuK&', 'XZ*KTYDX&kK', 'K;YuE', '*u', 'AKmH', 'KEHqqH&', '*u', 'ZK', ',fKq', 'K;', 'qXZYZ*Huh', 'qK;*HDE', 'K;', 'DKZf*AXD', 'p', 'Z**m', 'Yuh', 'qDuK*YT', 'qK;', 'KEHTY’f', 'U', ',ènSè', 'Hf', 'q*H;', 'KDmYmDuq', 'K;', 'Kf#Yqq*&', 'qY*sKZu*Z', 'qséȘ;', 'ZqK', 'fI', 'gkuKDKE*H;', 'Z*KTKDCYfuAYZDH&', 'qsYZAH', 'q*HAf*m', 'qK;', 'ZK', ',qKuhYk*Z', 'NHE', 'qK;', ',Z*HffYu*#', 'KuhYDusfuq', 'K;YAH’;', 'qKuhqHm', 'qK;', 'Kmu*DZ', 'p', '**', ',ènSè', '°', '85', 'KD;*YKZZH', 'Z*KmuK&', 'Yuh', 'qKDuZHDX&TKZ', 'qK;', 'quf&', '*n', 'gqKTRDZkK', 'q**YZY;**A', 'qK;', 'ZY*uXD', 'ènSè', 'Kf', ',»', 'ZD*T', 'Hf', 'K;', 'KDY*ZYDDKZ', '«', 'XqYZ&H#', 'H', 'ènSè', 'Yuh', 'ènSè', 'ènSè', 'DuKZHD*f&kK', 'ènSè', 'DH&', '0391', 'qKX**H', 'qKf', 'q*H;', 'ènSè']


In [None]:
# Refit the vectorizer on the corpus held in `vocabulary`.
count_vector.fit(vocabulary)

# The final model is trained on train + validation together, so the label list
# must be extended in the same order as the samples to keep the two aligned.
all_train_samples = train_samples + validation_samples
all_train_labels = train_labels + validation_labels
all_train_samples_count = count_vector.transform(all_train_samples)

# The refit changes the feature columns, so the test matrix must be rebuilt with
# the same vectorizer state the final model is trained on.
test_samples_count = count_vector.transform(test_samples)

# NOTE(review): learning_rate=0.5 was not among the values scored on the
# validation split above (only 0.1 was) -- confirm this choice.
gradient_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.5)
gradient_model.fit(all_train_samples_count, all_train_labels)

In [None]:
#get the prediction on the test data
predicted = gradient_model.predict(test_samples_count)

#and write it in the csv
# The context manager flushes and closes the file, so the last rows cannot be
# lost in a buffer that never gets written out.
with open("data/test_labels.txt", 'w', encoding='utf8') as g:
    g.write("id,label\n")

    # One "id,label" row per test sample, in the order the samples were loaded.
    for sample_id, label in zip(test_indexes, predicted):
        g.write(f"{sample_id},{label}\n")