# Sentiment Analysis with FastText

In [1]:
import fasttext
import numpy as np
import pandas as pd
import random

In [2]:
# Preparing dataset
with open('train_pos_full.txt') as f:
    documents1 = []
    for line in f:
        new_line = '__label__1 ' + line
        documents1.append(new_line)

with open('train_neg_full.txt') as f:
    documents2 = []
    for line in f:
        new_line = '__label__-1 ' + line
        documents2.append(new_line)

In [8]:
documents = documents1 + documents2
random.shuffle(documents)

In [9]:
# Training data
train = open("train_full.txt", "w", encoding='utf-8')
for element in documents:
    train.write(element)
train.close()

In [29]:
# Adjust parameters
# Can also adjust bucket, dim, and loss
model = fasttext.train_supervised(input = 'train_full.txt', lr=0.1, epoch=2, wordNgrams=3)

In [30]:
# Testing data
test = []
with open("test_data.txt") as f:
    for line in f:
        line = ','.join(line.split(',')[1:])
        test.append(line)

In [31]:
# Generate predictions
pred = []
for i in test:
    pred.append(model.predict(i[:-2]))
pred

[(('__label__-1',), array([0.9787994])),
 (('__label__1',), array([0.51061296])),
 (('__label__-1',), array([0.65210646])),
 (('__label__1',), array([0.97208256])),
 (('__label__-1',), array([0.91530931])),
 (('__label__-1',), array([0.59477878])),
 (('__label__-1',), array([0.87885576])),
 (('__label__1',), array([0.95375609])),
 (('__label__1',), array([0.74091399])),
 (('__label__1',), array([0.71292895])),
 (('__label__1',), array([0.8516463])),
 (('__label__1',), array([0.92654848])),
 (('__label__1',), array([0.64193308])),
 (('__label__-1',), array([0.63597059])),
 (('__label__1',), array([0.86926132])),
 (('__label__-1',), array([0.6349991])),
 (('__label__-1',), array([0.99997604])),
 (('__label__1',), array([0.99859011])),
 (('__label__1',), array([0.92392039])),
 (('__label__-1',), array([0.99965465])),
 (('__label__-1',), array([0.77609617])),
 (('__label__-1',), array([0.77716047])),
 (('__label__1',), array([0.85848606])),
 (('__label__-1',), array([0.99712598])),
 (('__l

In [32]:
# Obtain classification labels from the prediction results
lab_pred = []
for i in pred:
    if str(i[0]) == "('__label__1',)":
        lab_pred.append(1)
    else:
        lab_pred.append(-1)
lab_pred

[-1,
 1,
 -1,
 1,
 -1,
 -1,
 -1,
 1,
 1,
 1,
 1,
 1,
 1,
 -1,
 1,
 -1,
 -1,
 1,
 1,
 -1,
 -1,
 -1,
 1,
 -1,
 1,
 -1,
 1,
 1,
 -1,
 1,
 1,
 -1,
 -1,
 1,
 1,
 1,
 1,
 -1,
 -1,
 1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 1,
 -1,
 1,
 -1,
 -1,
 1,
 -1,
 1,
 1,
 1,
 -1,
 -1,
 1,
 1,
 1,
 -1,
 -1,
 1,
 1,
 1,
 -1,
 -1,
 -1,
 1,
 1,
 -1,
 1,
 -1,
 1,
 -1,
 1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 1,
 1,
 1,
 -1,
 -1,
 -1,
 -1,
 1,
 1,
 -1,
 1,
 -1,
 1,
 1,
 -1,
 -1,
 -1,
 -1,
 1,
 1,
 1,
 -1,
 1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 1,
 -1,
 1,
 -1,
 1,
 -1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 -1,
 1,
 1,
 1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 1,
 1,
 1,
 -1,
 -1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 -1,
 -1,
 -1,
 1,
 1,
 -1,
 1,
 -1,
 -1,
 1,
 -1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 -1,
 1,
 -1,
 1,
 1,
 -1,
 1,
 -1,
 -1,
 -1,
 -1,
 1,
 1,
 -1,
 1,
 -1,
 -1,
 1,
 -1,
 -1,
 1,
 -1,
 -1,
 1,
 1,
 -1,
 1,
 1,
 1,
 -1,

In [33]:
# Create csv submission file
idx = [i for i in range(1,len(lab_pred)+1)]
dict_ = {
    "Id" : idx,
    "Prediction" : lab_pred
}
pred_df = pd.DataFrame(dict_)
pred_df.to_csv("pred1.csv",index = False)

We obtain accuracy of 0.853 and F1 score 0.856 on AIcrowd using this tuned model.

In [23]:
test1 = open("test_mod.txt", "w")
for element in test:
    test1.write(element)
test1.close()

In [24]:
# Using fastText's automatic hyperparameter optimization
model_auto = fasttext.train_supervised(input = 'train_full.txt', autotuneValidationFile='test_mod.txt', autotuneDuration=600)

In [26]:
pred_auto = []
for i in test:
    pred_auto.append(model_auto.predict(i[:-2]))
pred_auto

[(('__label__-1',), array([0.9798246])),
 (('__label__-1',), array([0.6071834])),
 (('__label__-1',), array([0.93368298])),
 (('__label__1',), array([0.77279919])),
 (('__label__1',), array([0.58304632])),
 (('__label__-1',), array([0.78987962])),
 (('__label__-1',), array([0.99828756])),
 (('__label__1',), array([0.96913463])),
 (('__label__1',), array([0.67897999])),
 (('__label__1',), array([0.88558227])),
 (('__label__1',), array([0.88849115])),
 (('__label__1',), array([0.84312457])),
 (('__label__-1',), array([0.51840603])),
 (('__label__-1',), array([0.6484552])),
 (('__label__1',), array([0.8478328])),
 (('__label__1',), array([0.65809131])),
 (('__label__-1',), array([0.99988461])),
 (('__label__1',), array([0.88377732])),
 (('__label__1',), array([0.87248546])),
 (('__label__-1',), array([0.99539524])),
 (('__label__-1',), array([0.78095716])),
 (('__label__-1',), array([0.79664242])),
 (('__label__1',), array([0.85861319])),
 (('__label__-1',), array([0.99865109])),
 (('__la

In [27]:
lab_pred_auto = []
for i in pred_auto:
    if str(i[0]) == "('__label__1',)":
        lab_pred_auto.append(1)
    else:
        lab_pred_auto.append(-1)
lab_pred_auto

[-1,
 -1,
 -1,
 1,
 1,
 -1,
 -1,
 1,
 1,
 1,
 1,
 1,
 -1,
 -1,
 1,
 1,
 -1,
 1,
 1,
 -1,
 -1,
 -1,
 1,
 -1,
 1,
 -1,
 1,
 1,
 -1,
 -1,
 1,
 -1,
 1,
 1,
 1,
 1,
 1,
 -1,
 -1,
 1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 1,
 -1,
 1,
 1,
 -1,
 1,
 -1,
 1,
 1,
 1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 1,
 1,
 1,
 -1,
 1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 1,
 1,
 1,
 -1,
 1,
 -1,
 -1,
 1,
 1,
 -1,
 1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 1,
 -1,
 1,
 -1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 -1,
 1,
 1,
 1,
 -1,
 -1,
 -1,
 1,
 -1,
 1,
 1,
 -1,
 1,
 -1,
 1,
 1,
 1,
 -1,
 -1,
 1,
 1,
 1,
 1,
 1,
 -1,
 1,
 -1,
 -1,
 -1,
 1,
 1,
 -1,
 1,
 1,
 -1,
 1,
 -1,
 1,
 -1,
 1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 1,
 -1,
 1,
 -1,
 1,
 1,
 -1,
 1,
 -1,
 -1,
 -1,
 -1,
 1,
 1,
 -1,
 1,
 -1,
 -1,
 1,
 -1,
 -1,
 1,
 -1,
 -1,
 1,
 1,
 1,
 1,
 1,
 -1,
 

In [28]:
idx = [i for i in range(1,len(lab_pred_auto)+1)]
dict_ = {
    "Id" : idx,
    "Prediction" : lab_pred_auto
}
pred_df = pd.DataFrame(dict_)
pred_df.to_csv("pred2.csv",index = False)

The automatic model gives us accuracy 0.816 and F1 score 0.817 on AIcrowd.