# Sentiment Analysis with FastText

In [None]:
import fasttext
import numpy as np
import pandas as pd
import random

In [None]:
# Preparing dataset
def _load_labelled(path, label):
    """Read `path` and return its lines prefixed with the fastText tag `label`.

    The file is decoded as UTF-8, the same encoding used when the training
    file is written. Every returned string ends with a newline, so a final
    line that lacks one cannot fuse with the following example once the
    documents are shuffled and written out back to back.
    """
    prefix = label + ' '
    with open(path, encoding='utf-8') as f:
        return [
            prefix + line if line.endswith('\n') else prefix + line + '\n'
            for line in f
        ]


# Lines of the positive file are tagged __label__1, lines of the negative file __label__-1
documents1 = _load_labelled('train_pos_full.txt', '__label__1')
documents2 = _load_labelled('train_neg_full.txt', '__label__-1')

In [None]:
# Pool both labelled lists into one new list, then shuffle it in place
documents = [*documents1, *documents2]
random.shuffle(documents)

In [None]:
# Training data
# The context manager closes (and flushes) the file even if a write fails.
# Each element of `documents` is written as-is, with no separator added.
with open("train_full.txt", "w", encoding='utf-8') as train:
    train.writelines(documents)

In [None]:
# Supervised classifier trained on train_full.txt with hand-picked settings.
# bucket, dim and loss are further parameters that can be adjusted.
model = fasttext.train_supervised(
    input='train_full.txt',
    lr=0.1,
    epoch=2,
    wordNgrams=3,
)

In [None]:
# Testing data
# Keep only the text after the first comma of each line (commas inside the
# text are preserved; a line without any comma yields an empty string).
# Decoded as UTF-8 to match the encoding used for the training file.
with open("test_data.txt", encoding='utf-8') as f:
    test = [line.partition(',')[2] for line in f]

In [None]:
# Generate predictions
# Strip only the line terminator before predicting: slicing off a fixed two
# characters would also drop the last real character of every line.
pred = [model.predict(text.rstrip('\r\n')) for text in test]

In [None]:
# Obtain classification labels from the prediction results
# The first element of each prediction is compared directly against the
# one-element tuple ('__label__1',) instead of against its string repr;
# anything else maps to -1.
lab_pred = [1 if p[0] == ('__label__1',) else -1 for p in pred]

In [None]:
# Create csv submission file
# Ids are 1-based and follow the order of the predictions
idx = list(range(1, len(lab_pred) + 1))
dict_ = {"Id": idx, "Prediction": lab_pred}
pred_df = pd.DataFrame(dict_)
pred_df.to_csv("pred1.csv", index=False)

We obtain an accuracy of 0.853 and an F1 score of 0.856 on AIcrowd using this tuned model.

In [None]:
# Dump the lines held in `test` to test_mod.txt, unchanged and with no separator added.
# The context manager closes the file even if a write fails; the file is
# written as UTF-8, the same encoding used for the training file.
with open("test_mod.txt", "w", encoding='utf-8') as test1:
    test1.writelines(test)

In [None]:
# Using fastText's automatic hyperparameter optimization
# NOTE(review): test_mod.txt is written from `test`, whose lines are built
# without any __label__ prefix - confirm that the autotune validation file
# is meant to be unlabelled test data rather than a labelled hold-out set.
model_auto = fasttext.train_supervised(
    input='train_full.txt',
    autotuneValidationFile='test_mod.txt',
    autotuneDuration=600,
)

In [None]:
# Predictions of the autotuned model, one per test line.
# Strip only the line terminator before predicting: slicing off a fixed two
# characters would also drop the last real character of every line.
pred_auto = [model_auto.predict(text.rstrip('\r\n')) for text in test]

In [None]:
# Map each prediction to 1 or -1.
# The first element of each prediction is compared directly against the
# one-element tuple ('__label__1',) instead of against its string repr;
# anything else maps to -1.
lab_pred_auto = [1 if p[0] == ('__label__1',) else -1 for p in pred_auto]

In [None]:
# Write the autotuned model's labels to pred2.csv; ids are 1-based and
# follow the order of the predictions
idx = list(range(1, len(lab_pred_auto) + 1))
dict_ = {"Id": idx, "Prediction": lab_pred_auto}
pred_df = pd.DataFrame(dict_)
pred_df.to_csv("pred2.csv", index=False)

The automatic model gives us an accuracy of 0.816 and an F1 score of 0.817 on AIcrowd.