In [1]:
# import the modules required

import bz2
import fasttext
import csv
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score 

In [2]:
# reading a bz2. fast-text txt file

data = bz2.BZ2File("train.ft.txt.bz2")
data = [a.decode('utf-8') for a in data]

In [3]:
# 3600000 rows of labelled data 
# __label__1 : negative sentiment
# __label__2 : positive sentiment

print(len(data))

3600000


In [4]:
# sentences/reviews in first five rows-- 
data[0:5]

['__label__2 Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^\n',
 "__label__2 The best soundtrack ever to anything.: I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd write a review to disagree a bit. This in my opinino is Yasunori Mitsuda's ultimate masterpiece. The music is timeless and I'm been listening to it for years now and its beauty simply refuses to fade.The price tag on this is pretty staggering I must say, but if you are going to buy any cd for this much money, this is the only one that I feel would be worth every penny.\n",
 '__label__2 Amazing!: This s

In [5]:
data[1800005:1800010]

['__label__2 Exceptionally well-explained copywriting training: Mr Lewis has again written a classic - this book, especially starting with Ch 4 "How to Be a First-Rank Wordsmith", reveals the craft of writing compelling copy with remarkable clarity.I underestimated this book at first; wish I\'d read it a couple of years ago ... along w/Schwartz\' "Breakthrough Advertising" and Lewis\' other copywriting books, this is an absolute must-get. It really does dig deep into comparing the difference between credible, compelling copy and lukewarm copy that doesn\'t sell.A must-get for everyone who write ads, including site salesletters - it\'s a scorcher!\n',
 '__label__2 fascinating: Fennesz is my new favorite music discovery. Whenever I go see a rock show and there is a guy up there with a guitar and a huge cabinet amp full of processors and gadgets, I get excited. Fennesz is that guy but with a laptop. Ambient music gets a good seeing to with an element of live instrumentation. Fennesz, alon

In [6]:
data[3599999]

"__label__2 Makes My Blood Run Red-White-And-Blue: I agree that every American should read this book -- and everybody else for that matter. I don't agree that it's scholarly. Rather, it's a joy to read -- easy to understand even for a person with two master's degrees! Between McElroy's chapter on How American Culture was Formed and Ken Burns' Lewis & Clark, I don't know which makes my blood run red-white-and-bluer. And as a child of the anti-establishment `60s, it's done a lot toward helping me understand why we Americans do what we do. It's the best history book I've ever read, the best history course I've ever taken or taught. I'm buying it for my home library for my grandchildren to use as a resource. We're also using it as a resource for a book on urban planning.\n"

In [7]:
# check for training dataset imbalance

co = 0
ci = 0
for x in data:
    x = data[0]
    if x[9] == '2':        
        co += 1
    else: 
        ci += 1
        
print(co)
print(ci)

3600000
0


In [8]:
# once model is created comment out the code till loading model

In [9]:
# #data prep

data = pd.DataFrame(data)
data.to_csv("train.txt", index=False, sep=' ', header=False, quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")

In [10]:
#create model

# model = fasttext.train_supervised('train.txt',label_prefix='__label__', thread=4, epoch=10)
# print(model.labels, 'target labels for predicting')

In [11]:
# saving trained model

# model.save_model("model_filename.vec")

In [12]:
#loading saved model
model = fasttext.load_model("model_filename.vec")



In [13]:
# load test file
test_file = bz2.BZ2File("test.ft.txt.bz2")
test_file = test_file.readlines()
test_file = [a.decode('utf-8') for a in test_file]
print(len(test_file), ' records in test file')

400000  records in test file


In [23]:

# _________________________________TODO_________________________________DONE
# instead of running for loop 3 times over same data use for loop once 

# hide labels before trying to predict
# labels are replaced with ''

new_label = [w.replace('__label__2','') for w in test_file]
new_label = [w.replace('__label__1','') for w in new_label]
new_label = [w.replace('\n','') for w in new_label]

# new_label = test_file
# for w in new_label:
#     if w == '__label__1':
#         w.replace('__label__1','')
#     if w == '__label__2':
#         w.replace('__label_2','')
#     if w == '\n':
#         w.replace('\n','')

# predict in new_label using the model
pr = model.predict(new_label)

In [24]:
print(pr[0][0], 'predicted label for row 1')
print(pr[0][1], 'predicted label for row 2')
print(pr[0][2], 'predicted label for row 3')
print(pr[0][3], 'predicted label for row 4')
print(pr[0][4], 'predicted label for row 5')
print(pr[0][5], 'predicted label for row 6')

['__label__2'] predicted label for row 1
['__label__2'] predicted label for row 2
['__label__1'] predicted label for row 3
['__label__2'] predicted label for row 4
['__label__2'] predicted label for row 5
['__label__1'] predicted label for row 6


In [25]:
# evaluate prediction
labels = [0 if a.split(' ')[0] == '__label__1' else 1 for a in test_file]
pr_labels = [0 if a == ['__label__1'] else 1 for a in pr[0]]

#accuracy
print(roc_auc_score(labels, pr_labels))

0.91702
