In [8]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import nltk

In [4]:
# Load the training data from the tab-separated file and show its dimensions
train = pd.read_csv("labeledTrainData.tsv", sep = "\t")
train.shape

(25000, 3)

In [5]:
# Show the column labels of the training frame as an array
train.columns.values

array(['id', 'sentiment', 'review'], dtype=object)

In [7]:
# Display the raw text of the review stored under row label 0
train.loc[0, "review"]

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

In [10]:
# Try the cleaning steps on a single review, train["review"][0]
from nltk.corpus import stopwords

# Name the parser explicitly so the result does not depend on which
# optional parsers happen to be installed (and bs4 does not warn)
removehtml = BeautifulSoup(train["review"][0], "html.parser")
# Replace every character that is not an ASCII letter with a space
removenonalphabet = re.sub("[^a-zA-Z]", " ", removehtml.get_text())
lowersplit = removenonalphabet.lower().split()

# Build the stopword set once: calling stopwords.words() inside the
# comprehension would re-evaluate it for every word of the review
english_stops = set(stopwords.words("english"))
removestop = [w for w in lowersplit if w not in english_stops]

In [12]:
# Function to clean one review: remove html, punctuation and numbers,
# lower-case, split into words and remove English stopwords
_STOPWORD_CACHE = {}


def _english_stops():
    """Return the NLTK English stopwords as a frozenset, built on first use and reused afterwards."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def cleandata(raw_data):
    """Clean one raw review and return it as a single space-joined string.

    Steps, in order: strip HTML markup, replace every character that is not
    an ASCII letter with a space, lower-case, split on whitespace, and drop
    English stopwords.

    Parameters
    ----------
    raw_data : str
        Raw review text, possibly containing HTML tags.

    Returns
    -------
    str
        The remaining words joined by single spaces (empty string if no
        word survives the filtering).
    """
    # Explicit parser: avoids the bs4 "no parser specified" warning and
    # keeps the output independent of which parsers are installed
    text = BeautifulSoup(raw_data, "html.parser").get_text()
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    words = letters_only.lower().split()
    # The stopword set is cached so it is not rebuilt for every review
    stops = _english_stops()
    return " ".join([w for w in words if w not in stops])

# Get the clean_train data: one cleaned string per training review
train_size = train["review"].size
clean_train = []
# enumerate(..., 1) gives a 1-based running count, so no index lookup is needed;
# print is written in call form so the cell runs on Python 2 and Python 3
for count, review in enumerate(train["review"], 1):
    # Report progress every 5000 reviews
    if count % 5000 == 0:
        print("Reviews now %d in %d\n" % (count, train_size))
    clean_train.append(cleandata(review))

Reviews now 5000 in 25000

Reviews now 10000 in 25000

Reviews now 15000 in 25000

Reviews now 20000 in 25000

Reviews now 25000 in 25000



In [14]:
#Make vectorize & features
from sklearn.feature_extraction.text import CountVectorizer

# Word-level count vectorizer; max_features caps the vocabulary at 3000 entries.
# No backslash is needed: the open parenthesis already continues the line.
vectorizer = CountVectorizer(analyzer = "word", stop_words = None, preprocessor = None,
                             tokenizer = None, max_features = 3000)
# print in call form, matching the print("\n") calls and valid on Python 2 and 3
print(vectorizer)
print("\n")

# Learn the vocabulary from the cleaned reviews and convert the resulting
# document-term matrix to a dense array
train_features = vectorizer.fit_transform(clean_train).toarray()
print(train_features)

CountVectorizer(analyzer='word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=3000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]


In [17]:
#List the features
# Newer scikit-learn releases replaced get_feature_names() with
# get_feature_names_out(); use whichever this installation provides
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out()
else:
    vocab = vectorizer.get_feature_names()
# Sum down the rows (axis 0) to get one total count per vocabulary entry
dist = np.sum(train_features, axis = 0)
print(dist)
print("\n")

# One "word count" line per vocabulary entry; an explicit format string keeps
# the output identical on Python 2 and Python 3
for tag, count in zip(vocab, dist):
    print("%s %d" % (tag, count))

[ 187  454 1259 ...,  740  518  147]


abandoned 187
ability 454
able 1259
absolute 352
absolutely 1485
absurd 306
abuse 192
academy 297
accent 485
accents 203
accept 300
accident 318
accidentally 200
according 295
account 186
accurate 284
achieve 179
across 971
act 1251
acted 658
acting 6490
action 3354
actions 311
actor 2389
actors 4486
actress 1219
actresses 369
acts 394
actual 793
actually 4237
ad 148
adam 302
adaptation 453
adapted 154
add 810
added 439
adding 166
addition 347
adds 337
admit 621
adult 510
adults 376
advantage 153
adventure 510
adventures 204
advice 259
affair 346
afraid 343
africa 212
african 255
afternoon 187
age 1121
aged 233
agent 361
ages 249
ago 1033
agree 572
ahead 396
air 639
aired 146
aka 194
al 376
alan 351
alas 163
albeit 157
albert 265
alex 231
alice 199
alien 373
aliens 199
alike 152
alive 463
allen 407
allow 308
allowed 325
allows 252
almost 3139
alone 1061
along 1776
already 1381
alright 185
also 9155
although 2537
always 3239
amateur 215
amateurish 

fall 770
fallen 165
falling 383
falls 851
false 193
fame 230
familiar 538
families 239
family 3200
famous 771
fan 1911
fans 1421
fantastic 798
fantasy 649
far 2978
fare 210
fascinating 391
fashion 341
fast 897
fat 275
fate 271
father 2123
fault 240
favor 250
favorite 1232
favorites 187
favourite 329
fbi 153
fear 538
feature 791
featured 192
features 643
featuring 277
feel 2949
feeling 1145
feelings 395
feels 810
feet 236
fell 346
fellow 372
felt 1528
female 944
festival 399
fi 661
fiction 476
fictional 188
field 290
fight 1148
fighting 607
fights 285
figure 758
figured 187
figures 191
fill 231
filled 551
film 40140
filmed 762
filming 393
filmmaker 334
filmmakers 566
films 6886
final 1329
finale 267
finally 1536
find 4131
finding 358
finds 948
fine 1324
finest 278
finish 410
finished 302
fire 632
first 9061
fish 157
fit 489
fits 215
five 933
flash 149
flashback 176
flashbacks 240
flat 577
flawed 156
flaws 362
flesh 247
flick 1258
flicks 357
flight 177
floor 281
flow 161
fly 232
flying 3

reviewers 267
reviews 717
revolution 188
revolves 154
rich 587
richard 847
ride 413
ridiculous 964
riding 155
right 3312
rights 184
ring 312
rings 185
rip 323
rise 235
risk 162
rival 161
river 289
road 435
rob 252
robert 951
robin 248
robot 217
rochester 155
rock 876
roger 203
rogers 170
role 3188
roles 1112
roll 338
rolling 183
romance 694
romantic 854
ron 183
room 945
rose 244
rough 186
round 244
routine 201
roy 228
rubbish 275
ruin 203
ruined 227
rule 188
rules 231
run 1218
running 992
runs 513
russell 208
russian 302
ryan 224
sad 995
sadly 575
safe 227
said 2196
sake 246
sam 456
san 186
sandler 174
santa 276
sarah 192
sat 293
satire 261
satisfying 216
saturday 220
save 1023
saved 276
saving 276
saw 3167
say 5395
saying 946
says 1109
scale 209
scare 219
scared 304
scares 189
scary 988
scenario 183
scene 5377
scenery 407
scenes 5207
school 1659
sci 658
science 549
scientist 337
score 1030
scott 584
scream 268
screaming 271
screen 2493
screening 173
screenplay 695
screenwriter 161
scr

In [18]:
#Apply random forest
from sklearn.ensemble import RandomForestClassifier

# 100 trees; random_state is fixed so that re-running the notebook trains the
# same forest and therefore produces the same predictions.
# fit() returns the estimator itself, so the result is assigned directly.
model_rf = RandomForestClassifier(n_estimators = 100, random_state = 42).fit(
    train_features, train["sentiment"])


In [21]:
#Read test data
test = pd.read_csv("testData.tsv", delimiter = "\t")

# Clean every test review with the same function used for the training data
test_size = test["review"].size
clean_test = []
for count, review in enumerate(test["review"], 1):
    # Report progress every 5000 reviews; the message ends with a real
    # newline escape ("\n"), as in the training loop
    if count % 5000 == 0:
        print("Review now %d of total Review %d\n" % (count, test_size))
    clean_test.append(cleandata(review))

# transform() (not fit_transform()) reuses the vocabulary fitted on the
# training reviews, so the columns line up with what the model was trained on
test_features = vectorizer.transform(clean_test).toarray()

result = model_rf.predict(test_features)

Review now 5000 of total Review 25000/n
Review now 10000 of total Review 25000/n
Review now 15000 of total Review 25000/n
Review now 20000 of total Review 25000/n
Review now 25000 of total Review 25000/n


In [22]:
# Build the submission table: test ids alongside the predicted sentiment
output = pd.DataFrame({"id": test["id"], "sentiment": result})
# quoting = 3 is the value of csv.QUOTE_NONE, i.e. fields are never quoted;
# index = False leaves the row index out of the file
output.to_csv("Practice2.csv", quoting = 3, index = False)