In [8]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import nltk

In [4]:
# Load the labeled training reviews from the tab-separated file,
# then display the frame's (rows, columns) as the cell result.
train = pd.read_csv("labeledTrainData.tsv", sep="\t")
train.shape

(25000, 3)

In [5]:
# Show the column labels of the training frame as a NumPy array.
train.columns.values

array(['id', 'sentiment', 'review'], dtype=object)

In [7]:
# Peek at the raw text of the review stored under index label 0.
train.loc[0, "review"]

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

In [10]:
# Try the cleaning steps on a single review (train["review"][0]).
from nltk.corpus import stopwords

# Name the parser explicitly: without it bs4 picks whichever parser happens
# to be installed (and warns), so results can differ between machines.
removehtml = BeautifulSoup(train["review"][0], "html.parser")

# Replace every character that is not an ASCII letter with a space.
removenonalphabet = re.sub("[^a-zA-Z]", " ", removehtml.get_text())
lowersplit = removenonalphabet.lower().split()

# Load the stop-word list once into a set; calling stopwords.words() inside
# the comprehension re-reads the list for every token.
english_stops = set(stopwords.words("english"))
removestop = [w for w in lowersplit if w not in english_stops]

In [12]:
# Function to clean one review: remove HTML, punctuation, numbers and
# stop words; lower-case the text and re-join the remaining words.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def cleandata(raw_data, stops=None):
    """Clean a single raw review string.

    Steps: strip HTML markup, replace every character that is not an ASCII
    letter with a space, lower-case, split on whitespace, and drop stop words.

    Parameters
    ----------
    raw_data : str
        Raw review text, possibly containing HTML tags.
    stops : collection of str, optional
        Words to drop. When omitted, NLTK's English stop-word list is used
        (loaded once and cached instead of being rebuilt on every call).

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    # Explicit parser: otherwise bs4 chooses whichever parser is installed
    # and emits a warning, which makes the output environment-dependent.
    text = BeautifulSoup(raw_data, "html.parser").get_text()
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    if stops is None:
        stops = _english_stopwords()
    return " ".join(w for w in letters_only.lower().split() if w not in stops)

# Get the clean_train data: one cleaned string per training review.
# Written so it runs on both Python 2 and Python 3 (no xrange / print statement).
train_size = train["review"].size
clean_train = []
# Iterate over the values directly instead of looking up train["review"][i],
# which is a label lookup and only works when the index is exactly 0..n-1.
for count, review in enumerate(train["review"], 1):
    # Progress message every 5000 reviews.
    if count % 5000 == 0:
        print("Reviews now %d in %d\n" % (count, train_size))
    clean_train.append(cleandata(review))

Reviews now 5000 in 25000

Reviews now 10000 in 25000

Reviews now 15000 in 25000

Reviews now 20000 in 25000

Reviews now 25000 in 25000



In [14]:
# Make the bag-of-words vectorizer and the training feature matrix.
from sklearn.feature_extraction.text import CountVectorizer

# Keep only the 3000 most frequent words; the reviews are already cleaned and
# tokenizable, so no extra preprocessing, tokenizer or stop-word list is set.
vectorizer = CountVectorizer(
    analyzer="word",
    stop_words=None,
    preprocessor=None,
    tokenizer=None,
    max_features=3000,
)
# print(...) with a single argument behaves the same on Python 2 and 3 and
# matches the print("\n") calls already used in this notebook.
print(vectorizer)
print("\n")

# Learn the vocabulary and count words per review, then convert the result
# to a dense NumPy array via toarray().
train_features = vectorizer.fit_transform(clean_train).toarray()
print(train_features)

CountVectorizer(analyzer='word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=3000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]


In [27]:
# List the features (vocabulary words) and how often each occurs in training.
# get_feature_names() is gone from current scikit-learn; use its replacement
# when available and fall back to the old method on older installs.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()

# Column-wise sum of the count matrix = total occurrences of each word.
dist = np.sum(train_features, axis=0)
print(dist)
print("\n")

# Slice before zipping: zip() returns an iterator on Python 3 and cannot be
# indexed, so zip(vocab, dist)[:5] only works on Python 2.
for tag, count in zip(vocab[:5], dist[:5]):
    print("%s %s" % (tag, count))

[ 187  454 1259 ...,  740  518  147]


abandoned 187
ability 454
able 1259
absolute 352
absolutely 1485


In [18]:
# Apply random forest: 100 trees fitted on the bag-of-words counts.
from sklearn.ensemble import RandomForestClassifier

# Fix random_state so that re-running the notebook gives the same forest and
# therefore the same predictions; without it every run differs.
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
model_rf = model_rf.fit(train_features, train["sentiment"])


In [21]:
# Read test data, clean it like the training reviews, and predict sentiment.
test = pd.read_csv("testData.tsv", delimiter = "\t")

test_size = test["review"].size
clean_test = []
# Iterate over the values directly (no label lookup by i), in a form that runs
# on both Python 2 and Python 3.
for count, review in enumerate(test["review"], 1):
    if count % 5000 == 0:
        # The original message ended in "/n", a typo for the newline escape.
        print("Review now %d of total Review %d\n" % (count, test_size))
    clean_test.append(cleandata(review))

# transform (not fit_transform): reuse the vocabulary learned on the training set.
test_features = vectorizer.transform(clean_test).toarray()

result = model_rf.predict(test_features)

Review now 5000 of total Review 25000/n
Review now 10000 of total Review 25000/n
Review now 15000 of total Review 25000/n
Review now 20000 of total Review 25000/n
Review now 25000 of total Review 25000/n


In [28]:
# Create outputs: one row per test review with its id and predicted sentiment.
output = pd.DataFrame(data = {"id": test["id"], "sentiment": result})
# quoting = 3 is csv.QUOTE_NONE: fields are written without surrounding quotes.
output.to_csv("Practice2.csv", index = False, quoting = 3)
# print(...) form works on both Python 2 and Python 3.
print(output[:5])

         id  sentiment
0  12311_10          1
1    8348_2          0
2    5828_4          1
3    7186_2          1
4   12128_7          1
