<center><h1>Bag of Words and TF-IDF</h1></center>

In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import re
from bs4 import BeautifulSoup

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

In [23]:
data = pd.read_csv('labeledTrainData.tsv', header=0, delimiter='\t', quoting=3)

In [3]:
data.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [4]:
data.shape

(25000, 3)

In [5]:
print(data.review[0])

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

## Cleaning the texts

In [24]:
stops = set(stopwords.words("english")) # searching a set is much faster than a list

def review_to_words(raw_review, stemmer=None):
    """Function to convert a raw review to a string of words
    The input is a single string (a raw movie review), and 
    the output is a single string (a preprocessed movie review)
    """
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review).get_text() 
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    #
    # 4. Remove stop words
    if stemmer is not None:
        meaningful_words = [stemmer.stem(w) for w in words if not w in stops]
    else:
        meaningful_words = [w for w in words if not w in stops]
    #
    # 5. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join(meaningful_words))

def clean_dataset(dataset, stemming=False):
    """Loops the previous function to clean an entire dataset"""
    if stemming:
        stemmer = SnowballStemmer("english")
    else:
        stemmer = None
        
    new = dataset
    revs = []
    for rev in new.review:
        revs.append(review_to_words(rev, stemmer))
        
    new.review = revs 
    return new

In [25]:
okdata = clean_dataset(data, stemming=False)



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


In [26]:
okdata.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,stuff going moment mj started listening music ...
1,"""2381_9""",1,classic war worlds timothy hines entertaining ...
2,"""7759_3""",0,film starts manager nicholas bell giving welco...
3,"""3630_4""",0,must assumed praised film greatest filmed oper...
4,"""9495_8""",1,superbly trashy wondrously unpretentious explo...


## Obtaining the features

In [27]:
#vectorizer = CountVectorizer(analyzer = "word", tokenizer = None, preprocessor = None, stop_words = None, \
#                             max_features = 20000)

vectorizer = TfidfVectorizer(analyzer = "word", tokenizer = None, preprocessor = None, stop_words = None, \
                             max_features = 20000, ngram_range = (1,2))

vectorizer.fit(okdata.review)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=20000, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [28]:
train, val = train_test_split(okdata, train_size=0.8, test_size=0.2, random_state=42)
x_train, y_train = train.review, train.sentiment
x_val, y_val = val.review, val.sentiment

In [29]:
x_train = vectorizer.transform(x_train).toarray()
x_val = vectorizer.transform(x_val).toarray()
y_train = y_train.as_matrix()
y_val = y_val.as_matrix()

## Creating a model

In [30]:
lr = LogisticRegression(penalty='l2', C=1.0)

lr.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [31]:
preds = lr.predict_proba(x_val)
roc_auc_score(y_val, preds[:,1])

0.95985080098226472

## Submission

In [161]:
test = pd.read_csv('testData.tsv', header=0, delimiter='\t', quoting=3)
test = clean_dataset(test)
x_test = vectorizer.transform(test.review)
pred_test = lr.predict_proba(x_test)



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


In [162]:
result = np.argmax(pred_test, axis=1)

In [163]:
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )
output.to_csv( "submit1.csv", index=False, quoting=3 )

In [166]:
output = pd.DataFrame( data={"id":test["id"], "sentiment":pred_test[:,1]} )
output.to_csv( "submit2.csv", index=False, quoting=3 )