In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk import bigrams
import itertools
from collections import Counter
import graphlab
from graphlab import SFrame



#Importing the data sets as pandas objects:


In [2]:
# Read each tab-separated review file into its own DataFrame; the text is
# stored under the column name 'review'.
df_pos, df_neg = [
    pd.read_csv(review_file, sep='\t', names=['review'])
    for review_file in ('rt_polarity_pos.dat', 'rt_polarity_neg.dat')
]


#Taking a look at the positive reviews dataframe (df_pos):

In [3]:
# Preview the first ten positive reviews instead of rendering the whole frame.
df_pos.head(10)

Unnamed: 0,review
0,the rock is destined to be the 21st century's ...
1,"the gorgeously elaborate continuation of "" the..."
2,effective but too-tepid biopic
3,if you sometimes like to go to the movies to h...
4,"emerges as something rare , an issue movie tha..."
5,the film provides some great insight into the ...
6,offers that rare combination of entertainment ...
7,perhaps no picture ever made has more literall...
8,steers turns in a snappy screenplay that curls...
9,take care of my cat offers a refreshingly diff...


#Taking a random look at the df_pos:

In [4]:
# Ten randomly chosen positive reviews; the fixed seed makes the draw repeatable.
df_pos.sample(n=10, random_state=20)

Unnamed: 0,review
1477,"if you're in the mood for a bollywood film , h..."
3206,one of the best inside-show-biz yarns ever .
2638,"[sports] admirable energy , full-bodied charac..."
4787,graphic sex may be what's attracting audiences...
1676,cho's latest comic set isn't as sharp or as fr...
4473,"a compelling , gut-clutching piece of advocacy..."
327,hands down the year's most thought-provoking f...
2062,one hour photo may seem disappointing in its g...
3874,"brash , intelligent and erotically perplexing ..."
254,this is a film brimming with detail and nuance...


#Just taking a look at some full positive reviews..

In [5]:
# Print the first five positive reviews in full, with a spacer line after each.
for review_text in df_pos['review'].head(5):
    print(review_text)
    print(' ')


the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . 
 
the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth . 
 
effective but too-tepid biopic
 
if you sometimes like to go to the movies to have fun , wasabi is a good place to start . 
 
emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one . 
 


#Now, the negative reviews dataframe (df_neg):

In [6]:
# Preview the first ten negative reviews instead of rendering the whole frame.
df_neg.head(10)

Unnamed: 0,review
0,"simplistic , silly and tedious ."
1,"it's so laddish and juvenile , only teenage bo..."
2,exploitative and largely devoid of the depth o...
3,[garbus] discards the potential for pathologic...
4,a visually flashy but narratively opaque and e...
5,"the story is also as unoriginal as they come ,..."
6,about the only thing to give the movie points ...
7,not so much farcical as sour .
8,unfortunately the story and the actors are ser...
9,all the more disquieting for its relatively go...


#Taking a random look at the df_neg:

In [7]:
# Ten randomly chosen negative reviews; the fixed seed makes the draw repeatable.
df_neg.sample(n=10, random_state=20)

Unnamed: 0,review
1477,the comedy death to smoochy is a rancorous cur...
3206,the tuxedo miscalculates badly by forcing the ...
2638,looks awfully like one long tourist spot for a...
4787,what begins brightly gets bogged down over 140...
1676,"too often , the viewer isn't reacting to humor..."
4473,""" abandon "" will leave you wanting to abandon..."
327,"[a] soulless , stupid sequel . . ."
2062,moderately involving despite bargain-basement ...
3874,neither as scary-funny as tremors nor demented...
254,"unintelligible , poorly acted , brain-slapping..."


#Just taking a look at some full negative reviews..

In [8]:
# Print the first five negative reviews in full, with a spacer line after each.
for review_text in df_neg['review'].head(5):
    print(review_text)
    print(' ')

simplistic , silly and tedious . 
 
it's so laddish and juvenile , only teenage boys could possibly find it funny . 
 
exploitative and largely devoid of the depth or sophistication that would make watching such a graphic treatment of the crimes bearable . 
 
[garbus] discards the potential for pathological study , exhuming instead , the skewed melodrama of the circumstantial situation . 
 
a visually flashy but narratively opaque and emotionally vapid exercise in style and mystification . 
 


#Defining the column 'value' in the above dataframes. Value = 0 means a negative review, while Value = 1 means a positive review

In [9]:
# Tag every review with its sentiment label: 1 for the positive frame,
# 0 for the negative frame.
for labelled_frame, sentiment in ((df_pos, 1), (df_neg, 0)):
    labelled_frame['value'] = sentiment

#Taking a look again...

In [10]:
# Preview the first ten rows (now including the 'value' label) instead of
# rendering the whole frame.
df_pos.head(10)

Unnamed: 0,review,value
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1
5,the film provides some great insight into the ...,1
6,offers that rare combination of entertainment ...,1
7,perhaps no picture ever made has more literall...,1
8,steers turns in a snappy screenplay that curls...,1
9,take care of my cat offers a refreshingly diff...,1


In [11]:
# Preview the first ten rows (now including the 'value' label) instead of
# rendering the whole frame.
df_neg.head(10)

Unnamed: 0,review,value
0,"simplistic , silly and tedious .",0
1,"it's so laddish and juvenile , only teenage bo...",0
2,exploitative and largely devoid of the depth o...,0
3,[garbus] discards the potential for pathologic...,0
4,a visually flashy but narratively opaque and e...,0
5,"the story is also as unoriginal as they come ,...",0
6,about the only thing to give the movie points ...,0
7,not so much farcical as sour .,0
8,unfortunately the story and the actors are ser...,0
9,all the more disquieting for its relatively go...,0


In [12]:
# Confirm what kind of object read_csv returned.
type(df_pos)

pandas.core.frame.DataFrame

In [13]:
# Raw contents of the positive frame as an array of [review, value] rows.
df_pos.values

array([[ 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . ',
        1],
       [ 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth . ',
        1],
       ['effective but too-tepid biopic', 1],
       ..., 
       [ 'standing in the shadows of motown is the best kind of documentary , one that makes a depleted yesterday feel very much like a brand-new tomorrow . ',
        1],
       [ "it's nice to see piscopo again after all these years , and chaykin and headly are priceless . ",
        1],
       [ 'provides a porthole into that noble , trembling incoherence that defines us all . ',
        1]], dtype=object)

In [14]:
# Raw contents of the negative frame as an array of [review, value] rows.
df_neg.values

array([['simplistic , silly and tedious . ', 0],
       [ "it's so laddish and juvenile , only teenage boys could possibly find it funny . ",
        0],
       [ 'exploitative and largely devoid of the depth or sophistication that would make watching such a graphic treatment of the crimes bearable . ',
        0],
       ..., 
       [ "as it stands , crocodile hunter has the hurried , badly cobbled look of the 1959 godzilla , which combined scenes of a japanese monster flick with canned shots of raymond burr commenting on the monster's path of destruction . ",
        0],
       ['the thing looks like a made-for-home-video quickie . ', 0],
       ["enigma is well-made , but it's just too dry and too placid . ", 0]], dtype=object)

#Just checking the length of the dataframes:

In [15]:
# Number of positive reviews: count every row of the frame, row 0 included.
len(df_pos)

5331

In [16]:
# Number of negative reviews: count every row of the frame, row 0 included.
len(df_neg)

5331

#Confirming the data type of the reviews:

In [17]:
# Python type of the text stored for the review labelled 0.
type(df_pos['review'][0])

str



#Defining some characters to be erased from the reviews:

In [18]:
    # Fragments stripped from every review, applied in exactly this order.
    # Single characters are spelled as strings and exploded with list();
    # the multi-character fragments are listed explicitly.
    toerase = (
        list('.,!?;@')
        + ['...']
        + list('/()":=#-&|%*"')
        + ["'s", '[', ']', " ' ", "'ve", "n't"]
    )

#Collecting the reviews in order to erase the aforementioned characters:

In [19]:
# Copy the review texts into plain Python lists (one string per review).
# Each frame is read on its own, so the two corpora do not have to contain
# the same number of reviews.
texts_pos = df_pos['review'].tolist()
texts_neg = df_neg['review'].tolist()




In [20]:
def _strip_fragments(texts, fragments):
    """Return a copy of ``texts`` with every fragment removed from each text.

    Parameters
    ----------
    texts : list of str
        The review texts to clean.
    fragments : list of str
        Substrings to delete; they are applied to each text in list order.

    Returns
    -------
    list of str
        The cleaned texts, in the same order as ``texts``.
    """
    cleaned = []
    for text in texts:
        for fragment in fragments:
            text = text.replace(fragment, '')
        cleaned.append(text)
    return cleaned


# Clean each corpus independently so that neither list's length constrains
# how much of the other one gets processed.
texts_pos = _strip_fragments(texts_pos, toerase)
texts_neg = _strip_fragments(texts_neg, toerase)

#Now, let's tokenize the reviews and filter out the English stopwords (I've used the dict function in order to convert the Counter type to the standard dict type)

"f" means "filtered", which refers to texts whose stopwords have been removed.
"nf" means "non filtered", which refers to texts whose stopwords haven't been removed. I will use this to compare the accuracy of the model.


In [21]:
# Look the stop words up once and keep them in a set: asking NLTK for the
# list again for every single token is slow, and set membership is cheap.
english_stopwords = set(stopwords.words('english'))


def _tokenize_reviews(texts, excluded_words):
    """Tokenize each review and tally its words.

    Parameters
    ----------
    texts : list of str
        Review texts, one entry per review.
    excluded_words : set
        Tokens to leave out of the filtered variant.

    Returns
    -------
    tuple of four lists, each with one entry per review, in input order:
        the tokens without the excluded words, all tokens, and a ``Counter``
        for each of those two token lists.
    """
    filtered, unfiltered = [], []
    for text in texts:
        tokens = word_tokenize(text)  # tokenize once, reuse for both variants
        unfiltered.append(tokens)
        filtered.append([w for w in tokens if w not in excluded_words])
    filtered_counts = [Counter(tokens) for tokens in filtered]
    unfiltered_counts = [Counter(tokens) for tokens in unfiltered]
    return filtered, unfiltered, filtered_counts, unfiltered_counts


# "f" lists have the stop words removed, "nf" lists keep them. Each corpus is
# processed on its own, so the two corpora need not have the same length.
(ftexts_pos, nftexts_pos,
 count_ftexts_pos, count_nftexts_pos) = _tokenize_reviews(texts_pos, english_stopwords)
(ftexts_neg, nftexts_neg,
 count_ftexts_neg, count_nftexts_neg) = _tokenize_reviews(texts_neg, english_stopwords)

# Plain-dict copies of the Counters.
dict_ftexts_pos = [dict(counts) for counts in count_ftexts_pos]
dict_nftexts_pos = [dict(counts) for counts in count_nftexts_pos]

dict_ftexts_neg = [dict(counts) for counts in count_ftexts_neg]
dict_nftexts_neg = [dict(counts) for counts in count_nftexts_neg]






#checking some tokenized/filtered reviews:

In [22]:
# Show the filtered tokens of positive review 3, a spacer, then its cleaned text.
for shown in (ftexts_pos[3], ' ', texts_pos[3]):
    print(shown)

['sometimes', 'like', 'go', 'movies', 'fun', 'wasabi', 'good', 'place', 'start']
 
if you sometimes like to go to the movies to have fun  wasabi is a good place to start  


In [23]:
# Show the filtered tokens of negative review 20, a spacer, then its cleaned text.
for shown in (ftexts_neg[20], ' ', texts_neg[20]):
    print(shown)

['execution', 'pedestrian', 'positive', 'comment', 'make', 'rob', 'schneider', 'actually', 'turns', 'pretty', 'convincing', 'performance', 'prissy', 'teenage', 'girl']
 
the execution is so pedestrian that the most positive comment we can make is that rob schneider actually turns in a pretty convincing performance as a prissy teenage girl  


#Dropping all words in a bag (from the positive reviews, in a positive bag, from the negative ones, in a negative bag), only for curiosity. Let's analyze the lexical diversity of positive and negative reviews.

In [24]:
# Flatten the per-review token lists into one long list per sentiment.
pos_bag = list(itertools.chain.from_iterable(ftexts_pos))
neg_bag = list(itertools.chain.from_iterable(ftexts_neg))

In [25]:
# First twenty tokens of the positive bag.
print(pos_bag[:20])

['rock', 'destined', '21st', 'century', 'new', 'conan', 'going', 'make', 'splash', 'even', 'greater', 'arnold', 'schwarzenegger', 'jeanclaud', 'van', 'damme', 'steven', 'segal', 'gorgeously', 'elaborate']


#How many words in the positive reviews bag?

In [26]:
# Total number of tokens (stop words excluded) across all positive reviews.
len(pos_bag)

56902

In [27]:
# Tally how many times each token occurs in the positive bag.
counts_pos_bag = Counter(pos_bag)

In [28]:
# Print the token -> count tally as a plain dict.
print dict(counts_pos_bag)



In [29]:
# First twenty tokens of the negative bag.
neg_bag[:20]

['simplistic',
 'silly',
 'tedious',
 'laddish',
 'juvenile',
 'teenage',
 'boys',
 'could',
 'possibly',
 'find',
 'funny',
 'exploitative',
 'largely',
 'devoid',
 'depth',
 'sophistication',
 'would',
 'make',
 'watching',
 'graphic']

#How many words in the negative reviews bag?

In [30]:
# Total number of tokens (stop words excluded) across all negative reviews.
len(neg_bag)

55868

Let's calculate the frequency distribution and the lexical diversity of the positive bag:

In [31]:
# The 20 most frequent tokens of the positive bag, with their counts.
FreqDist(pos_bag).most_common(20)

[('film', 899),
 ('movie', 538),
 ('one', 370),
 ('like', 281),
 ('story', 261),
 ("'", 222),
 ('good', 198),
 ('comedy', 193),
 ('funny', 181),
 ('even', 180),
 ('best', 166),
 ('way', 164),
 ('time', 160),
 ('us', 156),
 ('much', 154),
 ('characters', 154),
 ('love', 152),
 ('life', 151),
 ('make', 151),
 ('makes', 144)]

In [32]:
print 'lexical diversity: ', float(len(set(pos_bag)))/float(len(pos_bag))

lexical diversity:  0.23324311975


#Let's take a look at the most frequent bigrams of the positive bag:

In [33]:
# Pair up consecutive tokens of the positive bag.
# NOTE(review): pos_bag is one flat list of all reviews chained together, so a
# pair is also formed from the last token of one review and the first token of
# the next one.
pos_bag_big = list(bigrams(pos_bag))

In [34]:
# The 20 most frequent bigrams of the positive bag, with their counts.
FreqDist(pos_bag_big).most_common(20)

[(('romantic', 'comedy'), 38),
 (('one', 'best'), 25),
 (('love', 'story'), 22),
 (('subject', 'matter'), 19),
 (('good', 'time'), 17),
 (('character', 'study'), 15),
 (('special', 'effects'), 14),
 (('ever', 'made'), 14),
 (('best', 'films'), 13),
 (('worth', 'seeing'), 13),
 (('new', 'york'), 12),
 (('even', 'though'), 12),
 (('little', 'film'), 12),
 (('characters', "'"), 11),
 (('film', 'one'), 11),
 (('ever', 'seen'), 11),
 (('big', 'screen'), 11),
 (('one', 'year'), 10),
 (('one', 'man'), 10),
 (('one', 'thing'), 10)]

#And now for the negative bag:

In [35]:
# The 20 most frequent tokens of the negative bag, with their counts.
FreqDist(neg_bag).most_common(20)

[('movie', 796),
 ('film', 690),
 ('like', 439),
 ('one', 369),
 ("'", 260),
 ('much', 232),
 ('story', 232),
 ('bad', 207),
 ('even', 202),
 ('time', 181),
 ('good', 179),
 ('characters', 176),
 ('little', 173),
 ('would', 165),
 ('comedy', 162),
 ('never', 152),
 ('enough', 140),
 ('way', 132),
 ('really', 130),
 ('could', 128)]

In [36]:
print 'lexical diversity: ', float(len(set(neg_bag)))/float(len(neg_bag))

lexical diversity:  0.245972649817


#Very similar lexical diversity... Coincidence?

In [37]:
# Pair up consecutive tokens of the negative bag.
# NOTE(review): neg_bag is one flat list of all reviews chained together, so a
# pair is also formed from the last token of one review and the first token of
# the next one.
neg_bag_big = list(bigrams(neg_bag))

In [38]:
# The 20 most frequent bigrams of the negative bag, with their counts.
FreqDist(neg_bag_big).most_common(20)

[(('feels', 'like'), 43),
 (('plays', 'like'), 31),
 (('romantic', 'comedy'), 24),
 (('soap', 'opera'), 20),
 (('feel', 'like'), 19),
 (('special', 'effects'), 18),
 (('90', 'minutes'), 18),
 (('two', 'hours'), 17),
 (('running', 'time'), 17),
 (('never', 'quite'), 16),
 (('de', 'niro'), 15),
 (('bad', 'movie'), 15),
 (('kind', 'movie'), 14),
 (('never', 'really'), 13),
 (('new', 'york'), 12),
 (('big', 'screen'), 12),
 (('subject', 'matter'), 12),
 (('film', 'never'), 12),
 (('action', 'movie'), 12),
 (('look', 'like'), 12)]

#Adding new columns to the data frame. Now we have also the filtered texts of the reviews, a dictionary counting the words in each review, the tokenized but not filtered reviews, and the words dictionary counting the words for the unfiltered reviews.

In [39]:
# Attach the per-review token lists and word counts to the positive frame,
# adding the columns in this order.
for column_name, per_review_values in (
        ('filtered', ftexts_pos),
        ('counter', dict_ftexts_pos),
        ('non_filtered', nftexts_pos),
        ('counter_non_filtered', dict_nftexts_pos)):
    df_pos[column_name] = per_review_values

In [40]:
# Preview the first ten rows with the new token/count columns instead of
# rendering the whole frame.
df_pos.head(10)

Unnamed: 0,review,value,filtered,counter,non_filtered,counter_non_filtered
0,the rock is destined to be the 21st century's ...,1,"[rock, destined, 21st, century, new, conan, go...","{u'greater': 1, u'century': 1, u'going': 1, u'...","[the, rock, is, destined, to, be, the, 21st, c...","{u'and': 1, u'be': 1, u'greater': 1, u'century..."
1,"the gorgeously elaborate continuation of "" the...",1,"[gorgeously, elaborate, continuation, lord, ri...","{u'gorgeously': 1, u'expanded': 1, u'words': 1...","[the, gorgeously, elaborate, continuation, of,...","{u'gorgeously': 1, u'trilogy': 1, u'words': 1,..."
2,effective but too-tepid biopic,1,"[effective, tootepid, biopic]","{u'biopic': 1, u'effective': 1, u'tootepid': 1}","[effective, but, tootepid, biopic]","{u'effective': 1, u'biopic': 1, u'but': 1, u't..."
3,if you sometimes like to go to the movies to h...,1,"[sometimes, like, go, movies, fun, wasabi, goo...","{u'good': 1, u'like': 1, u'movies': 1, u'somet...","[if, you, sometimes, like, to, go, to, the, mo...","{u'good': 1, u'to': 4, u'is': 1, u'wasabi': 1,..."
4,"emerges as something rare , an issue movie tha...",1,"[emerges, something, rare, issue, movie, hones...","{u'emerges': 1, u'rare': 1, u'like': 1, u'keen...","[emerges, as, something, rare, an, issue, movi...","{u'and': 1, u'that': 2, u'feel': 1, u'it': 1, ..."
5,the film provides some great insight into the ...,1,"[film, provides, great, insight, neurotic, min...","{u'even': 1, u'great': 1, u'comics': 1, u'top'...","[the, film, provides, some, great, insight, in...","{u'all': 1, u'insight': 1, u'into': 1, u'who':..."
6,offers that rare combination of entertainment ...,1,"[offers, rare, combination, entertainment, edu...","{u'offers': 1, u'rare': 1, u'education': 1, u'...","[offers, that, rare, combination, of, entertai...","{u'and': 1, u'rare': 1, u'combination': 1, u'e..."
7,perhaps no picture ever made has more literall...,1,"[perhaps, picture, ever, made, literally, show...","{u'picture': 1, u'made': 1, u'showed': 1, u'pe...","[perhaps, no, picture, ever, made, has, more, ...","{u'picture': 1, u'good': 1, u'that': 1, u'is':..."
8,steers turns in a snappy screenplay that curls...,1,"[steers, turns, snappy, screenplay, curls, edg...","{u'screenplay': 1, u'clever': 1, u'steers': 1,...","[steers, turns, in, a, snappy, screenplay, tha...","{u'screenplay': 1, u'clever': 1, u'steers': 1,..."
9,take care of my cat offers a refreshingly diff...,1,"[take, care, cat, offers, refreshingly, differ...","{u'different': 1, u'slice': 1, u'cinema': 1, u...","[take, care, of, my, cat, offers, a, refreshin...","{u'a': 1, u'different': 1, u'slice': 1, u'cine..."


In [41]:
# Attach the per-review token lists and word counts to the negative frame,
# adding the columns in this order.
for column_name, per_review_values in (
        ('filtered', ftexts_neg),
        ('counter', dict_ftexts_neg),
        ('non_filtered', nftexts_neg),
        ('counter_non_filtered', dict_nftexts_neg)):
    df_neg[column_name] = per_review_values

In [42]:
# Preview the first ten rows with the new token/count columns instead of
# rendering the whole frame.
df_neg.head(10)

Unnamed: 0,review,value,filtered,counter,non_filtered,counter_non_filtered
0,"simplistic , silly and tedious .",0,"[simplistic, silly, tedious]","{u'tedious': 1, u'simplistic': 1, u'silly': 1}","[simplistic, silly, and, tedious]","{u'and': 1, u'simplistic': 1, u'tedious': 1, u..."
1,"it's so laddish and juvenile , only teenage bo...",0,"[laddish, juvenile, teenage, boys, could, poss...","{u'funny': 1, u'juvenile': 1, u'boys': 1, u'co...","[it, so, laddish, and, juvenile, only, teenage...","{u'and': 1, u'funny': 1, u'juvenile': 1, u'boy..."
2,exploitative and largely devoid of the depth o...,0,"[exploitative, largely, devoid, depth, sophist...","{u'graphic': 1, u'would': 1, u'largely': 1, u'...","[exploitative, and, largely, devoid, of, the, ...","{u'and': 1, u'watching': 1, u'crimes': 1, u'so..."
3,[garbus] discards the potential for pathologic...,0,"[garbus, discards, potential, pathological, st...","{u'discards': 1, u'circumstantial': 1, u'study...","[garbus, discards, the, potential, for, pathol...","{u'discards': 1, u'circumstantial': 1, u'for':..."
4,a visually flashy but narratively opaque and e...,0,"[visually, flashy, narratively, opaque, emotio...","{u'visually': 1, u'style': 1, u'emotionally': ...","[a, visually, flashy, but, narratively, opaque...","{u'a': 1, u'visually': 1, u'and': 2, u'emotion..."
5,"the story is also as unoriginal as they come ,...",0,"[story, also, unoriginal, come, already, recyc...","{u'count': 1, u'story': 1, u'already': 1, u'al...","[the, story, is, also, as, unoriginal, as, the...","{u'already': 1, u'is': 1, u'unoriginal': 1, u'..."
6,about the only thing to give the movie points ...,0,"[thing, give, movie, points, bravado, take, en...","{u'concept': 1, u'grinder': 1, u'meat': 1, u'g...","[about, the, only, thing, to, give, the, movie...","{u'and': 1, u'concept': 1, u'give': 1, u'is': ..."
7,not so much farcical as sour .,0,"[much, farcical, sour]","{u'much': 1, u'sour': 1, u'farcical': 1}","[not, so, much, farcical, as, sour]","{u'as': 1, u'sour': 1, u'so': 1, u'farcical': ..."
8,unfortunately the story and the actors are ser...,0,"[unfortunately, story, actors, served, hack, s...","{u'story': 1, u'actors': 1, u'unfortunately': ...","[unfortunately, the, story, and, the, actors, ...","{u'and': 1, u'a': 1, u'story': 1, u'unfortunat..."
9,all the more disquieting for its relatively go...,0,"[disquieting, relatively, gorefree, allusions,...","{u'murders': 1, u'relatively': 1, u'gorefree':...","[all, the, more, disquieting, for, its, relati...","{u'all': 1, u'relatively': 1, u'it': 1, u'but'..."


#Now, let's merge the two dataframes: the negative (df_neg) and the positive (df_pos) ones:

In [43]:
# Stack the positive rows on top of the negative rows and renumber the index
# from 0 so row labels are unique in the merged frame.
labelled_frames = [df_pos, df_neg]
df = pd.concat(labelled_frames, ignore_index=True)

In [44]:
# Preview the first ten rows of the merged frame instead of rendering all of it.
df.head(10)

Unnamed: 0,review,value,filtered,counter,non_filtered,counter_non_filtered
0,the rock is destined to be the 21st century's ...,1,"[rock, destined, 21st, century, new, conan, go...","{u'greater': 1, u'century': 1, u'going': 1, u'...","[the, rock, is, destined, to, be, the, 21st, c...","{u'and': 1, u'be': 1, u'greater': 1, u'century..."
1,"the gorgeously elaborate continuation of "" the...",1,"[gorgeously, elaborate, continuation, lord, ri...","{u'gorgeously': 1, u'expanded': 1, u'words': 1...","[the, gorgeously, elaborate, continuation, of,...","{u'gorgeously': 1, u'trilogy': 1, u'words': 1,..."
2,effective but too-tepid biopic,1,"[effective, tootepid, biopic]","{u'biopic': 1, u'effective': 1, u'tootepid': 1}","[effective, but, tootepid, biopic]","{u'effective': 1, u'biopic': 1, u'but': 1, u't..."
3,if you sometimes like to go to the movies to h...,1,"[sometimes, like, go, movies, fun, wasabi, goo...","{u'good': 1, u'like': 1, u'movies': 1, u'somet...","[if, you, sometimes, like, to, go, to, the, mo...","{u'good': 1, u'to': 4, u'is': 1, u'wasabi': 1,..."
4,"emerges as something rare , an issue movie tha...",1,"[emerges, something, rare, issue, movie, hones...","{u'emerges': 1, u'rare': 1, u'like': 1, u'keen...","[emerges, as, something, rare, an, issue, movi...","{u'and': 1, u'that': 2, u'feel': 1, u'it': 1, ..."
5,the film provides some great insight into the ...,1,"[film, provides, great, insight, neurotic, min...","{u'even': 1, u'great': 1, u'comics': 1, u'top'...","[the, film, provides, some, great, insight, in...","{u'all': 1, u'insight': 1, u'into': 1, u'who':..."
6,offers that rare combination of entertainment ...,1,"[offers, rare, combination, entertainment, edu...","{u'offers': 1, u'rare': 1, u'education': 1, u'...","[offers, that, rare, combination, of, entertai...","{u'and': 1, u'rare': 1, u'combination': 1, u'e..."
7,perhaps no picture ever made has more literall...,1,"[perhaps, picture, ever, made, literally, show...","{u'picture': 1, u'made': 1, u'showed': 1, u'pe...","[perhaps, no, picture, ever, made, has, more, ...","{u'picture': 1, u'good': 1, u'that': 1, u'is':..."
8,steers turns in a snappy screenplay that curls...,1,"[steers, turns, snappy, screenplay, curls, edg...","{u'screenplay': 1, u'clever': 1, u'steers': 1,...","[steers, turns, in, a, snappy, screenplay, tha...","{u'screenplay': 1, u'clever': 1, u'steers': 1,..."
9,take care of my cat offers a refreshingly diff...,1,"[take, care, cat, offers, refreshingly, differ...","{u'different': 1, u'slice': 1, u'cinema': 1, u...","[take, care, of, my, cat, offers, a, refreshin...","{u'a': 1, u'different': 1, u'slice': 1, u'cine..."


#A random sample of the new, merged, data frame:

In [45]:
# Twenty randomly chosen rows of the merged frame; the fixed seed makes the
# draw repeatable.
df.sample(n=20, random_state=20)

Unnamed: 0,review,value,filtered,counter,non_filtered,counter_non_filtered
9928,a movie to forget,0,"[movie, forget]","{u'movie': 1, u'forget': 1}","[a, movie, to, forget]","{u'a': 1, u'movie': 1, u'forget': 1, u'to': 1}"
9450,neither a rousing success nor a blinding embar...,0,"[neither, rousing, success, blinding, embarras...","{u'blinding': 1, u'ordered': 1, u'like': 1, u'...","[neither, a, rousing, success, nor, a, blindin...","{u'ordered': 1, u'just': 1, u'it': 1, u'one': ..."
8005,"director brian levant , who never strays far f...",0,"[director, brian, levant, never, strays, far, ...","{u'ends': 1, u'loose': 1, u'blithely': 1, u'fa...","[director, brian, levant, who, never, strays, ...","{u'up': 1, u'one': 1, u'another': 1, u'find': ..."
5783,"surprisingly , considering that baird is a for...",0,"[surprisingly, considering, baird, former, fil...","{u'considering': 1, u'rather': 1, u'movie': 1,...","[surprisingly, considering, that, baird, is, a...","{u'considering': 1, u'a': 1, u'that': 1, u'rat..."
4537,return to never land may be another shameless ...,1,"[return, never, land, may, another, shameless,...","{u'land': 1, u'return': 1, u'another': 1, u'ma...","[return, to, never, land, may, be, another, sh...","{u'be': 1, u'rake': 1, u'families': 1, u'boome..."
1729,"there are no special effects , and no hollywoo...",1,"[special, effects, hollywood, endings]","{u'hollywood': 1, u'effects': 1, u'special': 1...","[there, are, no, special, effects, and, no, ho...","{u'and': 1, u'no': 2, u'there': 1, u'endings':..."
9777,what's most offensive isn't the waste of a goo...,0,"[offensive, waste, good, cast, film, denial, s...","{u'spiritualism': 1, u'good': 1, u'cast': 1, u...","[what, most, offensive, is, the, waste, of, a,...","{u'and': 1, u'spiritualism': 1, u'good': 1, u'..."
4297,byler is too savvy a filmmaker to let this mor...,1,"[byler, savvy, filmmaker, let, morph, typical,...","{u'traverse': 1, u'focuses': 1, u'triangle': 1...","[byler, is, too, savvy, a, filmmaker, to, let,...","{u'traverse': 1, u'focuses': 1, u'triangle': 1..."
9462,feels as if the inmates have actually taken ov...,0,"[feels, inmates, actually, taken, asylum]","{u'taken': 1, u'feels': 1, u'asylum': 1, u'inm...","[feels, as, if, the, inmates, have, actually, ...","{u'have': 1, u'over': 1, u'feels': 1, u'actual..."
10341,behan's memoir is great material for a film --...,0,"[behan, memoir, great, material, film, rowdy, ...","{u'sheridan': 1, u'great': 1, u'film': 1, u'ma...","[behan, memoir, is, great, material, for, a, f...","{u'and': 1, u'irish': 1, u'material': 1, u'is'..."


#Let's convert the pandas DataFrame to an SFrame, since the latter is an out-of-core implementation of a data frame and can, if necessary, use the hard drive to store data

In [46]:
# Select 'ipynb' as the GraphLab Canvas target.
# NOTE(review): presumably this makes later .show() calls render inside the notebook - confirm against the Canvas docs.
graphlab.canvas.set_target('ipynb')

In [47]:
# Build a GraphLab SFrame from the pandas DataFrame `df`.
sf = graphlab.SFrame(data=df)

This non-commercial license of GraphLab Create for academic use is assigned to leandro.silva@ufabc.edu.br and will expire on February 16, 2018.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1496990710.log


In [48]:
# Display the SFrame to check the conversion (only the head is printed).
sf

review,value,filtered,counter
the rock is destined to be the 21st century's ...,1,"[rock, destined, 21st, century, new, conan, ...","{'even': 1, 'segal': 1, '21st': 1, 'van': 1, ..."
"the gorgeously elaborate continuation of "" the ...",1,"[gorgeously, elaborate, continuation, lord, ...","{'gorgeously': 1, 'huge': 1, 'r': 2, 'describe' ..."
effective but too-tepid biopic ...,1,"[effective, tootepid, biopic] ...","{'biopic': 1, 'tootepid': 1, 'effective': 1} ..."
if you sometimes like to go to the movies to have ...,1,"[sometimes, like, go, movies, fun, wasabi, ...","{'fun': 1, 'good': 1, 'like': 1, 'start': 1, ..."
"emerges as something rare , an issue movie that's ...",1,"[emerges, something, rare, issue, movie, ...","{'emerges': 1, 'rare': 1, 'like': 1, 'keenly': 1, ..."
the film provides some great insight into the ...,1,"[film, provides, great, insight, neurotic, ...","{'even': 1, 'great': 1, 'comics': 1, 'top': 1, ..."
offers that rare combination of ...,1,"[offers, rare, combination, ...","{'offers': 1, 'rare': 1, 'education': 1, ..."
perhaps no picture ever made has more literally ...,1,"[perhaps, picture, ever, made, literally, showed, ...","{'picture': 1, 'made': 1, 'showed': 1, 'perhaps': ..."
steers turns in a snappy screenplay that curls at ...,1,"[steers, turns, snappy, screenplay, curls, ed ...","{'screenplay': 1, 'clever': 1, 'steers' ..."
take care of my cat offers a refreshingly ...,1,"[take, care, cat, offers, refreshingly, different, ...","{'different': 1, 'slice': 1, 'cinema': 1, 'cat' ..."

non_filtered,counter_non_filtered
"[the, rock, is, destined, to, be, the, 21st, ...","{'and': 1, 'century': 1, 'is': 1, 'conan': 1, ..."
"[the, gorgeously, elaborate, continuation, ...","{'gorgeously': 1, 'expanded': 1, 'tolki ..."
"[effective, but, tootepid, biopic] ...","{'tootepid': 1, 'biopic': 1, 'but': 1, 'effecti ..."
"[if, you, sometimes, like, to, go, to, the, ...","{'a': 1, 'fun': 1, 'good': 1, 'like': 1, ..."
"[emerges, as, something, rare, an, issue, movie, ...","{'and': 1, 'emerges': 1, 'rare': 1, 'like': 1, ..."
"[the, film, provides, some, great, insight, ...","{'even': 1, 'all': 1, 'who': 1, 'provides': 1, ..."
"[offers, that, rare, combination, of, ...","{'and': 1, 'rare': 1, 'combination': 1, 'th ..."
"[perhaps, no, picture, ever, made, has, more, ...","{'picture': 1, 'good': 1, 'to': 1, 'showed': 1, ..."
"[steers, turns, in, a, snappy, screenplay, t ...","{'screenplay': 1, 'it': 3, 'at': 1, 'want': 1, ..."
"[take, care, of, my, cat, offers, a, refreshingly, ...","{'a': 1, 'different': 1, 'slice': 1, 'cinema': 1, ..."


In [49]:
# Random 80/20 train/test split; the fixed seed keeps the split reproducible.
train_data, test_data = sf.random_split(fraction=0.8, seed=1)

In [50]:
# Inspect the training split.
train_data

review,value,filtered,counter
the rock is destined to be the 21st century's ...,1,"[rock, destined, 21st, century, new, conan, ...","{'even': 1, 'segal': 1, '21st': 1, 'van': 1, ..."
"the gorgeously elaborate continuation of "" the ...",1,"[gorgeously, elaborate, continuation, lord, ...","{'gorgeously': 1, 'huge': 1, 'r': 2, 'describe' ..."
effective but too-tepid biopic ...,1,"[effective, tootepid, biopic] ...","{'biopic': 1, 'tootepid': 1, 'effective': 1} ..."
if you sometimes like to go to the movies to have ...,1,"[sometimes, like, go, movies, fun, wasabi, ...","{'fun': 1, 'good': 1, 'like': 1, 'start': 1, ..."
"emerges as something rare , an issue movie that's ...",1,"[emerges, something, rare, issue, movie, ...","{'emerges': 1, 'rare': 1, 'like': 1, 'keenly': 1, ..."
the film provides some great insight into the ...,1,"[film, provides, great, insight, neurotic, ...","{'even': 1, 'great': 1, 'comics': 1, 'top': 1, ..."
offers that rare combination of ...,1,"[offers, rare, combination, ...","{'offers': 1, 'rare': 1, 'education': 1, ..."
perhaps no picture ever made has more literally ...,1,"[perhaps, picture, ever, made, literally, showed, ...","{'picture': 1, 'made': 1, 'showed': 1, 'perhaps': ..."
"this is a film well worth seeing , talking and ...",1,"[film, well, worth, seeing, talking, sing ...","{'seeing': 1, 'heads': 1, 'well': 1, 'talking': 1, ..."
what really surprises about wisegirls is its ...,1,"[really, surprises, wisegirls, lowkey, ...","{'wisegirls': 1, 'genuine': 1, 'lowkey': ..."

non_filtered,counter_non_filtered
"[the, rock, is, destined, to, be, the, 21st, ...","{'and': 1, 'century': 1, 'is': 1, 'conan': 1, ..."
"[the, gorgeously, elaborate, continuation, ...","{'gorgeously': 1, 'expanded': 1, 'tolki ..."
"[effective, but, tootepid, biopic] ...","{'tootepid': 1, 'biopic': 1, 'but': 1, 'effecti ..."
"[if, you, sometimes, like, to, go, to, the, ...","{'a': 1, 'fun': 1, 'good': 1, 'like': 1, ..."
"[emerges, as, something, rare, an, issue, movie, ...","{'and': 1, 'emerges': 1, 'rare': 1, 'like': 1, ..."
"[the, film, provides, some, great, insight, ...","{'even': 1, 'all': 1, 'who': 1, 'provides': 1, ..."
"[offers, that, rare, combination, of, ...","{'and': 1, 'rare': 1, 'combination': 1, 'th ..."
"[perhaps, no, picture, ever, made, has, more, ...","{'picture': 1, 'good': 1, 'to': 1, 'showed': 1, ..."
"[this, is, a, film, well, worth, seeing, talking, ...","{'a': 1, 'seeing': 1, 'and': 2, 'heads': 1, ..."
"[what, really, surprises, about, wisegirls, is, ...","{'wisegirls': 1, 'and': 1, 'what': 1, 'genuine': ..."


In [51]:
# Inspect the test split.
test_data

review,value,filtered,counter
steers turns in a snappy screenplay that curls at ...,1,"[steers, turns, snappy, screenplay, curls, ed ...","{'screenplay': 1, 'clever': 1, 'steers' ..."
take care of my cat offers a refreshingly ...,1,"[take, care, cat, offers, refreshingly, different, ...","{'different': 1, 'slice': 1, 'cinema': 1, 'cat' ..."
"ultimately , it ponders the reasons we need ...",1,"[ultimately, ponders, reasons, need, stories, ...","{'ultimately': 1, 'reasons': 1, 'ponders': ..."
"the movie's ripe , enrapturing beauty will ...",1,"[movie, ripe, enrapturing, beauty, ...","{'enrapturing': 1, 'inscrutable': 1, ..."
scores a few points for doing what it does wi ...,1,"[scores, points, dedicated, goodhearted, ...","{'professionalism': 1, 'dedicated': 1, 'poin ..."
"a masterful film from a master filmmaker , un ...",1,"[masterful, film, master, filmmaker, unique, ...","{'fatalist': 1, 'deceptive': 1, ..."
"light , cute and forgettable . ...",1,"[light, cute, forgettable] ...","{'light': 1, 'forgettable': 1, 'cu ..."
"cantet perfectly captures the hotel lobbies , two- ...",1,"[cantet, perfectly, captures, hotel, lobb ...","{'permeate': 1, 'lobbies': 1, 'cantet': ..."
"though it is by no means his best work , laissez- ...",1,"[though, means, best, work, laissezpasser, ...","{'rewards': 1, 'fascinating': 1, ..."
an engaging overview of johnson's eccentric ...,1,"[engaging, overview, johnson, eccentric, ...","{'engaging': 1, 'overview': 1, 'career': ..."

non_filtered,counter_non_filtered
"[steers, turns, in, a, snappy, screenplay, t ...","{'screenplay': 1, 'it': 3, 'at': 1, 'want': 1, ..."
"[take, care, of, my, cat, offers, a, refreshingly, ...","{'a': 1, 'different': 1, 'slice': 1, 'cinema': 1, ..."
"[ultimately, it, ponders, the, reasons, we, need, ...","{'ultimately': 1, 'reasons': 1, 'it': 1, ..."
"[the, movie, ripe, enrapturing, beauty, ...","{'enrapturing': 1, 'inscrutable': 1, ..."
"[scores, a, few, points, for, doing, what, it, ...","{'a': 2, 'and': 1, 'what': 1, 'for': 1, ..."
"[a, masterful, film, from, a, master, ...","{'a': 2, 'fatalist': 1, 'from': 1, 'worldview': ..."
"[light, cute, and, forgettable] ...","{'and': 1, 'light': 1, 'forgettable': 1, 'cu ..."
"[cantet, perfectly, captures, the, hotel, ...","{'and': 1, 'permeate': 1, 'lobbies': 1, 'cantet': ..."
"[though, it, is, by, no, means, his, best, work, ...","{'and': 1, 'all': 1, 'fascinating': 1, 'is': ..."
"[an, engaging, overview, of, johnson, eccentric, ...","{'johnson': 1, 'career': 1, 'eccentric': 1, 'of': ..."


In [52]:
# Number of rows in the training split.
len(train_data)

8523

In [53]:
# Number of rows in the test split.
len(test_data)

2139

In [54]:
# Train a GraphLab logistic classifier that predicts 'value' using only the
# 'counter_non_filtered' word-count column, with both the L1 and the L2
# penalty set to 0.6 and at most 100 iterations.
# NOTE(review): test_data is passed as the validation set and is also the set
# the model is evaluated on afterwards - confirm the validation set only
# drives progress reporting and does not influence the fitted model.
themodel = graphlab.logistic_classifier.create(
    train_data,
    target='value',
    features=['counter_non_filtered'],
    l1_penalty=0.6,
    l2_penalty=0.6,
    max_iterations=100,
    validation_set=test_data,
)


In [55]:
# Evaluate the model on the test split, requesting only the 'roc_curve' metric.
themodel.evaluate(test_data, metric='roc_curve')

{'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 
 Rows: 100001
 
 Data:
 +-----------+----------------+-----+------+------+
 | threshold |      fpr       | tpr |  p   |  n   |
 +-----------+----------------+-----+------+------+
 |    0.0    |      1.0       | 1.0 | 1108 | 1031 |
 |   1e-05   | 0.996120271581 | 1.0 | 1108 | 1031 |
 |   2e-05   | 0.996120271581 | 1.0 | 1108 | 1031 |
 |   3e-05   | 0.995150339476 | 1.0 | 1108 | 1031 |
 |   4e-05   | 0.994180407371 | 1.0 | 1108 | 1031 |
 |   5e-05   | 0.994180407371 | 1.0 | 1108 | 1031 |
 |   6e-05   | 0.993210475267 | 1.0 | 1108 | 1031 |
 |   7e-05   | 0.992240543162 | 1.0 | 1108 | 1031 |
 |   8e-05   | 0.992240543162 | 1.0 | 1108 | 1031 |
 |   9e-05   | 0.991270611057 | 1.0 | 1108 | 1031 |
 +-----------+----------------+-----+------+------+
 [100001 rows x 5 columns]
 Note: Only the head of the SFrame is printed.
 You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.}

In [56]:
# Open the model's 'Evaluation' view.
themodel.show(view='Evaluation')

In [57]:
# Attach the model's predicted probability to every test review.
# Work on a copy: a plain assignment would only alias test_data, so adding the
# column and swapping columns below would silently alter test_data as well
# and make this cell non-idempotent on re-run.
test_data_result = test_data.copy()
test_data_result['predicted_sentiment'] = themodel.predict(test_data_result, output_type='probability')
# Swap 'review' and 'predicted_sentiment' so the score is displayed first.
test_data_result.swap_columns('review','predicted_sentiment')

predicted_sentiment,value,filtered,counter,non_filtered
0.941682695209,1,"[steers, turns, snappy, screenplay, curls, ed ...","{'screenplay': 1, 'clever': 1, 'steers' ...","[steers, turns, in, a, snappy, screenplay, t ..."
0.999646908068,1,"[take, care, cat, offers, refreshingly, different, ...","{'different': 1, 'slice': 1, 'cinema': 1, 'cat' ...","[take, care, of, my, cat, offers, a, refreshingly, ..."
0.195274892126,1,"[ultimately, ponders, reasons, need, stories, ...","{'ultimately': 1, 'reasons': 1, 'ponders': ...","[ultimately, it, ponders, the, reasons, we, need, ..."
0.949903756671,1,"[movie, ripe, enrapturing, beauty, ...","{'enrapturing': 1, 'inscrutable': 1, ...","[the, movie, ripe, enrapturing, beauty, ..."
0.0759708179871,1,"[scores, points, dedicated, goodhearted, ...","{'professionalism': 1, 'dedicated': 1, 'poin ...","[scores, a, few, points, for, doing, what, it, ..."
0.998241119446,1,"[masterful, film, master, filmmaker, unique, ...","{'fatalist': 1, 'deceptive': 1, ...","[a, masterful, film, from, a, master, ..."
0.44351931096,1,"[light, cute, forgettable] ...","{'light': 1, 'forgettable': 1, 'cu ...","[light, cute, and, forgettable] ..."
0.989239791429,1,"[cantet, perfectly, captures, hotel, lobb ...","{'permeate': 1, 'lobbies': 1, 'cantet': ...","[cantet, perfectly, captures, the, hotel, ..."
0.999892275235,1,"[though, means, best, work, laissezpasser, ...","{'rewards': 1, 'fascinating': 1, ...","[though, it, is, by, no, means, his, best, work, ..."
0.565620704897,1,"[engaging, overview, johnson, eccentric, ...","{'engaging': 1, 'overview': 1, 'career': ...","[an, engaging, overview, of, johnson, eccentric, ..."

counter_non_filtered,review
"{'screenplay': 1, 'it': 3, 'at': 1, 'want': 1, ...",steers turns in a snappy screenplay that curls at ...
"{'a': 1, 'different': 1, 'slice': 1, 'cinema': 1, ...",take care of my cat offers a refreshingly ...
"{'ultimately': 1, 'reasons': 1, 'it': 1, ...","ultimately , it ponders the reasons we need ..."
"{'enrapturing': 1, 'inscrutable': 1, ...","the movie's ripe , enrapturing beauty will ..."
"{'a': 2, 'and': 1, 'what': 1, 'for': 1, ...",scores a few points for doing what it does wi ...
"{'a': 2, 'fatalist': 1, 'from': 1, 'worldview': ...","a masterful film from a master filmmaker , un ..."
"{'and': 1, 'light': 1, 'forgettable': 1, 'cu ...","light , cute and forgettable . ..."
"{'and': 1, 'permeate': 1, 'lobbies': 1, 'cantet': ...","cantet perfectly captures the hotel lobbies , two- ..."
"{'and': 1, 'all': 1, 'fascinating': 1, 'is': ...","though it is by no means his best work , laissez- ..."
"{'johnson': 1, 'career': 1, 'eccentric': 1, 'of': ...",an engaging overview of johnson's eccentric ...


#Ordering from the most positive to the most negative review:

In [58]:
# Sort by predicted score, highest first.
reordered_test_data_result = test_data_result.sort('predicted_sentiment', ascending=False)


In [59]:
# Display the test results ordered by predicted_sentiment.
reordered_test_data_result

predicted_sentiment,value,filtered,counter,non_filtered
0.999999894298,1,"[witty, dialog, realistic, characters, ...","{'blended': 1, 'laughs': 1, 'one': 1, 'proves' ...","[witty, dialog, between, realistic, characters, ..."
0.999999386593,1,"[film, moody, oozing, chilling, heartwarming, ...","{'thriller': 1, 'heartwarming': 1, ...","[the, film, is, moody, oozing, chilling, and, ..."
0.999997940024,1,"[mixes, likeable, personalities, invent ...","{'picture': 1, 'subculture': 1, ...","[mixes, likeable, personalities, invent ..."
0.999995237641,1,"[rich, detail, gorgeously, shot, ...","{'gorgeously': 1, 'unusual': 1, ...","[rich, in, detail, gorgeously, shot, and, ..."
0.999991809716,1,"[pan, nalin, exposition, beautiful, mysterious, ...","{'beautiful': 1, 'nalin': 1, 'ancient': 1, ...","[pan, nalin, exposition, is, beautiful, and, ..."
0.999984291817,1,"[flatout, amusing, sometimes, endearing, ...","{'wide': 1, 'dialogue': 1, 'noteworthy': 1, ...","[flatout, amusing, sometimes, endearing, ..."
0.999983452623,1,"[denis, cowriter, michele, petin, ...","{'screenplay': 1, 'superlative': 1, ...","[denis, and, cowriter, michele, petin, ..."
0.999975226762,1,"[filled, honest, performances, ...","{'exceptional': 1, 'performances': 1, ...","[filled, with, honest, performances, and, ..."
0.999974847325,1,"[like, english, patient, unbearable, lightness, ...","{'picture': 1, 'right': 1, 'patient': 1, ...","[like, the, english, patient, and, the, ..."
0.999962341817,0,"[like, grinning, jack, ', lantern, apparent, glee, ...","{'discarded': 1, 'glee': 1, 'like': 1, ""'"": 1, ...","[like, a, grinning, jack, o, ', lantern, its, ..."

counter_non_filtered,review
"{'and': 2, 'blended': 1, 'into': 1, 'laughs': 1, ...",witty dialog between realistic characters ...
"{'and': 1, 'a': 1, 'all': 1, 'film': 1, 'once': 1, ...","the film is moody , oozing , chilling and ..."
"{'and': 3, 'inventive': 1, 'is': 1, 'at': 1, ...","mixes likeable personalities , inven ..."
"{'gorgeously': 1, 'and': 2, 'unusual': 1, ...","rich in detail , gorgeously shot and ..."
"{'beautiful': 1, 'and': 3, 'is': 1, 'as': 2, ...",pan nalin's exposition is beautiful and mysterious ...
"{'and': 2, 'a': 2, 'dialogue': 1, 'often': ...",". . . flat-out amusing , sometimes endearing and ..."
"{'and': 3, 'screenplay': 1, 'parmentier': 1, ...",denis and co-writer michele petin's ...
"{'and': 1, 'a': 2, 'to': 1, 'exceptional': 1, ...",filled with honest performances and ...
"{'and': 1, 'emerge': 1, 'own': 1, 'reputedly' ...",like the english patient and the unbearable ...
"{'and': 1, 'all': 1, 'scooped': 1, 'essence': ...","like a grinning jack o' lantern , its apparent ..."


#The top 3 positive reviews:

In [60]:
# First three rows of the score-sorted results, restricted to the text, the true label and the score.
reordered_test_data_result[['review', 'value', 'predicted_sentiment']][:3]

review,value,predicted_sentiment
witty dialog between realistic characters ...,1,0.999999894298
"the film is moody , oozing , chilling and ...",1,0.999999386593
"mixes likeable personalities , inven ...",1,0.999997940024


#The three most negative reviews:

In [61]:
# Last three rows of the score-sorted results, i.e. the lowest predicted scores.
reordered_test_data_result[['review', 'value', 'predicted_sentiment']][-3:]

review,value,predicted_sentiment
"a dreary , incoherent , self-indulgent mess of a ...",0,3.29363275063e-06
elmo touts his drug as being 51 times stronger ...,0,1.80109507245e-06
the film has a few cute ideas and several modest ...,0,1.55554310887e-06


#The most positive review: 

In [62]:
# First row of the score-sorted results: the review with the highest predicted score.
reordered_test_data_result[['review', 'value', 'predicted_sentiment']][0]

{'predicted_sentiment': 0.9999998942978373,
 'review': "witty dialog between realistic characters showing honest emotions . it's touching and tender and proves that even in sorrow you can find humor . like blended shades of lipstick , these components combine into one terrific story with lots of laughs . ",
 'value': 1}

#The most negative review

In [63]:
# Last row of the score-sorted results: the review with the lowest predicted score.
reordered_test_data_result[['review', 'value', 'predicted_sentiment']][-1]

{'predicted_sentiment': 1.5555431088745688e-06,
 'review': "the film has a few cute ideas and several modest chuckles but it isn't exactly kiddie-friendly\xe2\x80\xa6 alas , santa is more ho-hum than ho-ho-ho and the snowman ( who never gets to play that flute ) has all the charm of a meltdown . ",
 'value': 0}

In [64]:
# Show the training split again.
train_data



review,value,filtered,counter
the rock is destined to be the 21st century's ...,1,"[rock, destined, 21st, century, new, conan, ...","{'even': 1, 'segal': 1, '21st': 1, 'van': 1, ..."
"the gorgeously elaborate continuation of "" the ...",1,"[gorgeously, elaborate, continuation, lord, ...","{'gorgeously': 1, 'huge': 1, 'r': 2, 'describe' ..."
effective but too-tepid biopic ...,1,"[effective, tootepid, biopic] ...","{'biopic': 1, 'tootepid': 1, 'effective': 1} ..."
if you sometimes like to go to the movies to have ...,1,"[sometimes, like, go, movies, fun, wasabi, ...","{'fun': 1, 'good': 1, 'like': 1, 'start': 1, ..."
"emerges as something rare , an issue movie that's ...",1,"[emerges, something, rare, issue, movie, ...","{'emerges': 1, 'rare': 1, 'like': 1, 'keenly': 1, ..."
the film provides some great insight into the ...,1,"[film, provides, great, insight, neurotic, ...","{'even': 1, 'great': 1, 'comics': 1, 'top': 1, ..."
offers that rare combination of ...,1,"[offers, rare, combination, ...","{'offers': 1, 'rare': 1, 'education': 1, ..."
perhaps no picture ever made has more literally ...,1,"[perhaps, picture, ever, made, literally, showed, ...","{'picture': 1, 'made': 1, 'showed': 1, 'perhaps': ..."
"this is a film well worth seeing , talking and ...",1,"[film, well, worth, seeing, talking, sing ...","{'seeing': 1, 'heads': 1, 'well': 1, 'talking': 1, ..."
what really surprises about wisegirls is its ...,1,"[really, surprises, wisegirls, lowkey, ...","{'wisegirls': 1, 'genuine': 1, 'lowkey': ..."

non_filtered,counter_non_filtered
"[the, rock, is, destined, to, be, the, 21st, ...","{'and': 1, 'century': 1, 'is': 1, 'conan': 1, ..."
"[the, gorgeously, elaborate, continuation, ...","{'gorgeously': 1, 'expanded': 1, 'tolki ..."
"[effective, but, tootepid, biopic] ...","{'tootepid': 1, 'biopic': 1, 'but': 1, 'effecti ..."
"[if, you, sometimes, like, to, go, to, the, ...","{'a': 1, 'fun': 1, 'good': 1, 'like': 1, ..."
"[emerges, as, something, rare, an, issue, movie, ...","{'and': 1, 'emerges': 1, 'rare': 1, 'like': 1, ..."
"[the, film, provides, some, great, insight, ...","{'even': 1, 'all': 1, 'who': 1, 'provides': 1, ..."
"[offers, that, rare, combination, of, ...","{'and': 1, 'rare': 1, 'combination': 1, 'th ..."
"[perhaps, no, picture, ever, made, has, more, ...","{'picture': 1, 'good': 1, 'to': 1, 'showed': 1, ..."
"[this, is, a, film, well, worth, seeing, talking, ...","{'a': 1, 'seeing': 1, 'and': 2, 'heads': 1, ..."
"[what, really, surprises, about, wisegirls, is, ...","{'wisegirls': 1, 'and': 1, 'what': 1, 'genuine': ..."
