In [1]:
import pandas as pd
import numpy as np

# Load the dataset of movie reviews (described by the author as 50,000
# texts labelled positive or negative).
reviews_path = r'movie_reviews.csv'
dataset = pd.read_csv(reviews_path)

# Preview the first rows, which also shows the column titles.
print(dataset.head())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


Separa as colunas `review` e `sentiment` em arrays NumPy.

In [2]:
# Pull the text column and the label column out of the frame as two
# parallel NumPy arrays (same row order in both).
test_reviews, test_sentiments = (
    np.array(dataset[column]) for column in ('review', 'sentiment')
)

Uma pequena amostra de resenhas (com seus rótulos) para facilitar o estudo...

In [3]:
# Row positions of a handful of reviews picked for manual inspection.
sample_docs = [100, 5817, 7626, 7356, 1008, 7155, 3533, 13010]

# Pair each selected review with its label: [(review, sentiment), ...].
sample_data = list(zip(test_reviews[sample_docs],
                       test_sentiments[sample_docs]))

sample_data


 

[("This short film that inspired the soon-to-be full length feature - Spatula Madness - is a hilarious piece that contends against similar cartoons yielding multiple writers. The short film stars Edward the Spatula who after being fired from his job, joins in the fight against the evil spoons. This premise allows for some funny content near the beginning, but is barely present for the remainder of the feature. This film's 15-minute running time is absorbed by some odd-ball comedy and a small musical number. Unfortunately not much else lies below it. The plot that is set up doesn't really have time to show. But it's surely follows it plot better than many high-budget Hollywood films. This film is worth watching at least a few times. Take it for what it is, and don't expect a deep story.",
  'positive'),
 ("Cinderella In my opinion greatest love story ever told i loved it as a kid and i love it now a wonderful Disney masterpiece this is 1 of my favorite movies i love Disney. i could rave

In [4]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk import sent_tokenize, word_tokenize, pos_tag
 
 
# Module-level WordNet lemmatizer, created once and reused by swn_polarity.
lemmatizer = WordNetLemmatizer()
 
 
def penn_to_wn(tag):
    """Map a Penn Treebank POS tag to the matching WordNet POS constant.

    Tags starting with 'J', 'N', 'R' or 'V' map to the WordNet adjective,
    noun, adverb and verb constants respectively. Any other tag yields None.
    """
    prefix_to_pos = (
        ('J', wn.ADJ),
        ('N', wn.NOUN),
        ('R', wn.ADV),
        ('V', wn.VERB),
    )
    for prefix, wn_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wn_pos
    return None
 
 

def clean_text(text):
    """Return ``text`` with every ``<br />`` tag replaced by a single space."""
    line_break_tag = "<br />"
    # Splitting on the tag and re-joining with a space swaps each occurrence
    # for exactly one space, leaving all other characters untouched.
    return " ".join(text.split(line_break_tag))
 
 
def swn_polarity(text, default='negative'):
    """Classify ``text`` as 'positive' or 'negative' using SentiWordNet.

    The text is cleaned with ``clean_text``, split into sentences, tokenized
    and POS-tagged. Every noun, adjective and adverb is lemmatized and looked
    up in WordNet; the SentiWordNet score (positive minus negative) of its
    first synset is added to a running total for the whole text.

    Args:
        text: Raw review text (``<br />`` markup is stripped first).
        default: Label returned when no token could be scored at all
            (for example empty text). It must be one of the two string
            labels so the result can be compared against the gold labels;
            previously the integer 0 was returned here, which matched
            neither label.

    Returns:
        'positive' when the summed score is >= 0, 'negative' when it is
        below 0, or ``default`` when nothing was scored.
    """
    sentiment = 0.0
    scored_any = False

    for raw_sentence in sent_tokenize(clean_text(text)):
        for word, tag in pos_tag(word_tokenize(raw_sentence)):
            wn_tag = penn_to_wn(tag)
            # Only nouns, adjectives and adverbs contribute to the score;
            # verbs and tags with no WordNet equivalent are skipped.
            if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
                continue

            lemma = lemmatizer.lemmatize(word, pos=wn_tag)
            if not lemma:
                continue

            synsets = wn.synsets(lemma, pos=wn_tag)
            if not synsets:
                continue

            # Take the first sense, treated here as the most common one.
            swn_synset = swn.senti_synset(synsets[0].name())

            sentiment += swn_synset.pos_score() - swn_synset.neg_score()
            scored_any = True

    # Judgment call: with nothing to score there is no evidence either way,
    # so fall back to the caller-selectable default label.
    if not scored_any:
        return default

    # A total of exactly zero is treated as positive.
    return 'positive' if sentiment >= 0 else 'negative'


In [5]:
 
# Spot-check the first five reviews: each line shows the predicted label
# followed by the gold label.
for review, gold_label in zip(test_reviews[:5], test_sentiments[:5]):
    print(swn_polarity(review), gold_label)



negative positive
positive positive
positive positive
positive negative
positive positive


In [6]:
from sklearn.metrics import accuracy_score
# Classify every review, then report the fraction of predictions that
# match the gold labels.
pred_y = list(map(swn_polarity, test_reviews))

overall_accuracy = accuracy_score(test_sentiments, pred_y)
print(overall_accuracy)

0.66632


In [63]:
from sklearn.metrics import confusion_matrix
# Fix the label order explicitly so the matrix layout does not depend on
# the sorted order of the labels: index 0 is "positive", index 1 is "negative".
label_order = ["positive", "negative"]
confusion_matrix(test_sentiments, pred_y, labels=label_order)

array([[5429, 2081],
       [2885, 4605]])