In [44]:
import pandas as pd
import numpy as np

# Load the dataset: 50,000 texts labeled as positive or negative.

dataset = pd.read_csv(r'movie_reviews.csv')


# Preview the first rows together with the column titles.
print(dataset.head())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


Divisão do dataset em treino e teste

In [30]:
# Number of leading rows used for training; every remaining row goes to the test set.
# Keeping the split point in one constant prevents the two slices from drifting apart.
TRAIN_SIZE = 35000

train_data = dataset.iloc[:TRAIN_SIZE]  # first 35,000 rows for training
test_data = dataset.iloc[TRAIN_SIZE:]   # remaining 15,000 rows for testing

Separa as colunas de texto (review) e de rótulo (sentiment) do conjunto de teste

In [31]:
# Pull the text and label columns of the test split out as NumPy arrays.
test_reviews, test_sentiments = (
    np.array(test_data[column]) for column in ('review', 'sentiment')
)

Uma amostragem para facilitar o estudo...

In [33]:
# Positions (within the test arrays) of a few hand-picked reviews to inspect.
sample_docs = [100, 5817, 7626, 7356, 1008, 7155, 3533, 13010]

# Pair each selected review with its label as (review, sentiment) tuples.
sample_data = list(zip(test_reviews[sample_docs], test_sentiments[sample_docs]))

sample_data


 

[("Worst movie, (with the best reviews given it) I've ever seen. Over the top dialog, acting, and direction. more slasher flick than thriller.With all the great reviews this movie got I'm appalled that it turned out so silly. shame on you martin scorsese",
  'negative'),
 ('I hope this group of film-makers never re-unites.', 'negative'),
 ('no comment - stupid movie, acting average or worse... screenplay - no sense at all... SKIP IT!',
  'negative'),
 ('Add this little gem to your list of holiday regulars. It is<br /><br />sweet, funny, and endearing',
  'positive'),
 ('a mesmerizing film that certainly keeps your attention... Ben Daniels is fascinating (and courageous) to watch.',
  'positive'),
 ('This movie is perfect for all the romantics in the world. John Ritter has never been better and has the best line in the movie! "Sam" hits close to home, is lovely to look at and so much fun to play along with. Ben Gazzara was an excellent cast and easy to fall in love with. I\'m sure I\'ve

In [57]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk import sent_tokenize, word_tokenize, pos_tag
 
 
# Single lemmatizer instance, created once and reused by swn_polarity for every word.
lemmatizer = WordNetLemmatizer()
 
 
def penn_to_wn(tag):
    """Map a Penn Treebank POS tag to the matching WordNet POS constant.

    Tags starting with 'J', 'N', 'R' or 'V' map to ``wn.ADJ``, ``wn.NOUN``,
    ``wn.ADV`` and ``wn.VERB`` respectively. Any other tag yields ``None``.
    """
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
        ('V', 'VERB'),
    )
    for prefix, attr_name in prefix_to_attr:
        if tag.startswith(prefix):
            # Looked up lazily so unmapped tags never touch the wordnet corpus object.
            return getattr(wn, attr_name)
    return None
 
 

def clean_text(text):
    """Return ``text`` with every ``<br />`` tag replaced by a single space."""
    pieces = text.split("<br />")
    return " ".join(pieces)
 
 
def swn_polarity(text):
    """Classify ``text`` as 'positive' or 'negative' using SentiWordNet scores.

    The text is cleaned with ``clean_text``, split into sentences, tokenized and
    POS-tagged. Every noun, adjective or adverb that has at least one WordNet
    synset adds ``pos_score - neg_score`` of its first synset to a running total.

    Args:
        text: Raw review text (may contain ``<br />`` tags).

    Returns:
        str: 'positive' when the total is >= 0, 'negative' otherwise. A text
        with no scoreable word keeps a total of 0.0 and is therefore labeled
        'positive', the same as any other tie. The function always returns a
        string label, so predictions can be compared directly with the
        dataset's 'positive' / 'negative' labels.
    """
    sentiment = 0.0

    for raw_sentence in sent_tokenize(clean_text(text)):
        tagged_sentence = pos_tag(word_tokenize(raw_sentence))

        for word, tag in tagged_sentence:
            wn_tag = penn_to_wn(tag)
            # Only nouns, adjectives and adverbs are scored; verbs and
            # unmapped tags are skipped.
            if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
                continue

            lemma = lemmatizer.lemmatize(word, pos=wn_tag)
            if not lemma:
                continue

            synsets = wn.synsets(lemma, pos=wn_tag)
            if not synsets:
                continue

            # Take the first sense, the most common
            swn_synset = swn.senti_synset(synsets[0].name())
            sentiment += swn_synset.pos_score() - swn_synset.neg_score()

    # Previously a text with no scored token returned the integer 0, which mixed
    # ints with string labels in the prediction list. The tie rule now covers it.
    return 'positive' if sentiment >= 0 else 'negative'


In [58]:
 
# Spot-check: print (predicted, actual) for the first five test reviews.
# Recorded results, in order:
#   positive negative
#   positive positive
#   positive negative
#   positive positive
#   negative positive
for position in range(5):
    print(swn_polarity(test_reviews[position]), test_sentiments[position])



positive negative
positive positive
positive negative
positive positive
negative positive


In [62]:
from sklearn.metrics import accuracy_score
# Predict a label for every test review, then report the share that matches the true label.
pred_y = list(map(swn_polarity, test_reviews))

print(accuracy_score(test_sentiments, pred_y))  # 0.6689333

0.668933333333


In [63]:
from sklearn.metrics import confusion_matrix
# Per scikit-learn's convention, rows are the true labels and columns the predicted
# labels, both ordered as given in `labels` (positive first, then negative).
confusion_matrix(test_sentiments, pred_y, labels=["positive", "negative"])

array([[5429, 2081],
       [2885, 4605]])