# NLTK

In [31]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk import sent_tokenize, word_tokenize, pos_tag
from sklearn.metrics import accuracy_score

dataset = pd.read_csv(r'Airline-Sentiment-2-w-AA.csv')
 
conteudoTextual = np.array(dataset['text']) #Selecionado os textos em um array
sentimentos = np.array(dataset['airline_sentiment']) #Sentimento sobre as linhas aéreas

sample_docs = [100, 5817, 7626, 7356, 1008, 7155, 3533, 13010]
sample_data = [(conteudoTextual[index],
                sentimentos[index])
                for index in sample_docs]

lemmatizer = WordNetLemmatizer()
 
def penn_to_wn(tag):

    #Convert between the PennTreebank tags to simple Wordnet tags
    
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    elif tag.startswith('V'):
        return wn.VERB
    return None
 
def clean_text(text):
    text = text.replace("<br />", " ")
    #text = text.decode("utf-8")
 
    return text
 
def swn_polarity(text):
    
    #Return a sentiment polarity: 0 = negative, 1 = positive
    
 
    sentiment = 0.0
    tokens_count = 0
 
    text = clean_text(text)
 
 
    raw_sentences = sent_tokenize(text)
    for raw_sentence in raw_sentences:
        tagged_sentence = pos_tag(word_tokenize(raw_sentence))
 
        for word, tag in tagged_sentence:
            wn_tag = penn_to_wn(tag)
            if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
                continue
 
            lemma = lemmatizer.lemmatize(word, pos=wn_tag)
            if not lemma:
                continue
 
            synsets = wn.synsets(lemma, pos=wn_tag)
            if not synsets:
                continue
 
            # Take the first sense, the most common
            synset = synsets[0]
            swn_synset = swn.senti_synset(synset.name())
 
            sentiment += swn_synset.pos_score() - swn_synset.neg_score()
            tokens_count += 1
 
    # judgment call ? Default to positive or negative
    if not tokens_count:
        return 0
 
    # sum greater than 0 => positive sentiment
    if sentiment >= 0:
        return 'positive'
 
    # negative sentiment
    return 'negative'

pred_y = [swn_polarity(text) for text in conteudoTextual]
 
print(accuracy_score(sentimentos, pred_y))

0.4146857923497268


In [35]:
from sklearn.metrics import confusion_matrix
confusion_matrix(sentimentos, pred_y, labels=["positive", "negative"])

array([[1900,  355],
       [4956, 4171]], dtype=int64)