In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
emotions = pd.read_csv('../Data/text_emotion.csv')
emotions_copy = emotions.copy()
emotions.head()

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...


In [3]:
# auxiliar function to remove a pattern defined by a regular expression 
def remove_by_regex(tweet, regexp):
        return re.sub(regexp, '', tweet)

# 3 specific cleaning functions to remove numbers, url's and special characters
def remove_numbers(tweet):
    return remove_by_regex(tweet, re.compile(r"[1234567890]"))

def remove_url(tweet):
    return remove_by_regex(tweet, re.compile(r"http.?://[^\s]+[\s]?"))

def remove_special_char(tweet):
    return re.sub(r"[^a-zA-Z0-9 ]", " ", tweet) #add space placeholder

# general cleaning function to do it all at once
def clean_up(tweet):
    tweet = remove_numbers(tweet)
    tweet = remove_url(tweet)
    tweet = remove_special_char(tweet)
    return tweet.lower().strip()

In [4]:
emotions["content"] = emotions["content"].apply(clean_up)

In [5]:
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

In [6]:
def stem(x):
    output = []
    for w in x:
        output.append(stemmer.stem(w))
    return output

emotions["content"] = emotions["content"].apply((stem))

In [7]:
def lemm(x):
    output = []
    for w in x:
        output.append(lemmatizer.lemmatize(w))
    return output
emotions["content"] = emotions["content"].apply((lemm))

In [8]:
stopwords_list = stopwords.words("english")

In [9]:
def stopword(x):
    output = []
    for w in x:
        if w not in stopwords_list:
            output.append(w)
    return output

emotions["content"] = emotions["content"].apply((stopword))

In [10]:
all_words = []

for index, value in emotions["content"].iteritems():
    all_words += value

In [11]:
num_features = 5000
top_features=[x[0] for x in nltk.FreqDist(all_words).most_common(num_features)]

In [12]:
def build_features(words):
    features = {}
    for w in top_features:
        features[w] = (w in words)
    return features

In [13]:
featuresets = []

for index, row in emotions.iterrows():
    featuresets.append((build_features(row["content"])))

In [18]:
training, test = train_test_split(featuresets, test_size=0.2)
classifier= nltk.NaiveBayesClassifier.train(training)
classifier.show_most_informative_features()
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, test))*100.00)

ValueError: too many values to unpack (expected 2)

In [None]:
negative_lst = emotions.loc[emotions['sentiment']] == ['worry', 'sadness', 'hate', 'empty', 'boredom', 'anger']
negative_lst