In [5]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib as plt
import graphviz
from pprint import pprint
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import sent_tokenize, word_tokenize, WordPunctTokenizer, TweetTokenizer
from nltk import pos_tag

from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, cross_val_predict
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

%matplotlib inline

In [4]:
!pip install graphviz


Collecting graphviz
  Using cached https://files.pythonhosted.org/packages/94/cd/7b37f2b658995033879719e1ea4c9f171bf7a14c16b79220bd19f9eda3fe/graphviz-0.13-py2.py3-none-any.whl
Installing collected packages: graphviz
Successfully installed graphviz-0.13


In [6]:
data_path = 'E:/AI projects/Sentiment Analysis/text_emotion.csv'

In [7]:
df_init = pd.read_csv(data_path)

In [8]:
df_init

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...
5,1956968477,worry,xxxPEACHESxxx,Re-pinging @ghostridah14: why didn't you go to...
6,1956968487,sadness,ShansBee,"I should be sleep, but im not! thinking about ..."
7,1956968636,worry,mcsleazy,Hmmm. http://www.djhero.com/ is down
8,1956969035,sadness,nic0lepaula,@charviray Charlene my love. I miss you
9,1956969172,sadness,Ingenue_Em,@kelcouch I'm sorry at least it's Friday?


In [9]:
df_init.groupby('sentiment')['content'].count().sort_values()

sentiment
anger          110
boredom        179
enthusiasm     759
empty          827
hate          1323
relief        1526
fun           1776
surprise      2187
love          3842
sadness       5165
happiness     5209
worry         8459
neutral       8638
Name: content, dtype: int64

In [10]:
emotions_set = set(df_init['sentiment'])

In [11]:
emotions_set

{'anger',
 'boredom',
 'empty',
 'enthusiasm',
 'fun',
 'happiness',
 'hate',
 'love',
 'neutral',
 'relief',
 'sadness',
 'surprise',
 'worry'}

In [12]:
keep_emotions_list = [
    'anger',
#     'boredom',
#     'empty',
#     'enthusiasm',
#     'fun',
    'happiness',
#     'hate',
#     'love',
#     'neutral',
#     'relief',
    'sadness',
    'surprise',
#     'worry'
]

In [13]:
df = df_init[df_init['sentiment'].isin(keep_emotions_list)]

In [14]:
df

Unnamed: 0,tweet_id,sentiment,author,content
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
6,1956968487,sadness,ShansBee,"I should be sleep, but im not! thinking about ..."
8,1956969035,sadness,nic0lepaula,@charviray Charlene my love. I miss you
9,1956969172,sadness,Ingenue_Em,@kelcouch I'm sorry at least it's Friday?
12,1956970047,sadness,Danied32,Ugh! I have to beat this stupid song to get to...
13,1956970424,sadness,Samm_xo,@BrodyJenner if u watch the hills in london u ...
14,1956970860,surprise,okiepeanut93,Got the news
15,1956971077,sadness,Sim_34,The storm is here and the electricity is gone
17,1956971206,sadness,brokenangel1982,So sleepy again and it's not even that late. I...


In [15]:
text_list = df.content.tolist()

In [16]:
text_list

['Layin n bed with a headache  ughhhh...waitin on your call...',
 'Funeral ceremony...gloomy friday...',
 "I should be sleep, but im not! thinking about an old friend who I want. but he's married now. damn, &amp; he wants me 2! scandalous!",
 '@charviray Charlene my love. I miss you',
 "@kelcouch I'm sorry  at least it's Friday?",
 'Ugh! I have to beat this stupid song to get to the next  rude!',
 '@BrodyJenner if u watch the hills in london u will realise what tourture it is because were weeks and weeks late  i just watch itonlinelol',
 'Got the news',
 'The storm is here and the electricity is gone',
 "So sleepy again and it's not even that late. I fail once again.",
 'How are YOU convinced that I have always wanted you? What signals did I give off...damn I think I just lost another friend',
 "so tired and i think i'm definitely going to get an ear infection.  going to bed &quot;early&quot; for once.",
 "@IsaacMascote  i'm sorry people are so rude to you, isaac, they should get some 

In [17]:
labels_list = df.sentiment.tolist()

In [18]:
labels_list

['sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'surprise',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'happiness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'happiness',
 'happiness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'happiness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'surprise',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'surprise',
 'surprise',
 'sadness',
 'surprise',
 'sadness',
 'surprise',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'surprise',
 'sadness',
 'sadness',
 'happiness',
 'happiness',
 'sadness',
 'sadness',
 'sadness',
 'happiness',
 'happiness',
 'sadness',
 'surprise',
 'surprise',
 'surprise',
 'sadness',
 '

In [19]:
emotions_list_plutchik = ['anger', 'anticipation', 'digust', 'fear', 'joy', 'sadness', 'surprise', 'trust']

In [20]:
negations_list = ['never','no','nothing','nowhere','noone','none','not','havent','hasnt','hadnt','cant',
                  'couldnt','shouldnt','wont','wouldnt','dont','doesnt','didnt','isnt','arent','aint',
                 'wasnt']

negations_list.extend(["haven't","hasn't","hadn't","can't","couldn't","shouldn't","won't","wouldn't",
                       "don't","doesn't","didn't","isn't","aren't",
                        "wasn't"])

In [21]:
def handle_negations(tokens):
    new_tokens = []
    prev_neg = False
    for token in tokens:
        current_token = token

        if prev_neg == True:
            current_token += "_NEG"
            prev_neg = False

        if token in negations_list:
            prev_neg = True

        new_tokens.append(current_token)
    return new_tokens

In [22]:
# universal_tags_keep = ['VERB', 'NOUN', 'ADJ']
# VERB (verbs), NOUN (nouns), PRON (pronouns), ADJ (adjectives), ADV (adverbs), ADP (adpositions ), CONJ (conjunctions), DET (determiners), NUM (cardinal numbers), PRT (particles ), . (punctuation) and X (other).

In [23]:
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
def custom_tokenizer(text):
    # Tokenize text
    tokens = tokenizer.tokenize(text)
    
    # Handle negations
    tokens_neg = handle_negations(tokens)
    
    # POS tagging
    tokens_tagged = pos_tag(tokens, tagset='universal')
    
    tokens = [ (tokens_neg[i],tagged) for token in enumerate(tokens)]
    return tokens

In [24]:
def custom_preprocessor(text):
    # Remove URLs
    text = re.sub(r'http\S+|https\S+', '', text)
    return text

In [25]:
vectorizer_options = dict(
#     min_df = 0.1,
#     max_df = 0.7,
#     stop_words = 'english',
    
    lowercase = False,
    tokenizer = custom_tokenizer,
    preprocessor = custom_preprocessor,
    ngram_range = (1, 2),
#     binary = True
)

# vectorizer = CountVectorizer(**vectorizer_options)
vectorizer = TfidfVectorizer(**vectorizer_options)

In [26]:
def create_featurevector(documents):
    tfidf_matrix = vectorizer.fit_transform(documents)
    return tfidf_matrix, vectorizer

In [27]:
train, vectorizer = create_featurevector(text_list)

NameError: name 'i' is not defined

In [28]:
train.shape

NameError: name 'train' is not defined