In [53]:
import nltk

In [54]:
# The punkt module is a pre-trained model that helps you tokenize words and sentences.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [55]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [56]:
from nltk.corpus import stopwords
stopwords.words('english')[0:10] # Show some stop words

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [57]:
import string
mess = 'Sample message! Notice: it has punctuation.'
# Check characters to see if they are in punctuation
nopunc = [char for char in mess if char not in string.punctuation]
# Join the characters again to form the string.
nopunc = ''.join(nopunc)

In [58]:
# Now just remove any stopwords
clean_mess = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
clean_mess

['Sample', 'message', 'Notice', 'punctuation']

In [59]:
from nltk.corpus import stopwords
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')

In [60]:
def text_process(mess):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns a list of the cleaned text
    """
    # Check characters to see if they are in punctuation
    nopunc = [char for char in mess if char not in string.punctuation]
    # Join the characters again to form the string.
    nopunc = ''.join(nopunc) 
    # Now just remove any stopwords
    return [word for word in nopunc.split() if word.lower() not in set(all_stopwords)]

In [61]:
import pandas as pd
df=pd.read_csv("/content/drive/My Drive/NLP/Whatsapp status emotion prediction/emotion.csv")
df.head(2)

Unnamed: 0,content,sentiment
0,"Sometimes I’m not angry, I’m hurt and there’s ...",angry
1,Not available for busy people☺,angry


In [62]:
# Check to make sure its working
df['content'].head(5).apply(text_process)

0    [Sometimes, I’m, not, angry, I’m, hurt, there’...
1                      [Not, available, busy, people☺]
2    [not, exist, impress, world, exist, live, life...
3    [Everything, getting, expensive, except, peopl...
4                 [phone, screen, brighter, future, 🙁]
Name: content, dtype: object

In [63]:
df['length'] = df['content'].apply(len)
df.head(3)

Unnamed: 0,content,sentiment,length
0,"Sometimes I’m not angry, I’m hurt and there’s ...",angry,63
1,Not available for busy people☺,angry,30
2,I do not exist to impress the world. I exist t...,angry,94


In [64]:
from sklearn.feature_extraction.text import CountVectorizer
# Might take awhile...
bow_transformer = CountVectorizer(analyzer=text_process).fit(df['content'])
# Print total number of vocab words
print(len(bow_transformer.vocabulary_))

3501


In [65]:
message4 = df['content'][3]
print(message4)

Everything is getting expensive except some people, they are getting cheaper.


In [66]:
bow4 = bow_transformer.transform([message4])
print(bow4)
print(bow4.shape)

  (0, 374)	1
  (0, 1580)	1
  (0, 1912)	1
  (0, 1925)	1
  (0, 2088)	2
  (0, 2656)	1
(1, 3501)


In [67]:
messages_bow = bow_transformer.transform(df['content'])

In [68]:
print('Shape of Sparse Matrix: ', messages_bow.shape)
print('Amount of Non-Zero occurences: ', messages_bow.nnz)

Shape of Sparse Matrix:  (2039, 3501)
Amount of Non-Zero occurences:  115236


In [69]:
sparsity = (100.0 * messages_bow.nnz / (messages_bow.shape[0] * messages_bow.shape[1]))
print('sparsity: {}'.format(round(sparsity)))

sparsity: 2


In [70]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer().fit(messages_bow)
tfidf4 = tfidf_transformer.transform(bow4)
print(tfidf4)

  (0, 2656)	0.21772366070494964
  (0, 2088)	0.6670522607225847
  (0, 1925)	0.3878565088772852
  (0, 1912)	0.3878565088772852
  (0, 1580)	0.415724819875675
  (0, 374)	0.18424228672188983


In [71]:
messages_tfidf = tfidf_transformer.transform(messages_bow)
print(messages_tfidf.shape)

(2039, 3501)


In [72]:
from sklearn.naive_bayes import MultinomialNB
detect_model = MultinomialNB().fit(messages_tfidf, df['sentiment'])

In [73]:
print('predicted:', detect_model.predict(tfidf4)[0])
print('expected:', df.sentiment[3])

predicted: angry
expected: angry


In [74]:
all_predictions =detect_model.predict(messages_tfidf)
print(all_predictions)

['angry' 'angry' 'happy' ... 'sad' 'happy' 'sad']


In [75]:
from sklearn.metrics import classification_report
print (classification_report(df['sentiment'], all_predictions))

              precision    recall  f1-score   support

       angry       0.96      0.88      0.92       696
       happy       0.75      0.97      0.85       708
         sad       0.94      0.72      0.81       635

    accuracy                           0.86      2039
   macro avg       0.88      0.86      0.86      2039
weighted avg       0.88      0.86      0.86      2039



In [76]:
from sklearn.model_selection import train_test_split
msg_train, msg_test, label_train, label_test = \
train_test_split(df['content'], df['sentiment'], test_size=0.2)
print(len(msg_train), len(msg_test), len(msg_train) + len(msg_test))

1631 408 2039


In [77]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

In [78]:
pipeline.fit(msg_train,label_train)

Pipeline(memory=None,
         steps=[('bow',
                 CountVectorizer(analyzer=<function text_process at 0x7f8de5f0b158>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('classifier',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [79]:
predictions = pipeline.predict(msg_test)
print(classification_report(predictions,label_test))

              precision    recall  f1-score   support

       angry       0.66      0.85      0.74       118
       happy       0.82      0.59      0.69       201
         sad       0.54      0.67      0.60        89

    accuracy                           0.68       408
   macro avg       0.67      0.70      0.68       408
weighted avg       0.71      0.68      0.68       408



In [80]:
t="mind say move on"

In [81]:
message4 ='mind say move on but heart say hold on'
print(message4)
bow4 = bow_transformer.transform([message4])
tfidf4 = tfidf_transformer.transform(bow4)
print('predicted:', detect_model.predict(tfidf4)[0])


mind say move on but heart say hold on
predicted: happy


In [82]:
message4 ="it was a beautiful"
print(message4)
bow4 = bow_transformer.transform([message4])
tfidf4 = tfidf_transformer.transform(bow4)
print('predicted:', detect_model.predict(tfidf4)[0])


it was a beautiful
predicted: happy


In [83]:
message4 ='she had sad eyes'
print(message4)
bow4 = bow_transformer.transform([message4])
tfidf4 = tfidf_transformer.transform(bow4)
print('predicted:', detect_model.predict(tfidf4)[0])


she had sad eyes
predicted: sad


In [84]:
message4 ="she gave her companion an angry glance"
print(message4)
bow4 = bow_transformer.transform([message4])
tfidf4 = tfidf_transformer.transform(bow4)
print('predicted:', detect_model.predict(tfidf4)[0])


she gave her companion an angry glance
predicted: angry
