## **Emotion Detection in Twitter Data using NLP Techniques**

In [1]:
# DATA HANDLING
import numpy as np
import pandas as pd

In [2]:
data=pd.read_csv('/content/tweet_emotions.csv')

In [4]:
data.head(10)

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...
5,1956968477,worry,Re-pinging @ghostridah14: why didn't you go to...
6,1956968487,sadness,"I should be sleep, but im not! thinking about ..."
7,1956968636,worry,Hmmm. http://www.djhero.com/ is down
8,1956969035,sadness,@charviray Charlene my love. I miss you
9,1956969172,sadness,@kelcouch I'm sorry at least it's Friday?


In [6]:
data.shape

(40000, 3)

In [7]:
data['sentiment'].value_counts()

neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: sentiment, dtype: int64

There are 13 different emotions; therefore there will 13 classes  in the target column

In [8]:
data.drop('tweet_id',axis=1,inplace=True) # dropping since not important in the process

In [9]:
data.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [10]:
data.isna().sum() # finding NaN

sentiment    0
content      0
dtype: int64

Encoding the target column

In [11]:
from sklearn.preprocessing import LabelEncoder # Encoding the target column
le=LabelEncoder()
data['sentiment']=le.fit_transform(data['sentiment'])


In [12]:
data.head()

Unnamed: 0,sentiment,content
0,2,@tiffanylue i know i was listenin to bad habi...
1,10,Layin n bed with a headache ughhhh...waitin o...
2,10,Funeral ceremony...gloomy friday...
3,3,wants to hang out with friends SOON!
4,8,@dannycastillo We want to trade with someone w...


In [13]:
data['sentiment'].value_counts()

8     8638
12    8459
5     5209
10    5165
7     3842
11    2187
4     1776
9     1526
6     1323
2      827
3      759
1      179
0      110
Name: sentiment, dtype: int64

     
worry=12        
surprise =11     
sadness=10       
relief=9
neutral=8
love=7      
hate=6
happiness=5        
fun=4      
enthusiasm=3  
empty=2   
boredom=1        
anger= 0       


PRE-PROCESSING

In [34]:
# removing punctuations
import string
def remove_punct(text):
  punc_free = ''.join([i for i in text if i not in string.punctuation])
  return punc_free

In [35]:
# tokenization
import nltk
nltk.download('punkt')
def tokenization(text):
  words = nltk.word_tokenize(text)
  return words

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [36]:
# removing stopwords
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
def remove_stopwords(text):
  output = [i for i in text if i not in stopwords]
  return output

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [37]:
# lemmatization
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
wordnet_lemm = WordNetLemmatizer()
def lemm(text):
  lemm_text = [wordnet_lemm.lemmatize(word) for word in text]
  return lemm_text

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [38]:
def preprocess(data_col):
  corpus = []
  for item in data_col:
    new_item = remove_punct(item)
    new_item = new_item.lower()
    new_item = tokenization(new_item)
    new_tem = remove_stopwords(new_item)
    new_item = lemm(new_item)
    corpus.append(' '.join(str(x) for x in new_item))
  return corpus

In [39]:
corpus = preprocess(data['content']) # function call for preprocessing

In [52]:
corpus[1:10]

['layin n bed with a headache ughhhhwaitin on your call',
 'funeral ceremonygloomy friday',
 'want to hang out with friend soon',
 'dannycastillo we want to trade with someone who ha houston ticket but no one will',
 'repinging ghostridah14 why didnt you go to prom bc my bf didnt like my friend',
 'i should be sleep but im not thinking about an old friend who i want but he married now damn amp he want me 2 scandalous',
 'hmmm httpwwwdjherocom is down',
 'charviray charlene my love i miss you',
 'kelcouch im sorry at least it friday']

Feature extraction using Countvectorizer

In [68]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(1,2))
train_data = cv.fit_transform(corpus)

Model building and evaluation:
1. Random forest classifier

In [69]:
x = train_data
y = data['sentiment']

In [70]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25)

In [71]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(x_train,y_train)

In [73]:
from sklearn.metrics import accuracy_score
y_pred1=clf.predict(x_test)
accuracy_score(y_test,y_pred1)

0.33

2. XGBOOST Classifier

In [76]:
from xgboost import XGBClassifier
xgbclf = XGBClassifier()
xgbclf.fit(x_train,y_train)

In [77]:
y_pred2=xgbclf.predict(x_test)
accuracy_score(y_test,y_pred2)

0.3532

TESTING THE BETTER MODEL- XGB USING NEW DATA

In [80]:
def find_sentiment(input):
  input = cv.transform(preprocess(input))
  ot_pred = xgbclf.predict(input)
  out1= le.inverse_transform(ot_pred)
  print(out1)

In [81]:
input=['I am really sad and gloomy']
find_sentiment(input)

['sadness']


In [82]:
input=['I am happy and positive']
find_sentiment(input)

['happiness']


**** ACCURACY CAN BE INCREASED BY USING RNN

In [83]:
from keras.preprocessing import text
tokenizer=text.Tokenizer()

In [85]:
tokenizer.fit_on_texts(list(data['content'])) #tokenizing
tokenized_text=tokenizer.texts_to_sequences(data['content'])
from keras.utils import pad_sequences #padding
X=pad_sequences(tokenized_text,maxlen=100)
Y=data['sentiment']
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2)

In [86]:
from keras.models import Sequential
from keras.layers import Dense, LSTM,Embedding,Dropout

In [87]:
model=Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1,output_dim=128,input_length=100))

In [88]:
#using softmax and categorical_crossentropy since multiclass classification
model.add(LSTM(100))
model.add(Dropout(0.5))
model.add(Dense(50,activation='relu'))
model.add(Dense(13,activation='softmax'))

In [89]:
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics='accuracy')
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 128)          6271744   
                                                                 
 lstm_1 (LSTM)               (None, 100)               91600     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 dense_2 (Dense)             (None, 50)                5050      
                                                                 
 dense_3 (Dense)             (None, 13)                663       
                                                                 
Total params: 6369057 (24.30 MB)
Trainable params: 6369057 (24.30 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [90]:
history=model.fit(X_train,Y_train,epochs=10,validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
 78/900 [=>............................] - ETA: 2:41 - loss: 0.3681 - accuracy: 0.8906

KeyboardInterrupt: 

In [None]:
y_pred1=model.predict(X_test) # model prediction

In [None]:
y_preds = np.argmax(y_pred1,axis=1)
out_response = le.inverse_transform(y_preds) #decoding the prediction response
accuracy_score(y_preds,Y_test)

In [None]:
test_text='I hate my life'
text1=tokenizer.texts_to_sequences([test_text])
text1=pad_sequences(text1,maxlen=100)
output=model.predict(text1)
output=np.argmax(output,axis=1)

In [None]:
out1= le.inverse_transform(output)
print('Response :',out1)