In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/cgrandhi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/cgrandhi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
data = pd.read_csv('text_emotion.csv')

In [7]:
data.head(2)

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...


#### Explore data

In [8]:
data = data.drop('author',axis=1)

In [9]:
#the total emotions
print(set(data.sentiment))
print(len(set(data.sentiment)))

{'boredom', 'fun', 'relief', 'neutral', 'happiness', 'anger', 'worry', 'sadness', 'surprise', 'love', 'hate', 'enthusiasm', 'empty'}
13


In [10]:
#dropping some of the emotions
data = data.drop(data[data.sentiment == 'empty'].index)
data = data.drop(data[data.sentiment == 'boredom'].index)
data = data.drop(data[data.sentiment == 'fun'].index)
data = data.drop(data[data.sentiment == 'worry'].index)
data = data.drop(data[data.sentiment == 'relief'].index)
data = data.drop(data[data.sentiment == 'enthusiasm'].index)
data = data.drop(data[data.sentiment == 'surprise'].index)
data = data.drop(data[data.sentiment == 'neutral'].index)
data = data.drop(data[data.sentiment == 'hate'].index)
data = data.drop(data[data.sentiment == 'love'].index)
data = data.drop(data[data.sentiment == 'anger'].index)
print(len(set(data.sentiment)))

2


In [11]:
data.head(5)

Unnamed: 0,tweet_id,sentiment,content
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
6,1956968487,sadness,"I should be sleep, but im not! thinking about ..."
8,1956969035,sadness,@charviray Charlene my love. I miss you
9,1956969172,sadness,@kelcouch I'm sorry at least it's Friday?


#### pre process the data

In [12]:
#make all lower case
data['content'] = data['content'].apply(lambda x: "".join(x.lower() for x in x.split()))

#remve punctuations
data['content'] = data['content'].str.replace('[^\w\s]',' ')

#remove stop words
from nltk.corpus import stopwords
stop = stopwords.words('english')
data['content'] = data['content'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

In [13]:
#lemmatisation - convert the words to root form

from textblob import Word

data['content'] = data['content'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

#correcting letter repetitions
import re
def de_repeat(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

In [14]:
#find unique words in the data- top 10,000 words and delete the rest
freq = pd.Series(' '.join(data['content']).split()).value_counts()[-10000:]

freq = list(freq.index)
data['content'] = data['content'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))

In [15]:
data.head(5)

Unnamed: 0,tweet_id,sentiment,content
1,1956967666,sadness,layinnbedwithaheadacheughhhh waitinonyourcall
2,1956967696,sadness,gloomyfriday
6,1956968487,sadness,ishouldbesleep butimnot buthe damn amp hewantsme2
8,1956969035,sadness,charviraycharlenemylove imissyou
9,1956969172,sadness,kelcouchi sfriday


#### Feature Extraction

In [16]:
#one hot encoding for the labels 
from sklearn import preprocessing
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(data.sentiment.values)

In [17]:
print(y, data.sentiment.values)

[1 1 1 ... 0 0 0] ['sadness' 'sadness' 'sadness' ... 'happiness' 'happiness' 'happiness']


In [19]:
#split into training and test data 
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(data.content.values, y, random_state=42, test_size=0.3, shuffle=True)

In [20]:
#extracting tf-idf parameters
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=1000, analyzer='word',ngram_range=(1,3))

X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.fit_transform(X_val)

In [21]:
#transform the words into array to get the number of times a word appears
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(analyzer='word')
count_vect.fit(data['content'])
X_train_count =  count_vect.transform(X_train)
X_val_count =  count_vect.transform(X_val)

#### Train the model

In [22]:
## Train the model
from sklearn.metrics import accuracy_score
from sklearn.linear_model import SGDClassifier 

lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(X_train_count, y_train)
y_pred = lsvm.predict(X_val_count)
print('accuracy %s' % accuracy_score(y_pred, y_val))

accuracy 0.5929971088981689


#### Prediction

In [24]:
tweets = pd.DataFrame(['I am very happy today! The atmosphere looks cheerful',
'Things are looking great. It was such a good day',
'Success is right around the corner. Lets celebrate this victory',
'Everything is more beautiful when you experience them with a smile!',
'Now this is my worst, okay? But I am gonna get better.',
'I am tired, boss. Tired of being on the road, lonely as a sparrow in the rain. I am tired of all the pain I feel',
'This is quite depressing. I am filled with sorrow',
'His death broke my heart. It was a sad day'])

In [25]:
text = 'This is quite depressing. I am filled with sorrow'
tweets = pd.DataFrame([text])

In [26]:
# Doing some preprocessing on these tweets as done before
tweets[0] = tweets[0].str.replace('[^\w\s]',' ')
from nltk.corpus import stopwords
stop = stopwords.words('english')
tweets[0] = tweets[0].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
from textblob import Word
tweets[0] = tweets[0].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
# Extracting Count Vectors feature from our tweets
tweet_count = count_vect.transform(tweets[0])
#Predicting the emotion of the tweet using our already trained linear SVM
tweet_pred = lsvm.predict(tweet_count)
print(tweet_pred)


[0]


In [28]:
if (tweet_pred == [0]):
    emotion = "sadness"
else:
    emotion = "happiness"

In [29]:
#save the tweet for the tacotron model to convert it to speech
import pickle
file_name = "tweet"
with open(file_name,'wb') as my_file_obj:
    pickle.dump(text, my_file_obj)

In [30]:
file_name = "emotion_detected"
with open(file_name,'wb') as my_file_obj:
    pickle.dump(emotion, my_file_obj)