In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [10]:
data = pd.read_csv('text_emotion.csv')

In [11]:
data.head()

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...


In [12]:
data = data.drop('author', axis=1)

In [13]:
data.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [14]:
data.shape

(40000, 3)

In [15]:
data.describe()

Unnamed: 0,tweet_id
count,40000.0
mean,1845184000.0
std,118857900.0
min,1693956000.0
25%,1751431000.0
50%,1855443000.0
75%,1962781000.0
max,1966441000.0


In [16]:
data.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [17]:
data['content'] = data['content'].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [18]:
data.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habit...
1,1956967666,sadness,layin n bed with a headache ughhhh...waitin on...
2,1956967696,sadness,funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends soon!
4,1956968416,neutral,@dannycastillo we want to trade with someone w...


In [19]:
data['content'] = data['content'].str.replace('[^\w\s]',' ')

In [20]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
data['content'] = data['content'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

In [21]:
data.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,tiffanylue know listenin bad habit earlier sta...
1,1956967666,sadness,layin n bed headache ughhhh waitin call
2,1956967696,sadness,funeral ceremony gloomy friday
3,1956967789,enthusiasm,wants hang friends soon
4,1956968416,neutral,dannycastillo want trade someone houston ticke...


In [22]:
from textblob import Word
data['content'] = data['content'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

In [23]:
#Correcting Letter Repetitions
import re
def de_repeat(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)
#%%
data['content'] = data['content'].apply(lambda x: " ".join(de_repeat(x) for x in x.split()))

In [24]:
data.tail()

Unnamed: 0,tweet_id,sentiment,content
39995,1753918954,neutral,johnlloydtaylor
39996,1753919001,love,happy mother day love
39997,1753919005,love,happy mother day mommy woman man long momma so...
39998,1753919043,happiness,niariley wassup beautiful follow peep new hit ...
39999,1753919049,love,mopedronin bullet train tokyo gf visiting japa...


In [25]:
freq = pd.Series(' '.join(data['content']).split()).value_counts()[-10000:]

In [26]:
type(freq)

pandas.core.series.Series

In [27]:
freq = list(freq.index)
data['content'] = data['content'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))

In [28]:
#Encoding output labels 'sadness' as '1' & 'happiness' as '0'
from sklearn import preprocessing
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(data.sentiment.values)
# Splitting into training and testing data in 90:10 ratio
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(data.content.values, y, stratify=y, random_state=42, test_size=0.1, shuffle=True)

In [29]:
#TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=1000, analyzer='word',ngram_range=(1,3))
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.fit_transform(X_val)

In [30]:
# Extracting Count Vectors Parameters
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(analyzer='word')
count_vect.fit(data['content'])
X_train_count =  count_vect.transform(X_train)
X_val_count =  count_vect.transform(X_val)

In [31]:
from sklearn.metrics import accuracy_score

In [32]:
#MODEL 1: Multinomial NB

from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

nb.fit(X_train_tfidf, y_train)

y_nb_pred = nb.predict(X_val_tfidf)
print('naive bayes tfidf accuracy %s' % accuracy_score(y_nb_pred, y_val))

naive bayes tfidf accuracy 0.217


In [33]:
#MODEL 2: Linear SVM
from sklearn.linear_model import SGDClassifier
svc = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)

svc.fit(X_train_tfidf, y_train)

y_svc_pred = svc.predict(X_val_tfidf)
print('SVC tfidf accuracy %s' % accuracy_score(y_svc_pred, y_val))

SVC tfidf accuracy 0.1905


In [34]:
# Model 3: logistic regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1)
logreg.fit(X_train_tfidf, y_train)
y_log_pred = logreg.predict(X_val_tfidf)
print('log reg tfidf accuracy %s' % accuracy_score(y_log_pred, y_val))



log reg tfidf accuracy 0.2295


In [35]:
# Model 4: Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=500)
rf.fit(X_train_tfidf, y_train)
y_rf_pred = rf.predict(X_val_tfidf)
print('random forest tfidf accuracy %s' % accuracy_score(y_rf_pred, y_val))

random forest tfidf accuracy 0.21475


In [37]:
# Model 1: Multinomial Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train_count, y_train)
y_pred = nb.predict(X_val_count)
print('naive bayes count vectors accuracy %s' % accuracy_score(y_pred, y_val))

print("----------------------------------------------")
# Model 2: Linear SVM
from sklearn.linear_model import SGDClassifier
lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(X_train_count, y_train)
y_pred = lsvm.predict(X_val_count)
print('lsvm using count vectors accuracy %s' % accuracy_score(y_pred, y_val))

print("----------------------------------------------")
# Model 3: Logistic Regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1)
logreg.fit(X_train_count, y_train)
y_pred = logreg.predict(X_val_count)
print('log reg count vectors accuracy %s' % accuracy_score(y_pred, y_val))

naive bayes count vectors accuracy 0.33
----------------------------------------------
lsvm using count vectors accuracy 0.33525
----------------------------------------------




log reg count vectors accuracy 0.345


In [38]:
data['sentiment'].value_counts()

neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: sentiment, dtype: int64

## USING NEURAL NETWORKS

In [3]:
import keras

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [5]:
from keras.models import Sequential
model = Sequential()

In [6]:
from keras.layers import Activation, Dense

In [39]:
model.add(Dense(64, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(13, activation='softmax'))

In [43]:
model.compile(loss=keras.losses.CategoricalCrossentropy,
             optimizer=keras.optimizers.Adam(),
             metrics=['accuracy'])

In [44]:
model.fit(X_train_count, y_train, epochs=5, verbose=1)

ValueError: Please provide as model inputs either a single array or a list of arrays. You passed: x=  (0, 2503)	1
  (0, 3405)	1
  (0, 3949)	1
  (0, 9969)	1
  (0, 13316)	2
  (0, 19631)	2
  (0, 24578)	1
  (0, 30862)	1
  (0, 33900)	1
  (0, 34145)	1
  (1, 3703)	1
  (1, 10815)	1
  (1, 18357)	1
  (1, 25005)	2
  (1, 33264)	1
  (2, 3150)	1
  (2, 4308)	1
  (2, 5200)	1
  (2, 7359)	1
  (2, 8406)	1
  (2, 9909)	1
  (2, 11046)	1
  (2, 13490)	1
  (2, 18500)	1
  (2, 19892)	1
  :	:
  (35996, 28628)	1
  (35996, 33050)	1
  (35996, 34084)	1
  (35997, 7322)	1
  (35997, 7370)	1
  (35997, 11223)	1
  (35997, 12811)	1
  (35997, 21876)	1
  (35997, 22557)	1
  (35997, 24852)	1
  (35997, 33278)	1
  (35997, 33849)	1
  (35998, 12087)	1
  (35998, 25401)	1
  (35998, 30889)	1
  (35998, 33268)	1
  (35999, 1795)	1
  (35999, 5087)	1
  (35999, 12031)	1
  (35999, 13142)	1
  (35999, 21013)	1
  (35999, 27181)	1
  (35999, 31481)	1
  (35999, 32130)	1
  (35999, 33278)	1