In [1]:
import pandas as pd
import numpy as np
import sklearn

In [2]:
data = pd.read_csv('text_emotion.csv')

In [3]:
data = data.drop('author', axis=1)

In [4]:
data.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [5]:
# TEXT PRE PROCESSING

#lowercase
data['content'] = data['content'].apply(lambda x: " ".join(x.lower() for x in x.split()))

#remove punctuations
data['content'] = data['content'].str.replace('[^\w\s]',' ')

In [6]:
# removing stopwords
from nltk.corpus import stopwords

stop = stopwords.words('english')

data['content'] = data['content'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

In [7]:
data.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,tiffanylue know listenin bad habit earlier sta...
1,1956967666,sadness,layin n bed headache ughhhh waitin call
2,1956967696,sadness,funeral ceremony gloomy friday
3,1956967789,enthusiasm,wants hang friends soon
4,1956968416,neutral,dannycastillo want trade someone houston ticke...


In [8]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

data['content'] = data['content'].apply(lambda x: " ".join(ps.stem(word) for word in x.split()))
#[stemmer.stem(plural) for plural in plurals]

In [9]:
data.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,tiffanylu know listenin bad habit earlier star...
1,1956967666,sadness,layin n bed headach ughhhh waitin call
2,1956967696,sadness,funer ceremoni gloomi friday
3,1956967789,enthusiasm,want hang friend soon
4,1956968416,neutral,dannycastillo want trade someon houston ticket...


In [10]:
#REMOVING LEAST OCCURENCE OF 10000 words

freq = pd.Series(' '.join(data['content']).split()).value_counts()[-10000:]
freq = list(freq.index)
data['content'] = data['content'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))

In [11]:
#Encoding output labels 'sadness' as '1' & 'happiness' as '0'
from sklearn import preprocessing
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(data.sentiment.values)
# Splitting into training and testing data in 90:10 ratio
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(data.content.values, y, stratify=y, random_state=42, test_size=0.1, shuffle=True)

In [12]:
# Extracting Count Vectors Parameters
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(analyzer='word')
count_vect.fit(data['content'])
X_train_count =  count_vect.transform(X_train)
X_val_count =  count_vect.transform(X_val)

In [13]:
from sklearn.metrics import accuracy_score

In [14]:
# Model 1: Multinomial Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train_count, y_train)
y_pred = nb.predict(X_val_count)
print('naive bayes count vectors accuracy %s' % accuracy_score(y_pred, y_val))

print("----------------------------------------------")
# Model 2: Linear SVM
from sklearn.linear_model import SGDClassifier
lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(X_train_count, y_train)
y_pred = lsvm.predict(X_val_count)
print('lsvm using count vectors accuracy %s' % accuracy_score(y_pred, y_val))

print("----------------------------------------------")
# Model 3: Logistic Regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1)
logreg.fit(X_train_count, y_train)
y_pred = logreg.predict(X_val_count)
print('log reg count vectors accuracy %s' % accuracy_score(y_pred, y_val))

naive bayes count vectors accuracy 0.31775
----------------------------------------------
lsvm using count vectors accuracy 0.32625
----------------------------------------------




log reg count vectors accuracy 0.3525
