In [8]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from textblob import Word
import re
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk import word_tokenize, sent_tokenize,pos_tag, wordpunct_tokenize
from nltk.corpus import stopwords
import string
nltk.download("stopwords")
nltk.download("wordnet")
from textblob import Word


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Brian\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Brian\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Brian\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Brian\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
data = pd.read_csv('text_emotion.csv')

In [10]:
data = data.drop('author', axis=1)

In [11]:
print(data.sentiment.unique())

['empty' 'sadness' 'enthusiasm' 'neutral' 'worry' 'surprise' 'love' 'fun'
 'hate' 'happiness' 'boredom' 'relief' 'anger']


In [12]:
# Dropping rows with other emotion label
data = data.drop(data[data.sentiment == 'boredom'].index)
data = data.drop(data[data.sentiment == 'empty'].index)
data = data.drop(data[data.sentiment == 'fun'].index)
data = data.drop(data[data.sentiment == 'relief'].index)
data = data.drop(data[data.sentiment == 'surprise'].index)
data = data.drop(data[data.sentiment == 'worry'].index)
data = data.drop(data[data.sentiment == 'enthusiasm'].index)
data = data.drop(data[data.sentiment == 'neutral'].index)
data = data.drop(data[data.sentiment == 'love'].index)
data = data.drop(data[data.sentiment == 'anger'].index)

In [13]:
print(data.sentiment.unique())

['sadness' 'hate' 'happiness']


In [45]:
# Making all letters lowercase
data['content'] = data['content'].apply(lambda x: " ".join(x.lower() for x in x.split()))

# Removing Punctuation, Symbols
data['content'] = data['content'].str.replace('[^\w\s]',' ')

# Removing Stop Words using NLTK
stop = stopwords.words('english')
data['content'] = data['content'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

#Lemmatisation
data['content'] = data['content'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

# remove any text starting with User... 
data['content'] = data['content'].map(lambda x: re.sub("\[\[User.*",'',str(x)))
    
# remove IP addresses or user IDs
data['content'] = data['content'].map(lambda x: re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}",'',str(x)))
    
#remove http links in the text
data['content'] = data['content'].map(lambda x: re.sub("(http://.*?\s)|(http://.*)",'',str(x)))

#Correcting Letter Repetitions

def de_repeat(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

data['content'] = data['content'].apply(lambda x: " ".join(de_repeat(x) for x in x.split()))

In [46]:
#Encoding output labels 'sadness' as '1' & 'happiness' as '0'
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(data.sentiment.values)

In [47]:
# Splitting into training and testing data in 90:10 ratio
X_train, X_test, y_train, y_test = train_test_split(data.content.values, y, stratify=y, random_state=42, test_size=0.1, shuffle=True)

In [48]:
#Bag of words
# Extracting features from text files
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
X_test_counts =count_vect.transform(X_test)
print('Shape of Term Frequency Matrix: ',X_train_counts.shape)

Shape of Term Frequency Matrix:  (10527, 8054)


In [49]:
# Code to find the top 10,000 rarest words appearing in the data
freq = pd.Series(' '.join(data['content']).split()).value_counts()[-10000:]

# Removing all those rarely appearing words from the data
freq = list(freq.index)
data['content'] = data['content'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))

In [50]:
# from sklearn.metrics import accuracy_score
# from sklearn.neural_network import MLPClassifier
# from sklearn.naive_bayes import MultinomialNB

**Working with TfidVectorizer**

In [76]:
# Using a pipeline

# Logistic Regression pipeline setup
logreg_pipe = Pipeline([
                     ('tvec', TfidfVectorizer()),
                     ('logreg', LogisticRegression())
                     ])

In [77]:
logreg_pipe.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('tvec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [78]:
# Setting params for TFIDF Vectorizer gridsearch
tf_params = {
 'tvec__max_features':[100, 2000],
 'tvec__ngram_range': [(1, 1), (1, 2), (2, 2)],
 'tvec__stop_words': [None, 'english'],
 
}

#Logistic Regression params
logreg_params = {
    'tvec__max_features':[100, 2000],
    'tvec__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'tvec__stop_words': [None, 'english'],
    'logreg__C': [1],
    'logreg__solver': ['lbfgs'],
    'logreg__multi_class': ['multinomial'],
    'logreg__max_iter': [1000],
    'logreg__penalty': ['l2']
}


In [79]:
# #Grid Search
from sklearn.model_selection import GridSearchCV
# Setting up GridSearch for Logistic Regression
logreg_gs = GridSearchCV(logreg_pipe, param_grid=logreg_params, cv = 5, verbose = 1, n_jobs = -1)

In [80]:
# Fitting Logistic Regression CV GS
logreg_gs.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   57.2s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.5min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('tvec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'tvec__max_features': [100, 2000], 'tvec__ngram_range': [(1, 1), (1, 2), (2, 2)], 'tvec__stop_words': [None, 'english'], 'logreg__C': [1], 'logreg__solver': ['lbfgs'], 'logreg__multi_class': ['multinomial'], 'logreg__max_iter': [1000], 'logreg__penalty': ['l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [72]:
# Scoring training data on Logistic Regression
logreg_gs.score(X_train, y_train)

0.7878787878787878

In [73]:
# Scoring testing data on Logistic Regression
logreg_gs.score(X_test, y_test)

0.7264957264957265

In [58]:
#Below are 8 random statements. The first 4 depict happiness. The last 4 depict sadness

tweets = pd.DataFrame(['I am very unhappy today! The atmosphere looks gloom',
'Things are looking great. It was such a perfect day',
'Success is right around the corner. Lets do this guys',
'Everything is more beautiful when you experience them with a smile!',
'Now this is my worst, okay? But I am gonna get better.',
'I am tired, boss. Tired of being on the road, lonely as a sparrow in the rain. I am tired of all the pain I feel',
'This is quite depressing. I am filled with sorrow',
'I am so excited about tonight I cannot wait to get home',
'His death broke my heart. It was a sad day',
'He makes me so angry sometimes',
'I hate working here',
'I dislike deal with my workmates anymore'])

In [59]:
tweets = tweets[0].apply(lambda x: x.lower())
print(tweets)

0     i am very unhappy today! the atmosphere looks ...
1     things are looking great. it was such a perfec...
2     success is right around the corner. lets do th...
3     everything is more beautiful when you experien...
4     now this is my worst, okay? but i am gonna get...
5     i am tired, boss. tired of being on the road, ...
6     this is quite depressing. i am filled with sorrow
7     i am so excited about tonight i cannot wait to...
8            his death broke my heart. it was a sad day
9                        he makes me so angry sometimes
10                                  i hate working here
11             i dislike deal with my workmates anymore
Name: 0, dtype: object


In [60]:
logreg_tfid_tweet = logreg_gs.best_estimator_.predict(tweets)
print(lbl_enc.inverse_transform(logreg_tfid_tweet))
print(tweets)

['sadness' 'happiness' 'happiness' 'happiness' 'sadness' 'sadness'
 'sadness' 'happiness' 'sadness' 'sadness' 'hate' 'sadness']
0     i am very unhappy today! the atmosphere looks ...
1     things are looking great. it was such a perfec...
2     success is right around the corner. lets do th...
3     everything is more beautiful when you experien...
4     now this is my worst, okay? but i am gonna get...
5     i am tired, boss. tired of being on the road, ...
6     this is quite depressing. i am filled with sorrow
7     i am so excited about tonight i cannot wait to...
8            his death broke my heart. it was a sad day
9                        he makes me so angry sometimes
10                                  i hate working here
11             i dislike deal with my workmates anymore
Name: 0, dtype: object


In [81]:
import pickle
logreg_classifier = open("emotion_logreg.pickle", 'wb')
pickle.dump(logreg_gs, logreg_classifier)
logreg_classifier.close()

In [82]:
data.columns

Index(['tweet_id', 'sentiment', 'content'], dtype='object')