In [19]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn import preprocessing

In [20]:
# import dataset to train models
emotions = pd.read_csv('../Data/text_emotion.csv')

# create a security copy
emotions_copy = emotions.copy()

# display the first rows to verify the dataframe status
emotions.head()

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...


In [21]:
# assign each sentiment to a numeric value
vals_to_replace = {'anger': 1, 'worry': 2, 'love': 12, 'hate': 0, 'sadness': 3, 'empty': 4, 'boredom': 5, 'neutral': 6,
                   'surprise': 7, 'relief': 8, 'fun': 9, 'enthusiasm': 10, 'happiness': 11}
emotions['sent_num'] = emotions.sentiment.map(vals_to_replace)
emotions['sent_num'].value_counts()

6     8638
2     8459
11    5209
3     5165
12    3842
7     2187
9     1776
8     1526
0     1323
4      827
10     759
5      179
1      110
Name: sent_num, dtype: int64

In [22]:
def stopword_del(sentence):
    stopwords_list = stopwords.words("english")
    word_tokens_test = word_tokenize(sentence.lower())
    tokens_without_sw = [word for word in word_tokens_test if not word in stopwords_list]
    str1 = ' '.join(tokens_without_sw)
    return str1

In [23]:
# auxiliar function to remove a pattern defined by a regular expression 
def remove_by_regex(tweet, regexp):
        return re.sub(regexp, '', tweet)

# 3 specific cleaning functions to remove numbers, url's and special characters
def remove_numbers(tweet):
    return remove_by_regex(tweet, re.compile(r"[1234567890]"))

def remove_url(tweet):
    return remove_by_regex(tweet, re.compile(r"http.?://[^\s]+[\s]?"))

def remove_special_char(tweet):
    return re.sub(r"[^a-zA-Z0-9 ]", "", tweet) #add space placeholder

# general cleaning function to do it all at once
def clean_up(tweet):
    tweet = remove_numbers(tweet)
    tweet = remove_url(tweet)
    tweet = remove_special_char(tweet)
    return tweet.lower().strip()

In [24]:
# remove stopwords
emotions['content'] = emotions['content'].apply(stopword_del)

#apply previously defined functions all at once
emotions["content"] = emotions["content"].apply(clean_up)

#stem the words in the sentences and delete too-large-whitespaces
stemmer = SnowballStemmer("english")
emotions["content_stemmed"] = emotions["content"].apply(stemmer.stem)
emotions["content_stemmed"] = [' '.join(x.split()) for x in emotions["content"]]

#convert the words in sentences to a new dataframe of vectors
vectorizer = TfidfVectorizer()
content_vect = vectorizer.fit_transform(emotions.content_stemmed)
vector_df = pd.DataFrame.sparse.from_spmatrix(content_vect.tocoo(), columns = vectorizer.get_feature_names())
vector_df.head()

Unnamed: 0,aa,aaa,aaaa,aaaaa,aaaaaaaa,aaaaaaaaaaa,aaaaaaaaaahhhhhhhh,aaaaaaaaaamazing,aaaaaaaafternoon,aaaaaaaahhhhhhhh,...,zyrtec,zzerbe,zzwhitejd,zzybug,zzz,zzzz,zzzzy,zzzzz,zzzzzzz,zzzzzzzzzzzzzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
vector_df.unique()

TypeError: 'Series' object is not callable

In [7]:
#add label to the vectorized dataframe
vector_df['sent_num'] = emotions['sent_num']
vector_df.head()

Unnamed: 0,aa,aaa,aaaa,aaaaa,aaaaaaaa,aaaaaaaaaaa,aaaaaaaaaahhhhhhhh,aaaaaaaaaamazing,aaaaaaaafternoon,aaaaaaaahhhhhhhh,...,zzerbe,zzwhitejd,zzybug,zzz,zzzz,zzzzy,zzzzz,zzzzzzz,zzzzzzzzzzzzzzz,sent_num
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6


In [8]:
# create train and test sets to apply in models
train_set, test_set = train_test_split(vector_df, test_size=0.3)
train_X = train_set.drop(columns='sent_num')
train_y = train_set['sent_num']
test_X = test_set.drop(columns='sent_num')
test_y = test_set['sent_num']

# Models predictions

### Logistic Regression

In [9]:
model_LogR = LogisticRegression(max_iter = 1000, multi_class="multinomial")
model_LogR.fit(train_X, train_y)

LogisticRegression(max_iter=1000, multi_class='multinomial')

In [10]:
model_LogR.score(train_X, train_y)

0.5717142857142857

### SuperVectorMachine

In [13]:
from sklearn.svm import LinearSVC
LSVC = LinearSVC(multi_class='crammer_singer')
LSVC.fit(train_X, train_y)



LinearSVC(multi_class='crammer_singer')

In [14]:
LSVC.score(train_X, train_y)

0.8930357142857143

In [15]:
LSVC.score(test_X, test_y)

0.30833333333333335

### Random Forest

In [16]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(max_depth=200)
rfc.fit(train_X, train_y)
predictions_rfc = rfc.predict(train_X)
accuracy_score(train_y, predictions_rfc)

0.8426428571428571

In [17]:
predictions_rfc_test = rfc.predict(test_X)
accuracy_score(test_y, predictions_rfc_test)

0.3353333333333333

In [None]:
filename = 'finalized_model.sav'
pickle.dump(model, open(filename, 'wb'))