In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pickle

In [2]:
# import dataset to train models
train = pd.read_csv('../Data/train.txt', sep=';', names=['content', 'sentiment'])

test = pd.read_csv('../Data/test.txt', sep=';', names=['content', 'sentiment'])

In [3]:
data = pd.read_csv('../Data/val.txt', sep=';', names=['content', 'sentiment'])
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18000 entries, 0 to 17999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   content    18000 non-null  object
 1   sentiment  18000 non-null  object
dtypes: object(2)
memory usage: 140.7+ KB


In [4]:
# assign each sentiment to a numeric value
vals_to_replace = {'sadness': 1, 'love': 5, 'anger': 0, 'joy': 4, 'fear': 2, 'surprise': 3}
data['sent_num'] = data.sentiment.map(vals_to_replace)

In [5]:
def stopword_del(sentence):
    stopwords_list = stopwords.words("english")
    word_tokens_test = word_tokenize(sentence.lower())
    tokens_without_sw = [word for word in word_tokens_test if not word in stopwords_list]
    str1 = ' '.join(tokens_without_sw)
    return str1

In [6]:
# auxiliar function to remove a pattern defined by a regular expression 
def remove_by_regex(tweet, regexp):
        return re.sub(regexp, '', tweet)

# 3 specific cleaning functions to remove numbers, url's and special characters
def remove_numbers(tweet):
    return remove_by_regex(tweet, re.compile(r"[1234567890]"))

def remove_url(tweet):
    return remove_by_regex(tweet, re.compile(r"http.?://[^\s]+[\s]?"))

def remove_special_char(tweet):
    return re.sub(r"[^a-zA-Z0-9 ]", "", tweet) #add space placeholder

# general cleaning function to do it all at once
def clean_up(tweet):
    tweet = remove_numbers(tweet)
    tweet = remove_url(tweet)
    tweet = remove_special_char(tweet)
    return tweet.lower().strip()

In [7]:
# remove stopwords
data['content'] = data['content'].apply(stopword_del)

#apply previously defined functions all at once
data["content"] = data["content"].apply(clean_up)

#stem the words in the sentences and delete too-large-whitespaces
stemmer = SnowballStemmer("english")
data["content_stemmed"] = data["content"].apply(stemmer.stem)
data["content_stemmed"] = [' '.join(x.split()) for x in data["content"]]

#convert the words in sentences to a new dataframe of vectors
vectorizer = TfidfVectorizer()
content_vect = vectorizer.fit_transform(data.content_stemmed)
vector_df = pd.DataFrame.sparse.from_spmatrix(content_vect.tocoo(), columns = vectorizer.get_feature_names())

In [8]:
#add label to the vectorized dataframe
vector_df['sent_num'] = data['sent_num']

In [9]:
# create train and test sets to apply in models
train_X = vector_df.drop(columns='sent_num')[:16000]
train_y = vector_df['sent_num'][:16000]
test_X = vector_df.drop(columns='sent_num')[16000:]
test_y = vector_df['sent_num'][16000:]

# Models predictions

### SupportVectorMachine

In [10]:
from sklearn.svm import LinearSVC
LSVC = LinearSVC(multi_class='crammer_singer')
LSVC.fit(train_X, train_y)



LinearSVC(multi_class='crammer_singer')

In [11]:
LSVC.score(train_X, train_y)

0.98175

In [12]:
LSVC.score(test_X, test_y)

0.8965

In [13]:
emotion_detect = 'emotion_detect_model.sav'
pickle.dump(LSVC, open(emotion_detect, 'wb'))