In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import seaborn as sns
%matplotlib inline

pd.options.mode.use_inf_as_na = True
sns.set(rc={'figure.figsize':(11.7,8.27)})

BASEDIR = os.path.dirname(os.path.dirname(os.path.abspath('README.md')))
DATAPATH = os.path.join(BASEDIR, 'data')
CHECKPOINT_PATH = os.path.join(BASEDIR, 'checkpoints')


In [3]:
df = pd.read_csv(os.path.join(DATAPATH, 'ISEAR_dataset.csv'), names=['#', 'emotions', 'texts'])
df.head()

Unnamed: 0,#,emotions,texts
0,0,joy,On days when I feel close to my partner and ot...
1,1,fear,Every time I imagine that someone I love or I ...
2,2,anger,When I had been obviously unjustly treated and...
3,3,sadness,When I think about the short time that we live...
4,4,disgust,At a gathering I found myself involuntarily si...


In [4]:
df['emotions'].unique()

array(['joy', 'fear', 'anger', 'sadness', 'disgust', 'shame', 'guilt'],
      dtype=object)

In [10]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer


tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['texts'])

In [12]:
tokenizer.word_index

{'i': 1,
 'a': 2,
 'the': 3,
 'my': 4,
 'to': 5,
 'and': 6,
 'was': 7,
 'when': 8,
 'of': 9,
 'in': 10,
 'had': 11,
 'me': 12,
 'that': 13,
 'for': 14,
 'with': 15,
 'not': 16,
 'it': 17,
 'at': 18,
 'on': 19,
 'he': 20,
 'very': 21,
 'friend': 22,
 'felt': 23,
 'an': 24,
 'she': 25,
 'her': 26,
 'we': 27,
 'one': 28,
 'about': 29,
 'as': 30,
 'this': 31,
 'after': 32,
 'from': 33,
 'by': 34,
 'time': 35,
 'were': 36,
 'did': 37,
 'out': 38,
 'but': 39,
 'who': 40,
 'him': 41,
 'because': 42,
 'his': 43,
 'been': 44,
 'got': 45,
 'have': 46,
 'which': 47,
 'some': 48,
 'home': 49,
 'mother': 50,
 'up': 51,
 'friends': 52,
 'told': 53,
 'would': 54,
 'so': 55,
 'day': 56,
 'they': 57,
 'do': 58,
 'school': 59,
 'be': 60,
 'people': 61,
 'went': 62,
 'there': 63,
 'saw': 64,
 'is': 65,
 'person': 66,
 'first': 67,
 'our': 68,
 'father': 69,
 'came': 70,
 'could': 71,
 'all': 72,
 'mine': 73,
 'night': 74,
 'go': 75,
 'made': 76,
 "didn't": 77,
 'girl': 78,
 'having': 79,
 'back': 80,
 'c

In [52]:
len(tokenizer.word_index)

9065

In [50]:
tokenizer.texts_to_matrix([df['texts'][0]]).shape

(1, 9066)

In [53]:
df.shape

(7446, 3)

## Tokenizer test

In [44]:
tok = Tokenizer()
tok.fit_on_texts(["this very long comment is not toxic"]) 

array([[0., 1., 1., ..., 0., 0., 0.]])

In [46]:

print(tok.texts_to_sequences(["this comment is not toxic"])) 
print(tok.texts_to_sequences(["this very long comment is not toxic"]))

[[1, 4, 5, 6, 7]]
[[1, 2, 3, 4, 5, 6, 7]]


In [47]:

print(tok.texts_to_matrix(["this comment is not toxic"])) 
print(tok.texts_to_matrix(["this very long comment is not toxic"]))

[[0. 1. 0. 0. 1. 1. 1. 1.]]
[[0. 1. 1. 1. 1. 1. 1. 1.]]


## Using NLTK to remove stop words

In [54]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

In [55]:
word_tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def _tokenize(text):
    tokens = word_tokenizer.tokenize(text)
    tokens = [w.lower() for w in tokens if not w.lower() in stop_words]
    lems = []
    for item in tokens:
        lems.append(lemmatizer.lemmatize(item))
    return ' '.join(lems)

In [58]:
_tokenize('apple is very tasty.')

'apple tasty'

In [59]:
df['clean_texts'] = df['texts'].apply(_tokenize)

In [62]:
df['texts'][0]

'On days when I feel close to my partner and other friends.   \nWhen I feel at peace with myself and also experience a close  \ncontact with people whom I regard greatly.'

In [61]:
df['clean_texts'][0]

'day feel close partner friend feel peace also experience close contact people regard greatly'

In [63]:
clean_tokenizer = Tokenizer()
clean_tokenizer.fit_on_texts(df['clean_texts'])

In [65]:
clean_tokenizer.texts_to_matrix([df['clean_texts'][0]]).shape

(1, 8014)

# ML models

In [148]:
train_data, train_label, test_data, test_label = np.load(
        os.path.join(CHECKPOINT_PATH, "frozen_data.npy"), allow_pickle=True
    )

In [149]:
train_data[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [150]:
train_label.shape

(5956, 7)

In [151]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression


clf = OneVsRestClassifier(MultinomialNB())

In [152]:
clf.fit(train_data, train_label)

OneVsRestClassifier(estimator=MultinomialNB(alpha=1.0, class_prior=None,
                                            fit_prior=True),
                    n_jobs=None)

In [153]:
preds = clf.predict(test_data)

In [154]:
from sklearn.metrics import f1_score, accuracy_score

f1_score(preds, test_label, average='weighted')

0.5645226084039154

In [155]:
accuracy_score(preds, test_label)

0.4167785234899329