In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import seaborn as sns
%matplotlib inline

pd.options.mode.use_inf_as_na = True
sns.set(rc={'figure.figsize':(11.7,8.27)})

BASEDIR = os.path.dirname(os.path.dirname(os.path.abspath('README.md')))
DATAPATH = os.path.join(BASEDIR, 'data', "raw")
CHECKPOINT_PATH = os.path.join(BASEDIR, 'checkpoints')


  import pandas.util.testing as tm


In [2]:
df = pd.read_csv(os.path.join(DATAPATH, 'ISEAR_dataset.csv'), names=['#', 'emotions', 'texts'])
df.head()

Unnamed: 0,#,emotions,texts
0,0,joy,On days when I feel close to my partner and ot...
1,1,fear,Every time I imagine that someone I love or I ...
2,2,anger,When I had been obviously unjustly treated and...
3,3,sadness,When I think about the short time that we live...
4,4,disgust,At a gathering I found myself involuntarily si...


In [83]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(df, random_state=64, test_size=0.3)

In [85]:
test_data.to_csv('test_dataset.csv')

In [89]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
train_data['e'] = encoder.fit_transform(train_data['emotions'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [90]:
values = [0,1,2,3,4,5,6]
mapper = dict(zip( values, encoder.classes_))

In [105]:
inv_mapper =  dict(zip( encoder.classes_, values))

In [106]:
inv_mapper

{'anger': 0,
 'disgust': 1,
 'fear': 2,
 'guilt': 3,
 'joy': 4,
 'sadness': 5,
 'shame': 6}

In [91]:
new_df = train_data[[ 'e', 'texts']]

In [92]:
def remove_new_lines(text):
    text = text.replace('\n','')
    text = text.replace('\t', '')
    return text

In [93]:
new_df['texts'] = new_df['texts'].apply(remove_new_lines)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [97]:
new_df['texts']

5693    When I dated my friend's boyfriend after they ...
6457    When I failed my Standard 8 Examination for th...
5262    When my cousin's wife refused him sex and clai...
778     My father was a member of the public Alcohol a...
5854    When my mother scolded me for something, very ...
                              ...                        
54                              Railway station good-bye.
3238    My sister and I were fighting as usual. She's ...
6263    The sadness came to me when I heard that my gi...
5094            I was scared of water when I was a child.
6596    When I heard that I had passed my end of first...
Name: texts, Length: 5212, dtype: object

In [98]:
new_df.to_csv(r'dataset.txt', header=None, index=None, sep='\t', mode='a')

In [109]:
data = pd.read_csv('eda_dataset.txt', header = None, sep='\t', names=['emotions', 'texts'])

In [110]:
data['emotions'] = data['emotions'].map(mapper)

In [111]:
data.head()

Unnamed: 0,emotions,texts
0,shame,had boyfriend dated broken they i after friend...
1,shame,when i date stamp my acquaintance beau after t...
2,shame,later when i dated my friends boyfriend upward...
3,shame,when i dated my friends boyfriend after they h...
4,sadness,failed standard the first time


In [112]:
data['emotions'].unique()

array(['shame', 'sadness', 'joy', 'fear', 'disgust', 'guilt', 'anger'],
      dtype=object)

In [113]:
data.to_csv('train_dataset.csv')

In [4]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer


tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['texts'])

In [5]:
tokenizer.word_index

{'i': 1,
 'a': 2,
 'the': 3,
 'my': 4,
 'to': 5,
 'and': 6,
 'was': 7,
 'when': 8,
 'of': 9,
 'in': 10,
 'had': 11,
 'me': 12,
 'that': 13,
 'for': 14,
 'with': 15,
 'not': 16,
 'it': 17,
 'at': 18,
 'on': 19,
 'he': 20,
 'very': 21,
 'friend': 22,
 'felt': 23,
 'an': 24,
 'she': 25,
 'her': 26,
 'we': 27,
 'one': 28,
 'about': 29,
 'as': 30,
 'this': 31,
 'after': 32,
 'from': 33,
 'by': 34,
 'time': 35,
 'were': 36,
 'did': 37,
 'out': 38,
 'but': 39,
 'who': 40,
 'him': 41,
 'because': 42,
 'his': 43,
 'been': 44,
 'got': 45,
 'have': 46,
 'which': 47,
 'some': 48,
 'home': 49,
 'mother': 50,
 'up': 51,
 'friends': 52,
 'told': 53,
 'would': 54,
 'so': 55,
 'day': 56,
 'they': 57,
 'do': 58,
 'school': 59,
 'be': 60,
 'people': 61,
 'went': 62,
 'there': 63,
 'saw': 64,
 'is': 65,
 'person': 66,
 'first': 67,
 'our': 68,
 'father': 69,
 'came': 70,
 'could': 71,
 'all': 72,
 'mine': 73,
 'night': 74,
 'go': 75,
 'made': 76,
 "didn't": 77,
 'girl': 78,
 'having': 79,
 'back': 80,
 'c

In [6]:
embeddings_index = {}
f = open((os.path.join(DATAPATH, 'glove.6B.100d.txt')), encoding='utf8')

for line in f:
    values = line.split()
    word = values[0]
    embeddings_index[word] = np.asarray(values[1:], dtype='float32')
          
f.close()
          

embedding_matrix = np.random.random((len(tokenizer.word_index) + 1, 100))
          
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [7]:
from tensorflow.keras import regularizers, initializers, optimizers, callbacks
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import *
from tensorflow.keras.models import Sequential
from tensorflow.keras import regularizers
import tensorflow as tf

In [20]:
model = Sequential()
model.add(Input(shape=(100,), dtype='int32'))
model.add(Embedding(len(tokenizer.word_index) + 1,
    100,
    weights = [embedding_matrix],
    input_length = 100,
    trainable=False,
    name = 'embeddings'))
model.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dropout(0.5))
# model.add(Dense(50, activation='relu', ))
# model.add(Dropout(0.5))
model.add(Dense(17, activation='sigmoid'))



In [9]:
# model = tf.keras.models.Sequential()
# model.add(tf.keras.layers.Embedding(100, 128))
# model.add(tf.keras.layers.LSTM(128, dropout=0.2, recurrent_dropout=0.2))
# model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

In [None]:
word_tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def _tokenize(text):
    tokens = word_tokenizer.tokenize(text)
    tokens = [w.lower() for w in tokens if not w.lower() in stop_words]
    lems = []
    for item in tokens:
        lems.append(lemmatizer.lemmatize(item))
    return ' '.join(lems)

In [None]:
_tokenize('apple is very tasty.')

In [None]:
df['clean_texts'] = df['texts'].apply(_tokenize)

In [None]:
df['texts'][0]

In [None]:
df['clean_texts'][0]

In [10]:
clean_tokenizer = Tokenizer()
clean_tokenizer.fit_on_texts(df['texts'])

In [11]:
tokenied_data = clean_tokenizer.texts_to_sequences(df['texts'])

In [12]:
tokenied_data = pad_sequences(tokenied_data, padding='post', maxlen=100)

In [13]:
X_train = tokenied_data[:6000]
X_test = tokenied_data[6000:]

In [14]:
from sklearn.preprocessing import MultiLabelBinarizer

encoder = MultiLabelBinarizer()
labels = encoder.fit_transform(df['emotions'].values)

In [None]:
label_df = pd.get_dummies(df['emotions'])

In [15]:
y_train = labels[:6000]
y_test = labels[6000:]

In [21]:
model.compile(
    loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"]
)


In [None]:
# history = model.fit(X_train, y_train, epochs = 100, batch_size=64, validation_data=(X_test, y_test))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100

## Tokenizer test

In [None]:
tok = Tokenizer()
tok.fit_on_texts(["this very long comment is not toxic"]) 

In [None]:

print(tok.texts_to_sequences(["this comment is not toxic"])) 
print(tok.texts_to_sequences(["this very long comment is not toxic"]))

In [None]:

print(tok.texts_to_matrix(["this comment is not toxic"])) 
print(tok.texts_to_matrix(["this very long comment is not toxic"]))

## Using NLTK to remove stop words

In [None]:
len(max(tokenied_data, key=len))

In [None]:
tokenied_data

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

encoder = MultiLabelBinarizer()
labels = encoder.fit_transform(df['emotions'].values)

In [None]:
df['emotions'].unique()

# ML models

In [3]:
train_data, train_label, test_data, test_label = np.load(
        os.path.join(CHECKPOINT_PATH, "frozen_data/SKLEARN-data-7000.npy"), allow_pickle=True
    )

In [4]:
train_data.shape

(5584, 7000)

In [None]:
train_data

In [5]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold

folds = StratifiedKFold(n_splits=10, random_state=32)
clf = MultinomialNB()

param_grid = {'alpha': np.arange(1, 10, 1), 'fit_prior': [True, False]}

gscv = GridSearchCV(clf, cv=folds, param_grid=param_grid )

In [10]:
clf.fit(train_data, train_label)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [11]:
preds = clf.predict(test_data)

In [8]:
gscv.best_score_

0.5580229226361032

In [9]:
gscv.best_params_

{'alpha': 1, 'fit_prior': False}

In [12]:
from sklearn.metrics import f1_score, accuracy_score

f1_score(preds, test_label, average='weighted')

0.5631870061905447

In [13]:
accuracy_score(preds, test_label)

0.5644468313641245

# Explainer test

In [23]:
np.reshape(clf.classes_, (-1, 1))

array([[0],
       [1],
       [2],
       [3],
       [4],
       [5],
       [6]])

In [22]:
from lime.lime_text import LimeTextExplainer

explainer = LimeTextExplainer(class_names=np.reshape(clf.classes_, (-1, 1)))
exp = explainer.explain_instance(
    "i love you", clf.predict_proba, num_features=7
)
exp

ValueError: Expected 2D array, got 1D array instead:
array=['i love you' 'i  you' '  ' ... 'i love ' 'i  you' ' love you'].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.