In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import seaborn as sns
%matplotlib inline

pd.options.mode.use_inf_as_na = True
sns.set(rc={'figure.figsize':(11.7,8.27)})

BASEDIR = os.path.dirname(os.path.dirname(os.path.abspath('README.md')))
DATAPATH = os.path.join(BASEDIR, 'data', "raw")
CHECKPOINT_PATH = os.path.join(BASEDIR, 'checkpoints')


  import pandas.util.testing as tm


In [2]:
df = pd.read_csv(os.path.join(DATAPATH, 'ISEAR_dataset.csv'), names=['#', 'emotions', 'texts'])
df.head()

Unnamed: 0,#,emotions,texts
0,0,joy,On days when I feel close to my partner and ot...
1,1,fear,Every time I imagine that someone I love or I ...
2,2,anger,When I had been obviously unjustly treated and...
3,3,sadness,When I think about the short time that we live...
4,4,disgust,At a gathering I found myself involuntarily si...


In [3]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(df, random_state=64, test_size=0.3)

In [None]:
test_data.to_csv('test_dataset.csv')

In [None]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
train_data['e'] = encoder.fit_transform(train_data['emotions'])

In [None]:
values = [0,1,2,3,4,5,6]
mapper = dict(zip( values, encoder.classes_))

In [None]:
inv_mapper =  dict(zip( encoder.classes_, values))

In [None]:
inv_mapper

In [None]:
new_df = train_data[[ 'e', 'texts']]

In [None]:
def remove_new_lines(text):
    text = text.replace('\n','')
    text = text.replace('\t', '')
    return text

In [None]:
new_df['texts'] = new_df['texts'].apply(remove_new_lines)

In [None]:
new_df['texts']

In [None]:
new_df.to_csv(r'dataset.txt', header=None, index=None, sep='\t', mode='a')

In [None]:
data = pd.read_csv('eda_dataset.txt', header = None, sep='\t', names=['emotions', 'texts'])

In [None]:
data['emotions'] = data['emotions'].map(mapper)

In [None]:
data.head()

In [None]:
data['emotions'].unique()

In [None]:
data.to_csv('train_dataset.csv')

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer


tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['texts'])

In [None]:
tokenizer.word_index

In [None]:
embeddings_index = {}
f = open((os.path.join(DATAPATH, 'glove.6B.100d.txt')), encoding='utf8')

for line in f:
    values = line.split()
    word = values[0]
    embeddings_index[word] = np.asarray(values[1:], dtype='float32')
          
f.close()
          

embedding_matrix = np.random.random((len(tokenizer.word_index) + 1, 100))
          
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
from tensorflow.keras import regularizers, initializers, optimizers, callbacks
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import *
from tensorflow.keras.models import Sequential
from tensorflow.keras import regularizers
import tensorflow as tf

In [None]:
model = Sequential()
model.add(Input(shape=(100,), dtype='int32'))
model.add(Embedding(len(tokenizer.word_index) + 1,
    100,
    weights = [embedding_matrix],
    input_length = 100,
    trainable=False,
    name = 'embeddings'))
model.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dropout(0.5))
# model.add(Dense(50, activation='relu', ))
# model.add(Dropout(0.5))
model.add(Dense(17, activation='sigmoid'))

In [None]:
# model = tf.keras.models.Sequential()
# model.add(tf.keras.layers.Embedding(100, 128))
# model.add(tf.keras.layers.LSTM(128, dropout=0.2, recurrent_dropout=0.2))
# model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

In [None]:
word_tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def _tokenize(text):
    tokens = word_tokenizer.tokenize(text)
    tokens = [w.lower() for w in tokens if not w.lower() in stop_words]
    lems = []
    for item in tokens:
        lems.append(lemmatizer.lemmatize(item))
    return ' '.join(lems)

In [None]:
_tokenize('apple is very tasty.')

In [None]:
df['clean_texts'] = df['texts'].apply(_tokenize)

In [None]:
df['texts'][0]

In [None]:
df['clean_texts'][0]

In [None]:
clean_tokenizer = Tokenizer()
clean_tokenizer.fit_on_texts(df['texts'])

In [None]:
tokenied_data = clean_tokenizer.texts_to_sequences(df['texts'])

In [None]:
tokenied_data = pad_sequences(tokenied_data, padding='post', maxlen=100)

In [None]:
X_train = tokenied_data[:6000]
X_test = tokenied_data[6000:]

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

encoder = MultiLabelBinarizer()
labels = encoder.fit_transform(df['emotions'].values)

In [None]:
label_df = pd.get_dummies(df['emotions'])

In [None]:
y_train = labels[:6000]
y_test = labels[6000:]

In [None]:
model.compile(
    loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"]
)


In [None]:
# history = model.fit(X_train, y_train, epochs = 100, batch_size=64, validation_data=(X_test, y_test))

## Tokenizer test

In [None]:
tok = Tokenizer()
tok.fit_on_texts(["this very long comment is not toxic"]) 

In [None]:

print(tok.texts_to_sequences(["this comment is not toxic"])) 
print(tok.texts_to_sequences(["this very long comment is not toxic"]))

In [None]:

print(tok.texts_to_matrix(["this comment is not toxic"])) 
print(tok.texts_to_matrix(["this very long comment is not toxic"]))

## Using NLTK to remove stop words

In [None]:
len(max(tokenied_data, key=len))

In [None]:
tokenied_data

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

encoder = MultiLabelBinarizer()
labels = encoder.fit_transform(df['emotions'].values)

In [None]:
df['emotions'].unique()

# ML models

In [3]:
train_data, train_label, test_data, test_label = np.load(
        os.path.join(CHECKPOINT_PATH, "frozen_data/SKLEARN-data-6000.npy"), allow_pickle=True
    )

In [4]:
train_data

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [5]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.svm import SVC
from nbsvm import NBSVMClassifier

In [6]:
clf = NBSVMClassifier() # initialize the model
clf.fit(train_data, train_label) # train the classifier; y{0,1}

NBSVMClassifier()

In [7]:
clf.predict(test_data) # get binary predictions

array([[1., 0., 0., ..., 1., 1., 0.],
       [1., 1., 1., ..., 1., 0., 0.],
       [1., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 1., ..., 1., 0., 0.],
       [1., 0., 0., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 0., 0.]])

In [15]:
import pickle



In [21]:

with open(
    os.path.join(
        CHECKPOINT_PATH, "sklearn_models", "naive_bayes-model.pkl"
    ),
    "rb",
) as f:
    model = pickle.load(f)

In [22]:
model

MultinomialNB(alpha=0.9, fit_prior=False)

In [23]:
preds = model.predict(test_data)

In [24]:
from sklearn.metrics import classification_report

In [25]:
print(classification_report(preds, test_label))

              precision    recall  f1-score   support

           0       0.44      0.44      0.44       210
           1       0.56      0.69      0.62       191
           2       0.62      0.65      0.63       209
           3       0.53      0.49      0.51       213
           4       0.71      0.67      0.69       234
           5       0.63      0.59      0.61       224
           6       0.50      0.46      0.48       209

    accuracy                           0.57      1490
   macro avg       0.57      0.57      0.57      1490
weighted avg       0.57      0.57      0.57      1490



In [None]:
gscv.best_score_

In [None]:
gscv.best_params_

In [None]:
from sklearn.metrics import f1_score, accuracy_score

f1_score(preds, test_label, average='weighted')

In [None]:
accuracy_score(preds, test_label)

# Explainer test

In [None]:
np.reshape(clf.classes_, (-1, 1))

In [None]:
from lime.lime_text import LimeTextExplainer

explainer = LimeTextExplainer(class_names=np.reshape(clf.classes_, (-1, 1)))
exp = explainer.explain_instance(
    "i love you", clf.predict_proba, num_features=7
)
exp