In [3]:
# We will use ktrain for our experiment which is a library for plug and play
!pip3 install ktrain

Collecting ktrain
[?25l  Downloading https://files.pythonhosted.org/packages/77/8e/723ccd3b7ba280a8a13caef126433bf9039a65ac18c72c10798f07953f24/ktrain-0.15.0.tar.gz (25.2MB)
[K     |████████████████████████████████| 25.2MB 1.5MB/s 
[?25hCollecting tensorflow==2.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/85/d4/c0cd1057b331bc38b65478302114194bd8e1b9c2bbc06e300935c0e93d90/tensorflow-2.1.0-cp36-cp36m-manylinux2010_x86_64.whl (421.8MB)
[K     |████████████████████████████████| 421.8MB 35kB/s 
Collecting keras_bert>=0.81.0
  Downloading https://files.pythonhosted.org/packages/2c/0f/cdc886c1018943ea62d3209bc964413d5aa9d0eb7e493abd8545be679294/keras-bert-0.81.0.tar.gz
Collecting langdetect
[?25l  Downloading https://files.pythonhosted.org/packages/56/a3/8407c1e62d5980188b4acc45ef3d94b933d14a2ebc9ef3505f22cf772570/langdetect-1.0.8.tar.gz (981kB)
[K     |████████████████████████████████| 983kB 35.0MB/s 
Collecting cchardet==2.1.5
[?25l  Downloading https://files.pytho

In [4]:
import ktrain
from ktrain import text
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from joblib import dump, load # for saving and loading sklearn object
from scipy.sparse import save_npz, load_npz # used for saving and loading sparse matrices
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import os

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [5]:
df = pd.read_csv('IESAR_onehot.csv')
df.head()

Unnamed: 0,text,anger,disgust,fear,guilt,joy,sadness,shame
0,on day when feel close to my partner and other...,0,0,0,0,1,0,0
1,every time imagine that someone love or could ...,0,0,1,0,0,0,0
2,when had been obviously unjustly treated and h...,1,0,0,0,0,0,0
3,when think about the short time that we live a...,0,0,0,0,0,1,0
4,at gathering found myself involuntarily sittin...,0,1,0,0,0,0,0


In [0]:
def preprocess(X):
    documents = []
    stemmer = WordNetLemmatizer()

    for sen in range(0, len(X)):
        # Remove all the special characters
        document = re.sub(r'\W', ' ', str(X[sen]))

        # remove all single characters
        document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

        # Remove single characters from the start
        document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 

        # Substituting multiple spaces with single space
        document = re.sub(r'\s+', ' ', document, flags=re.I)

        # Removing prefixed 'b'
        document = re.sub(r'^b\s+', '', document)

        # Converting to Lowercase
        document = document.lower()

        # Lemmatization
        document = document.split()

        document = [stemmer.lemmatize(word) for word in document]
        document = ' '.join(document)

        documents.append(document)
    return documents

df['text'] = preprocess(df['text'])

In [7]:
(x_train, y_train), (x_test, y_test), preproc = text.texts_from_csv('IESAR_onehot.csv',
                      'text',
                      label_columns = ['anger', 'disgust', 'guilt', 'joy', 'sadness' ,'shame'],
                      val_filepath=None, # if None, 10% of data will be used for validation
                    #  max_features=NUM_WORDS, maxlen=MAXLEN,
                      ngram_range=1, maxlen = 63)

detected encoding: utf-8 (if wrong, set manually)
language: en
Word Counts: 7812
Nrows: 6701
6701 train sequences
train sequence lengths:
	mean : 20
	95percentile : 44
	99percentile : 64
x_train shape: (6701,63)
y_train shape: (6701, 6)
745 test sequences
test sequence lengths:
	mean : 19
	95percentile : 41
	99percentile : 60
x_test shape: (745,63)
y_test shape: (745, 6)


NBSVM takes linear model such as SVM and infuses it with Bayesian probabilistics by replacing word count features with Naive Bayes log count ration. Despite it's simplicity, NBSVM model has been shown to be fast and powerful across wide range of different text classification datasets.

In [41]:
# Here we will use nbsvm for our case
model = text.text_classifier('nbsvm', (x_train, y_train), preproc = preproc)
learner = ktrain.get_learner(model, train_data = (x_train, y_train), val_data = (x_test, y_test))

Is Multi-Label? False
compiling word ID features...
maxlen is 63
building document-term matrix... this may take a few moments...
rows: 1-6701
computing log-count ratios...
done.


In [47]:
learner.fit(0.0001, 4, cycle_len=1, cycle_mult=2)

Train on 6701 samples, validate on 745 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7ffa6de41828>

In [0]:


predictor = ktrain.get_predictor(learner.model, preproc)



In [22]:
data = [ 'I don\'t like his behaviour',
         'I am feeling injustice',
        'He is a bad person, I don\'t like him',
        'I am the happiest person in the world', 
        'happy']
predictor.predict(data)

['disgust', 'guilt', 'disgust', 'joy', 'joy']

In [23]:
predictor.get_classes()

['anger', 'disgust', 'guilt', 'joy', 'sadness', 'shame']

In [24]:
predictor.predict(data, return_proba = True)

array([[0.14861122, 0.28352705, 0.19256656, 0.09973159, 0.11794614,
        0.15761743],
       [0.17270799, 0.14613137, 0.18589257, 0.1603774 , 0.15224856,
        0.18264215],
       [0.16476646, 0.22777674, 0.18955177, 0.10602947, 0.16224709,
        0.14962848],
       [0.14680876, 0.18596506, 0.13284178, 0.23278394, 0.12897846,
        0.17262189],
       [0.09016552, 0.09485937, 0.10876507, 0.5030992 , 0.12524392,
        0.07786693]], dtype=float32)

In [30]:
y_pred = predictor.predict(x_test)

AttributeError: ignored

In [37]:
a = list(df.head()['text'].values)
df.head()

Unnamed: 0,text,anger,disgust,fear,guilt,joy,sadness,shame
0,on day when feel close to my partner and other...,0,0,0,0,1,0,0
1,every time imagine that someone love or could ...,0,0,1,0,0,0,0
2,when had been obviously unjustly treated and h...,1,0,0,0,0,0,0
3,when think about the short time that we live a...,0,0,0,0,0,1,0
4,at gathering found myself involuntarily sittin...,0,1,0,0,0,0,0


In [36]:
predictor.predict(a)

['joy', 'sadness', 'anger', 'sadness', 'disgust']