Import some of the main packages and set the memory limit of the GPU

In [102]:
import pandas as pd
import nltk
import numpy as np

import tensorflow as tf
physical_devices = tf.config.experimental.list_physical_devices('GPU')
assert len(physical_devices) > 0, "Not enough GPU hardware devices available"
# config = tf.config.experimental.set_memory_growth(physical_devices[0], True)
tf.config.experimental.set_virtual_device_configuration(physical_devices[0], [
tf.config.experimental.VirtualDeviceConfiguration(memory_limit=2000)])

# Load the Data

Load the CSV dataset using pandas library and remove Nan values if there is any

In [308]:
file_name = 'data/Travel-Dataset-5000--master/5000TravelQuestionsDataset.xlsx'
test_id  = 1
col_names = ['questions', 'a', 'b']
dataset = pd.read_excel(file_name, header=None, names=col_names)
dataset['questions'].dropna(inplace=True)
print(dataset.info())
print(dataset['a'].unique())
print(len(dataset['b'].unique()))
print(dataset['questions'][test_id])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   questions  5000 non-null   object
 1   a          5000 non-null   object
 2   b          5000 non-null   object
dtypes: object(3)
memory usage: 117.3+ KB
None
['TTD' 'TGU' 'ACM' 'TRS' 'WTH' 'FOD' 'ENT' 'TGU\n' 'TTD\n' '\nENT']
79
What are the companies which organize shark feeding events for scuba divers?


# PreProcessing

## Do basic preprocessing
- Removing HTML tags
- Removing punctuation
- Lowering text

In [104]:
import re

def clean_text(text):
    """
    Applies some pre-processing on the given text.

    Steps :
    - Removing HTML tags
    - Removing punctuation
    - Lowering text
    """
    
    # remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # remove the characters [\], ['] and ["]
    text = re.sub(r"\\", "", text)    
    text = re.sub(r"\'", "", text)    
    text = re.sub(r"\"", "", text)    
    
    # convert text to lowercase
    text = text.strip().lower()
    
    # replace punctuation characters with spaces
    filters='!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    translate_dict = dict((c, " ") for c in filters)
    translate_map = str.maketrans(translate_dict)
    text = text.translate(translate_map)

    return text

dataset['clean_questions'] = [clean_text(question) for question in dataset['questions']]
dataset['clean_questions'][test_id]

'what are the companies which organize shark feeding events for scuba divers '

In [105]:
import string

def remove_punctuation(x):
    return "".join([char for char in x if char not in string.punctuation])

# dataset['questions'] = [remove_punctuation(question) for question in dataset['questions']]
# dataset['questions'][test_id]

### Remove stop words other than the first word then remove punctuations

In [106]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
from nltk import word_tokenize

def remove_stopwords(x):
    words = word_tokenize(x)
    return " ".join([word for pos, word in enumerate(words) if (pos < 1) or (word not in stop_words)])

dataset['questions_stop'] = [remove_punctuation(remove_stopwords(question)) for question in dataset['questions']]
print(dataset['questions'][test_id])
print(dataset['questions_stop'][test_id])
# print(stop_words)

What are the companies which organize shark feeding events for scuba divers?
What companies organize shark feeding events scuba divers 


# Create features

## Lemmatize and create BOW

In [107]:
from nltk.stem import WordNetLemmatizer 
  
lemmatizer = WordNetLemmatizer()
def lemmatize(x):
    filtered_words = nltk.word_tokenize(x)
    lemmatized = [lemmatizer.lemmatize(word) for word in filtered_words]
    return " ".join(lemmatized)

dataset['lem_questions'] = [lemmatize(question) for question in dataset['clean_questions']]
dataset['lem_questions'][test_id]

'what are the company which organize shark feeding event for scuba diver'

## POS Tags

In [108]:
from nltk import pos_tag
def pos_tagging(x):
    words = nltk.word_tokenize(x)
    lst = [ r[1] for r in pos_tag(words)] 
    return ' '.join(lst)

dataset['pos_questions'] = [pos_tagging(question) for question in dataset['questions']]
dataset['pos_questions'][test_id]

'WP VBP DT NNS WDT VBP NN NN NNS IN NN NNS .'

## Naming Entities

In [109]:
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

def get_continuous_chunks(text):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    prev = None
    continuous_chunk = []
    current_chunk = []

    for i in chunked:
        if type(i) == Tree:
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
                current_chunk = []
        else:
            continue

    if continuous_chunk:
        named_entity = " ".join(current_chunk)
        if named_entity not in continuous_chunk:
            continuous_chunk.append(named_entity)
    
    def remove_null(x):
        if '' in x:
            x.remove('')
        return x

    lst = remove_null(continuous_chunk)
    return ' '.join(lst)

txt = "Barack Obama is a great person." 
txt2 = "Who is Dulan?"
print (get_continuous_chunks(txt2))



dataset['ne_questions'] = [get_continuous_chunks(question) for question in dataset['questions']]
dataset['ne_questions'][test_id]

Dulan


''

## Count vectorizer (BOW)
Remove stop words and vectorize the content

In [110]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

def get_count_vect(documents):
    vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7)#, stop_words=stopwords.words('english')
    X = vectorizer.fit_transform(documents).toarray()
    print(X.shape)
    return X

print(get_count_vect(dataset['questions']))

(5000, 1177)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


## Head word feature
Extract the head word from the coupus

In [111]:
# Head word tokenizer
import spacy
nlp = spacy.load("en_core_web_sm")
def head_word_tokenizer(text):
    head_words = []
    for token in nlp(text):
        if token.dep_ == "nsubj" or token.dep_ == "nsubjpass":
            head_words.append(token.text)
#             head_words.append(token.head.text)
    return head_words

In [112]:
head_word_tokenizer("big red dog")

[]

In [113]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

head_words_vectorizer = CountVectorizer(tokenizer = head_word_tokenizer,max_features=100,stop_words=stopwords.words('english'))
head_words_vector = head_words_vectorizer.fit_transform(dataset["questions"].values).toarray()

## Head word Synonyms

In [114]:
from nltk.corpus import wordnet 

def get_syonyms(words):
    all_synonyms = []
    for word in words.split(' '):
        synonyms = []

        for syn in wordnet.synsets(word):
            for l in syn.lemmas():
                synonyms.append(l.name())

        synonyms = list(set(synonyms))
        all_synonyms += synonyms
        
    return all_synonyms

# print(get_syonyms("cat and dog"))

In [115]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

head_words_synonym_vectorizer = CountVectorizer(tokenizer = get_syonyms,max_features=100,stop_words=stopwords.words('english'))
head_words_synonym_vector = head_words_synonym_vectorizer.fit_transform(dataset["questions"].values).toarray()

  'stop_words.' % sorted(inconsistent))


## Bigram

In [116]:
def bigram(x):
    new_words = ""
    pre_word = None
    for word in x.strip().split(' '):
        
        if pre_word is not None:
            new_words += "{}{} ".format(pre_word, word)
        pre_word = word
    return new_words[:-1]

# print(bigram("what are the company which organize shark feeding event for scuba divers "))

dataset['questions_bigram'] = [bigram(question) for question in dataset['questions_stop']]

print(dataset['questions'][test_id])
print(dataset['questions_bigram'][test_id])

What are the companies which organize shark feeding events for scuba divers?
Whatcompanies companiesorganize organizeshark sharkfeeding feedingevents eventsscuba scubadivers


In [None]:
# from sklearn.feature_extraction.text import CountVectorizer
# from nltk.corpus import stopwords

# head_words_vectorizer = CountVectorizer(tokenizer = bigram,max_features=10000)
# head_words_vector = head_words_vectorizer.fit_transform(dataset["questions"].values).toarray()

## Accuracy Evauluation on Different Matrices

In [117]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix


def PRC_matrics(y_test, prediction):
    # calculate prediction
    precision = precision_score(y_test, prediction, labels=[1,2], average='micro')*100
    print('Precision: %.3f' % precision)

    # calculate recall
    recall = recall_score(y_test, prediction, labels=[1,2], average='micro')*100
    print('Recall: %.3f' % recall)
    
    # calculate score
#     score = f1_score(y_test, prediction, average='micro')
    f1_score = 2 * (precision * recall) / (precision + recall)
    print('F-Measure: %.3f' % f1_score)
    
    
    acc = accuracy_score(y_test, prediction)*100
#     tn, fp, fn, tp = confusion_matrix([0, 1, 0, 1], [1, 1, 1, 0]).ravel()
#     acc2 = (tn + tp)*100/(tn + fp + fn + tp)
    print('Accuracy score: %.3f' % acc)
    
    
    cm = confusion_matrix(y_test, prediction)
    print("\nConfustion matrix: \n{}".format(cm))
    
    return precision, recall, f1_score, acc


### Get the y values

In [190]:
def get_encoded_y(feature_set='a'):
    le = LabelEncoder()
    y = le.fit_transform(dataset[feature_set])
    return y

### Default SVM training model

In [191]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


def train_with_svm(XX, y):
    best_prediction = None
    best_test = None
    best_accuracy = 0
    cv = KFold(n_splits=10, random_state=1, shuffle=True)
    fold = 0
    accuracies = []
    for train_index, test_index in cv.split(XX):
        fold += 1
        X_train, X_test = XX[train_index], XX[test_index]
        y_train, y_test = y[train_index], y[test_index]

        SVM = SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
        SVM.fit(X_train,y_train)
        predictions_SVM1 = SVM.predict(X_test)
        acc = accuracy_score(predictions_SVM1, y_test)*100
        if best_accuracy < acc:
            best_accuracy = acc
            best_prediction = predictions_SVM1
            best_test = y_test
            best_model = SVM
        accuracies.append(acc)
        print("Fold - {} - {} - {:.2f}".format(fold, "SVM Accuracy Score -> ",acc))

    print("Mean Accuracy {:.2f} \nStd Accuracy {:.2f}\n\n".format(np.mean(accuracies), np.std(accuracies)))
    
    print("Best accuracy : {}".format(best_accuracy))
    PRC_matrics(best_test, best_prediction)

# SVM - Normal train

# Task 1 - A traditional ML classifier s.a. SVM or Logistic Regression with at least 5  of the features mentioned in the paper.

### Used features

+ ST(BOW)
+ NE(BOW)
+ POS(BOW)
+ Head Word
+ Head Word Synonyms
+ ST(BG)

In [193]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

from scipy.sparse import coo_matrix, csr_matrix, hstack


y = get_encoded_y('a')

X_lem = dataset['lem_questions']
tfidf_lem = TfidfVectorizer(max_features=5000)
tfidf_lem.fit(X_lem)

X_pos = dataset['pos_questions']

X_ne = dataset['ne_questions']

X_bigram = dataset['questions_bigram']

XX = csr_matrix(hstack([tfidf_lem.transform(X_lem) ,get_count_vect(X_pos), get_count_vect(X_ne), get_count_vect(X_bigram),
                        head_words_vector, head_words_synonym_vector]))
XX_simple = csr_matrix(hstack([tfidf_lem.transform(X_lem)]))
XX.shape

(5000, 27)
(5000, 326)
(5000, 527)


(5000, 6080)

## Feature Selection

### 1 - Select K best features

In [152]:
from sklearn.feature_selection import SelectKBest, chi2

X_new = SelectKBest(chi2, k=3000).fit_transform(XX, y)
X_new.shape

(5000, 3000)

### 2 - Remove if variaence less than a specific threshold

In [153]:
from sklearn.feature_selection import VarianceThreshold
X_new_var = VarianceThreshold(threshold=(0.01)).fit_transform(XX)
X_new_var.shape
# selector.fit_transform(features)

(5000, 153)

### Only using ST(BOW)

In [194]:
train_with_svm(XX_simple, get_encoded_y('a'))

Fold - 1 - SVM Accuracy Score ->  - 83.20
Fold - 2 - SVM Accuracy Score ->  - 84.80
Fold - 3 - SVM Accuracy Score ->  - 81.40
Fold - 4 - SVM Accuracy Score ->  - 81.80
Fold - 5 - SVM Accuracy Score ->  - 83.40
Fold - 6 - SVM Accuracy Score ->  - 84.20
Fold - 7 - SVM Accuracy Score ->  - 83.40
Fold - 8 - SVM Accuracy Score ->  - 84.40
Fold - 9 - SVM Accuracy Score ->  - 82.80
Fold - 10 - SVM Accuracy Score ->  - 82.80
Mean Accuracy 83.22 
Std Accuracy 1.03


Best accuracy : 84.8
Precision: 91.304
Recall: 78.505
F-Measure: 84.422
Accuracy score: 84.800

Confustion matrix: 
[[ 72   0   0   5   1   4   0   0]
 [  0  12   0   4   0   9   0   0]
 [  2   0  35   0   0   2   0   0]
 [  4   0   1 108   5   6   0   0]
 [  0   0   0  10  81   3   0   0]
 [  2   0   1   6   2  99   0   0]
 [  0   0   0   0   0   1   0   0]
 [  0   0   0   6   0   2   0  17]]


In [195]:
train_with_svm(XX_simple, get_encoded_y('b'))

Fold - 1 - SVM Accuracy Score ->  - 65.40
Fold - 2 - SVM Accuracy Score ->  - 65.00
Fold - 3 - SVM Accuracy Score ->  - 64.00
Fold - 4 - SVM Accuracy Score ->  - 64.40
Fold - 5 - SVM Accuracy Score ->  - 65.80
Fold - 6 - SVM Accuracy Score ->  - 66.00
Fold - 7 - SVM Accuracy Score ->  - 63.20
Fold - 8 - SVM Accuracy Score ->  - 67.80
Fold - 9 - SVM Accuracy Score ->  - 64.80
Fold - 10 - SVM Accuracy Score ->  - 61.60
Mean Accuracy 64.80 
Std Accuracy 1.59


Best accuracy : 67.80000000000001
Precision: 0.000
Recall: 0.000
F-Measure: nan
Accuracy score: 67.800

Confustion matrix: 
[[ 0  0  0 ...  0  0  0]
 [ 0  5  0 ...  0  0  0]
 [ 0  0 26 ...  0  0  0]
 ...
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## SVM - Train with all the features

In [155]:
train_with_svm(XX, get_encoded_y('a'))

Fold - 1 - SVM Accuracy Score ->  - 75.80
Fold - 2 - SVM Accuracy Score ->  - 80.20
Fold - 3 - SVM Accuracy Score ->  - 77.80
Fold - 4 - SVM Accuracy Score ->  - 76.00
Fold - 5 - SVM Accuracy Score ->  - 77.40
Fold - 6 - SVM Accuracy Score ->  - 81.60
Fold - 7 - SVM Accuracy Score ->  - 78.20
Fold - 8 - SVM Accuracy Score ->  - 77.60
Fold - 9 - SVM Accuracy Score ->  - 76.00
Fold - 10 - SVM Accuracy Score ->  - 75.80
Mean Accuracy 77.64 
Std Accuracy 1.87


Best accuracy : 81.6
Precision: 77.215
Recall: 83.562
F-Measure: 80.263
Accuracy score: 81.600

Confustion matrix: 
[[ 53   0   0   0   0   6   0]
 [  1   8   0   0   0   5   0]
 [  5   0  40   0   1   4   0]
 [  5   0   5 111   8  19   0]
 [  2   1   0   8  94   4   1]
 [  2   2   1   9   1  86   0]
 [  0   0   0   1   1   0  16]]


In [156]:
train_with_svm(XX, get_encoded_y('b'))

Fold - 1 - SVM Accuracy Score ->  - 48.00
Fold - 2 - SVM Accuracy Score ->  - 50.00
Fold - 3 - SVM Accuracy Score ->  - 49.80
Fold - 4 - SVM Accuracy Score ->  - 49.20
Fold - 5 - SVM Accuracy Score ->  - 48.00
Fold - 6 - SVM Accuracy Score ->  - 47.00
Fold - 7 - SVM Accuracy Score ->  - 44.60
Fold - 8 - SVM Accuracy Score ->  - 48.20
Fold - 9 - SVM Accuracy Score ->  - 50.40
Fold - 10 - SVM Accuracy Score ->  - 46.60
Mean Accuracy 48.18 
Std Accuracy 1.69


Best accuracy : 50.4
Precision: 0.000
Recall: 0.000
F-Measure: nan
Accuracy score: 50.400

Confustion matrix: 
[[0 0 0 ... 0 0 0]
 [0 2 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 4 0 1]
 [0 0 0 ... 3 0 0]
 [0 0 0 ... 1 0 1]]


  _warn_prf(average, modifier, msg_start, len(result))


# Word embeddings

## Gensim Doc2Vec - Using Sentence Vectors

In [176]:
import smart_open
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess

def read_corpus(fname, tokens_only=False):
    with smart_open.open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            tokens = simple_preprocess(line)
            if tokens_only:
                yield tokens
            else:
                # For training data, add tags
                yield TaggedDocument(tokens, [i])
lee_train_file = 'questions.txt'
train_corpus = list(read_corpus(lee_train_file))

In [177]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(dataset['questions'])]

model = Doc2Vec(vector_size=500, min_count=2, epochs=40)
# model = Doc2Vec(documents, vector_size=1000, window=2, min_count=1, workers=4)
# train_corpus = dataset['questions'].values

model.build_vocab(train_corpus)
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

def doc2vec(x):
    return np.array(model.infer_vector(x.split(' ')))

X_doc2vec = np.array([doc2vec(question) for question in dataset['questions']])

In [178]:
doc2vec('hello world').shape

(500,)

### SVM Accuracy Coarse

In [179]:
train_with_svm(X_doc2vec, get_encoded_y('a'))

Fold - 1 - SVM Accuracy Score ->  - 49.80
Fold - 2 - SVM Accuracy Score ->  - 53.60
Fold - 3 - SVM Accuracy Score ->  - 51.60
Fold - 4 - SVM Accuracy Score ->  - 52.40
Fold - 5 - SVM Accuracy Score ->  - 51.60
Fold - 6 - SVM Accuracy Score ->  - 55.80
Fold - 7 - SVM Accuracy Score ->  - 52.20
Fold - 8 - SVM Accuracy Score ->  - 49.00
Fold - 9 - SVM Accuracy Score ->  - 52.60
Fold - 10 - SVM Accuracy Score ->  - 50.20
Mean Accuracy 51.88 
Std Accuracy 1.87


Best accuracy : 55.800000000000004
Precision: 52.381
Recall: 45.205
F-Measure: 48.529
Accuracy score: 55.800

Confustion matrix: 
[[33  0  5  6  4 10  1]
 [ 1  0  3  5  0  5  0]
 [ 2  1 23  9  2 13  0]
 [10  0  6 81 22 29  0]
 [ 6  0  2 21 68 13  0]
 [ 9  1  3 17  7 64  0]
 [ 0  0  0  3  1  4 10]]


### SVM Accuracy Fine

In [180]:
train_with_svm(X_doc2vec, get_encoded_y('b'))

Fold - 1 - SVM Accuracy Score ->  - 31.60
Fold - 2 - SVM Accuracy Score ->  - 35.40
Fold - 3 - SVM Accuracy Score ->  - 30.00
Fold - 4 - SVM Accuracy Score ->  - 31.20
Fold - 5 - SVM Accuracy Score ->  - 26.40
Fold - 6 - SVM Accuracy Score ->  - 30.60
Fold - 7 - SVM Accuracy Score ->  - 31.80
Fold - 8 - SVM Accuracy Score ->  - 27.00
Fold - 9 - SVM Accuracy Score ->  - 31.00
Fold - 10 - SVM Accuracy Score ->  - 31.60
Mean Accuracy 30.66 
Std Accuracy 2.41


Best accuracy : 35.4
Precision: 0.000
Recall: 0.000
F-Measure: nan
Accuracy score: 35.400

Confustion matrix: 
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## FastText

### Train fast text train and test model - this didnt give good accuracy

In [181]:
import fasttext
import os
model_name='fasttext_skipgram_travel_questions.bin'

if not os.path.exists(model_name):
    dataset['questions'].to_csv('questions.txt', sep='.', header=False, index=False)
    model_fasttext = fasttext.train_unsupervised('questions.txt', model='skipgram')
    model_fasttext.save_model(model_name)
    print("Model saved as {}".format(model_name))
else:
    print("\"{}\" - model loaded".format(model_name))
    model_fasttext = fasttext.load_model(model_name)

"fasttext_skipgram_travel_questions.bin" - model loaded


In [182]:
model_fasttext.words
len(model_fasttext.get_word_vector("the"))

100

In [230]:
def fast_text_mean_transform(X):
    words = X.split(' ')    
    return np.mean([model_fasttext.get_word_vector(w) for w in words if w in model_fasttext.words]
                    or [np.zeros(100)], axis=0)

In [231]:
def fast_text_first_x_words(X, length):
    from tensorflow.keras.preprocessing.sequence import pad_sequences

    X_lstm = pad_sequences(X, maxlen=length)
    
    words = X.split(' ')    
    return np.mean([ft.get_word_vector(w) for w in words if w in model_fasttext.words]
                    or [np.zeros(100)], axis=0)

In [232]:
dataset['doc2fast_questions'] = [fast_text_mean_transform(question) for question in dataset['questions']]
dataset['doc2fast_questions'][test_id]
dataset['doc2fast_questions'].shape

(5000,)

In [233]:
X_doc2fast = np.array([fast_text_mean_transform(question) for question in dataset['questions']])
le = LabelEncoder()
y_doc2fast = le.fit_transform(dataset['a'])

train_with_svm(X_doc2fast, get_encoded_y('a'))

Fold - 1 - SVM Accuracy Score ->  - 21.20
Fold - 2 - SVM Accuracy Score ->  - 24.80
Fold - 3 - SVM Accuracy Score ->  - 24.80
Fold - 4 - SVM Accuracy Score ->  - 24.80
Fold - 5 - SVM Accuracy Score ->  - 25.00
Fold - 6 - SVM Accuracy Score ->  - 29.60
Fold - 7 - SVM Accuracy Score ->  - 22.00
Fold - 8 - SVM Accuracy Score ->  - 23.40
Fold - 9 - SVM Accuracy Score ->  - 25.20
Fold - 10 - SVM Accuracy Score ->  - 22.40
Mean Accuracy 24.32 
Std Accuracy 2.22


Best accuracy : 29.599999999999998
Precision: 0.000
Recall: 0.000
F-Measure: nan
Accuracy score: 29.600

Confustion matrix: 
[[  0   0   0  59   0   0   0]
 [  0   0   0  14   0   0   0]
 [  0   0   0  50   0   0   0]
 [  0   0   0 148   0   0   0]
 [  0   0   0 110   0   0   0]
 [  0   0   0 101   0   0   0]
 [  0   0   0  18   0   0   0]]


  _warn_prf(average, modifier, msg_start, len(result))


### Download already trained model - this gave better results compaired to the previous

In [None]:
import fasttext.util
fasttext.util.download_model('en', if_exists='ignore')  # English
ft = fasttext.load_model('cc.en.300.bin')

In [238]:
def get_embedding_sentence(sentence):
    
    embedding=ft.get_sentence_vector(sentence)
    return embedding

### Embedding Accuracy Coarse

In [241]:
X_doc2fast = np.array([get_embedding_sentence(question) for question in dataset['clean_questions'].values])
le = LabelEncoder()
y_doc2fast = le.fit_transform(dataset['a'])

train_with_svm(X_doc2fast, get_encoded_y('a'))

Fold - 1 - SVM Accuracy Score ->  - 72.80
Fold - 2 - SVM Accuracy Score ->  - 75.40
Fold - 3 - SVM Accuracy Score ->  - 72.60
Fold - 4 - SVM Accuracy Score ->  - 72.60
Fold - 5 - SVM Accuracy Score ->  - 74.60
Fold - 6 - SVM Accuracy Score ->  - 75.40
Fold - 7 - SVM Accuracy Score ->  - 71.40
Fold - 8 - SVM Accuracy Score ->  - 71.80
Fold - 9 - SVM Accuracy Score ->  - 76.60
Fold - 10 - SVM Accuracy Score ->  - 75.00
Mean Accuracy 73.82 
Std Accuracy 1.69


Best accuracy : 76.6
Precision: 85.484
Recall: 60.227
F-Measure: 70.667
Accuracy score: 76.600

Confustion matrix: 
[[ 0  0  0  0  0  0  0  1  0]
 [ 0 50  0  0 11  0  1  8  1]
 [ 0  1  3  2  1  0  1  9  0]
 [ 0  1  1 44  3  0  0  5  0]
 [ 0  3  0  1 99  0  3 20  0]
 [ 0  0  0  0  1  0  0  0  0]
 [ 0  1  0  0 11  0 77  6  0]
 [ 0  2  0  1 14  0  5 99  0]
 [ 0  0  0  0  1  0  1  1 11]]


### Embedding Accuracy Fine

In [242]:
train_with_svm(X_doc2fast, get_encoded_y('b'))

Fold - 1 - SVM Accuracy Score ->  - 38.80
Fold - 2 - SVM Accuracy Score ->  - 41.40
Fold - 3 - SVM Accuracy Score ->  - 41.40
Fold - 4 - SVM Accuracy Score ->  - 36.80
Fold - 5 - SVM Accuracy Score ->  - 35.80
Fold - 6 - SVM Accuracy Score ->  - 38.80
Fold - 7 - SVM Accuracy Score ->  - 34.80
Fold - 8 - SVM Accuracy Score ->  - 36.00
Fold - 9 - SVM Accuracy Score ->  - 37.60
Fold - 10 - SVM Accuracy Score ->  - 38.20
Mean Accuracy 37.96 
Std Accuracy 2.12


Best accuracy : 41.4
Precision: 0.000
Recall: 0.000
F-Measure: nan
Accuracy score: 41.400

Confustion matrix: 
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Part 3 - A NN classifier s.a. an LSTM for classification

In [243]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 5000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 25
# This is fixed.
EMBEDDING_DIM = 160
epochs = 10
batch_size = 64

In [244]:
from tensorflow.keras.preprocessing.text import Tokenizer

# tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, split=' ')
tokenizer.fit_on_texts(dataset['questions'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 5582 unique tokens.


In [245]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

X = tokenizer.texts_to_sequences(dataset['questions'].values)
print(X[0])
X_lstm = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X_lstm.shape)


def get_dummi_y(_type='a'):
    y = pd.get_dummies(dataset[_type]).values
    print('Shape of label tensor:', y.shape)
    return y




[4, 7, 2, 321, 105, 31, 1837, 17, 68, 9, 20, 71, 6, 194, 48, 32, 22, 376, 111]
Shape of data tensor: (5000, 25)


In [253]:
from tensorflow.keras.layers import Embedding, SpatialDropout1D, LSTM, Dense
from tensorflow.keras import Sequential

def get_lstm_model(X, y, verbose=0):
    model = Sequential()
    model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(196, dropout=0.2, recurrent_dropout=0.2))
    # model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(y.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    if verbose == 1:
        print(model.summary())
    return model

get_lstm_model(X_lstm, get_dummi_y('b'), 1)

Shape of label tensor: (5000, 79)
Model: "sequential_33"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_33 (Embedding)     (None, 25, 160)           800000    
_________________________________________________________________
spatial_dropout1d_33 (Spatia (None, 25, 160)           0         
_________________________________________________________________
lstm_33 (LSTM)               (None, 196)               279888    
_________________________________________________________________
dense_31 (Dense)             (None, 79)                15563     
Total params: 1,095,451
Trainable params: 1,095,451
Non-trainable params: 0
_________________________________________________________________
None


<tensorflow.python.keras.engine.sequential.Sequential at 0x7f794c43cfd0>

In [201]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X_lstm,y_nn, test_size = 0.10, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(4500, 25) (4500, 10)
(500, 25) (500, 10)


In [256]:
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import KFold

epochs = 10
batch_size = 64

def train_LSTM(X, y):
    best_prediction = None
    best_test = None
    best_accuracy = 0

    cv = KFold(n_splits=10, random_state=1, shuffle=True)
    fold = 0
    accuracies = []
    for train_index, test_index in cv.split(X):
        fold += 1
        print("FOLD {}".format(fold))
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model = get_lstm_model(X, y, 0)

        hist = model.fit(X_train, y_train, verbose=0, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])
        predictions = model.predict(X_test)
        for item in hist.history.items():
            mean_val = np.mean(item[1])
            if item[0] == 'accuracy':
                acc = mean_val
            print("Mean {} : {}".format(item[0], mean_val))
        
        
        print("\n")
        if best_accuracy < acc:
            best_accuracy = acc
            best_prediction = predictions
            best_test = y_test
            
    fine_pred = [np.argmax(p) for p in best_prediction]
    fine_gt = [np.argmax(p) for p in best_test]
    PRC_matrics(fine_pred, fine_gt)

### LSTM Accuracy Coarse

In [225]:
train_LSTM(X_lstm, get_dummi_y('a'))

Shape of label tensor: (5000, 10)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Mean loss : 0.6174741334148816
Mean accuracy : 0.7800705560616085
Mean val_loss : 1.0461519019944328
Mean val_accuracy : 0.6441269921404975



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Mean loss : 0.6252083576151303
Mean accuracy : 0.7737213373184204
Mean val_loss : 1.0879763194492884
Mean val_accuracy : 0.6511111089161464



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Mean loss : 0.5357442414388061
Mean accuracy : 0.8099382668733597
Mean val_loss : 1.068048857152462
Mean val_accuracy : 0.6444444414228201



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Mean loss : 0.5495527326129377
Mean accuracy : 0.8051851838827133
Mean val_loss : 1.0520197451114655
Mean val_accuracy : 0.6397222150117159



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Mean loss : 0.49600091204047203
Mean accuracy : 0.8212620152367486
Mean val_loss : 1.0225619938638475
Mean val_accuracy : 0.6570370462205675



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Mean loss : 0.4784498129867845
Mean accuracy : 0.8288614584339989
Mean val_loss : 0.9568546149465773
Mean val_accuracy : 0.6659259266323514



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Mean loss : 0.5371598335914314
Mean accuracy : 0.8068518452346325
Mean val_loss : 1.0739071443676949
Mean val_accuracy : 0.6222222223877907



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Mean loss : 0.6014089845120907
Mean accuracy : 0.784475315362215
Mean val_loss : 1.1431562080979347
Mean val_accuracy : 0.6116666626185179



Precision: 77.215
Recall: 81.333
F-Measure: 79.221
Acc

### LSTM Accuracy Fine

In [255]:
train_LSTM(X_lstm, get_dummi_y('b'))

Shape of label tensor: (5000, 79)
Model: "sequential_34"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_34 (Embedding)     (None, 25, 160)           800000    
_________________________________________________________________
spatial_dropout1d_34 (Spatia (None, 25, 160)           0         
_________________________________________________________________
lstm_34 (LSTM)               (None, 196)               279888    
_________________________________________________________________
dense_32 (Dense)             (None, 79)                15563     
Total params: 1,095,451
Trainable params: 1,095,451
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Mean loss : 1.9871169924736023
Mean accuracy : 0.5100740700960159
Mean val_loss : 2.5803524

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Mean loss : 1.8628714382648468
Mean accuracy : 0.5448641978204251
Mean val_loss : 2.567262887954712
Mean val_accuracy : 0.41533333361148833



Model: "sequential_38"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_38 (Embedding)     (None, 25, 160)           800000    
_________________________________________________________________
spatial_dropout1d_38 (Spatia (None, 25, 160)           0         
_________________________________________________________________
lstm_38 (LSTM)               (None, 196)               279888    
_________________________________________________________________
dense_36 (Dense)             (None, 79)                15563     
Total params: 1,095,451
Trainable params: 1,095,451
Non-trainable params: 0
________________________________________________________________

Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Mean loss : 1.8638234287500381
Mean accuracy : 0.5399506136775016
Mean val_loss : 2.660208749771118
Mean val_accuracy : 0.4020000033080578



Model: "sequential_41"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_41 (Embedding)     (None, 25, 160)           800000    
_________________________________________________________________
spatial_dropout1d_41 (Spatia (None, 25, 160)           0         
_________________________________________________________________
lstm_41 (LSTM)               (None, 196)               279888    
_________________________________________________________________
dense_39 (Dense)             (None, 79)                15563     
Total params: 1,095,451
Trainable params: 1,095,451
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
Epoch

Epoch 8/10
Epoch 9/10
Mean loss : 2.0770145853360495
Mean accuracy : 0.4894101512100961
Mean val_loss : 2.6513061788347034
Mean val_accuracy : 0.39061728450987077



Precision: 0.000
Recall: 0.000
F-Measure: nan
Accuracy score: 57.000

Confustion matrix: 
[[ 2  0  0 ...  0  0  0]
 [ 0  4  1 ...  0  0  0]
 [ 0  0 23 ...  0  0  0]
 ...
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  1  0]
 [ 0  0  0 ...  0  0  0]]




# Part 4. BONUS - experiment with a BERT-based classifier

In [257]:
import tensorflow as tf

from tensorflow.keras import layers
import bert

In [258]:
# Create a bert tockenizer
import tensorflow_hub as hub
import os
os.environ['TFHUB_DOWNLOAD_PROGRESS'] = "1"
os.environ["TFHUB_CACHE_DIR"] = "/tmp/model"

BertTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

Downloaded https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1, Total size: 423.26MB



In [259]:
def tokenize_reviews(text_reviews):
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text_reviews))

In [260]:
tokenized_questions = [tokenize_reviews(qu) for qu in dataset['questions'].values]

In [262]:
bret_y_corse = get_encoded_y('a')
bret_y_fine = get_encoded_y('b')

bert_train_dataset_c = [[question, bret_y_corse[i], len(question)] for i, question in enumerate(tokenized_questions)]
bert_train_dataset_f = [[question, bret_y_fine[i], len(question)] for i, question in enumerate(tokenized_questions)]

In [263]:
import random
random.shuffle(bert_train_dataset)
sorted_bert_train_dataset_c = [(bert_train[0], bert_train[1]) for bert_train in bert_train_dataset_c]
sorted_bert_train_dataset_f = [(bert_train[0], bert_train[1]) for bert_train in bert_train_dataset_f]

In [265]:
import tensorflow as tf

processed_dataset_c = tf.data.Dataset.from_generator(lambda: sorted_bert_train_dataset_c, output_types=(tf.int32, tf.int32))
processed_dataset_f = tf.data.Dataset.from_generator(lambda: sorted_bert_train_dataset_f, output_types=(tf.int32, tf.int32))

In [266]:
BATCH_SIZE = 32
batched_dataset_c = processed_dataset_c.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))
batched_dataset_f = processed_dataset_f.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))

In [267]:
class TEXT_MODEL(tf.keras.Model):
    
    def __init__(self,
                 vocabulary_size,
                 embedding_dimensions=128,
                 cnn_filters=50,
                 dnn_units=512,
                 model_output_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="text_model"):
        super(TEXT_MODEL, self).__init__(name=name)
        
        self.embedding = layers.Embedding(vocabulary_size,
                                          embedding_dimensions)
        self.cnn_layer1 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=2,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer2 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=3,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer3 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=4,
                                        padding="valid",
                                        activation="relu")
        self.pool = layers.GlobalMaxPool1D()
        
        self.dense_1 = layers.Dense(units=dnn_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if model_output_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=model_output_classes,
                                           activation="softmax")
    
    def call(self, inputs, training):
        l = self.embedding(inputs)
        l_1 = self.cnn_layer1(l) 
        l_1 = self.pool(l_1) 
        l_2 = self.cnn_layer2(l) 
        l_2 = self.pool(l_2)
        l_3 = self.cnn_layer3(l)
        l_3 = self.pool(l_3) 
        
        concatenated = tf.concat([l_1, l_2, l_3], axis=-1) # (batch_size, 3 * cnn_filters)
        concatenated = self.dense_1(concatenated)
        concatenated = self.dropout(concatenated, training)
        model_output = self.last_dense(concatenated)
        
        return model_output

In [310]:
VOCAB_LENGTH = len(tokenizer.vocab)
EMB_DIM = 200
CNN_FILTERS = 100
DNN_UNITS = 256
OUTPUT_CLASSES = 10
DROPOUT_RATE = 0.2
NB_EPOCHS = 10

In [311]:
def create_and_compile_bert(output_classes):
    text_model = TEXT_MODEL(vocabulary_size=VOCAB_LENGTH,
                            embedding_dimensions=EMB_DIM,
                            cnn_filters=CNN_FILTERS,
                            dnn_units=DNN_UNITS,
                            model_output_classes=output_classes,
                            dropout_rate=DROPOUT_RATE)
    text_model.compile(loss="sparse_categorical_crossentropy",
                       optimizer="adam",
                       metrics=["sparse_categorical_accuracy"])
    
    return text_model


In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

import math
best_test = None
def train_BERT(batched_dataset, output_classes):
    
    test_accuracies = []
    best_prediction = None
    best_test = None
    best_accuracy = 0
    acc =0
    best_model = None
    
    TOTAL_BATCHES = math.ceil(len(sorted_bert_train_dataset_c) / BATCH_SIZE)
    TEST_BATCHES = TOTAL_BATCHES // 10
    batched_dataset.shuffle(TOTAL_BATCHES)

    cv = KFold(n_splits=10, random_state=1, shuffle=True)
    fold = 0
    accuracies = []
    for i in range(10):
        fold += 1
        print("FOLD {}".format(fold))
        text_model = create_and_compile_bert(output_classes)

        # shuffel and take 10 batches
        batched_dataset.shuffle(TOTAL_BATCHES)
        test_data = batched_dataset.take(TEST_BATCHES)
        train_data = batched_dataset.skip(TEST_BATCHES)


        hist = text_model.fit(train_data, epochs=NB_EPOCHS, verbose=0)
#         results = text_model.evaluate(test_data)
#         print(results)
        for item in hist.history.items():
            mean_val = np.mean(item[1])
            if item[0] == 'sparse_categorical_accuracy':
                acc = mean_val
            print("Mean {} : {}".format(item[0], mean_val))
        results = text_model.evaluate(test_data)
        acc = results[1]*100
        print("Test Accuracy: {}\n\n".format(acc)
        test_accuracies.append(acc)
        if best_accuracy < acc:
            best_accuracy = acc
            best_prediction = text_model.predict(test_data)
            best_test = test_data
            best_model = text_model
    
    print("Overall test accuracy: {}".format(np.mean(test_accuracies)))
    return best_model
#     print(best_test)
#     bert_pred = [np.argmax(p) for p in best_prediction]
#     bert_gt = [np.argmax(p) for p in best_test]
#     PRC_matrics(bert_pred, bert_gt)
            

In [322]:
text_model = train_BERT(batched_dataset_c, 10)

FOLD 1
Mean loss : 0.2827250716043636
Mean sparse_categorical_accuracy : 0.905951327085495
Test Accuracy: 78.75000238418579


FOLD 2
Mean loss : 0.2709576961584389
Mean sparse_categorical_accuracy : 0.9097123891115189
Test Accuracy: 78.75000238418579


FOLD 3
Mean loss : 0.27320860591717067
Mean sparse_categorical_accuracy : 0.9076327443122864
Test Accuracy: 77.49999761581421


FOLD 4
Mean loss : 0.2798503952100873
Mean sparse_categorical_accuracy : 0.9070132791996002
Test Accuracy: 77.49999761581421


FOLD 5
Mean loss : 0.28253342108801005
Mean sparse_categorical_accuracy : 0.9055088490247727
Test Accuracy: 78.33333611488342


FOLD 6
Mean loss : 0.2730441984720528
Mean sparse_categorical_accuracy : 0.908672571182251
Test Accuracy: 78.33333611488342


FOLD 7
Mean loss : 0.28220053799450395
Mean sparse_categorical_accuracy : 0.9063938051462174
Test Accuracy: 78.54166626930237


FOLD 8
Mean loss : 0.27858771923929454
Mean sparse_categorical_accuracy : 0.9060840725898742
Test Accuracy: 77

In [323]:
train_BERT(batched_dataset_f, 79)

FOLD 1
Mean loss : 1.053017887659371
Mean sparse_categorical_accuracy : 0.7568362832069397
Test Accuracy: 59.375


FOLD 2
Mean loss : 1.06547115072608
Mean sparse_categorical_accuracy : 0.751703542470932
Test Accuracy: 57.70833492279053


FOLD 3
Mean loss : 1.0485209930688142
Mean sparse_categorical_accuracy : 0.7557522162795067
Test Accuracy: 57.499998807907104


FOLD 4
Mean loss : 1.0588902793824673
Mean sparse_categorical_accuracy : 0.7546902634203434
Test Accuracy: 58.541667461395264


FOLD 5
Mean loss : 1.05250611230731
Mean sparse_categorical_accuracy : 0.7565044283866882
Test Accuracy: 58.125001192092896


FOLD 6
Mean loss : 1.0637003231793642
Mean sparse_categorical_accuracy : 0.7517477795481682
Test Accuracy: 62.29166388511658


FOLD 7
Mean loss : 1.044200337678194
Mean sparse_categorical_accuracy : 0.7570575274527073
Test Accuracy: 59.79166626930237


FOLD 8
Mean loss : 1.0906424306333065
Mean sparse_categorical_accuracy : 0.748407082259655
Test Accuracy: 58.541667461395264



<__main__.TEXT_MODEL at 0x7f76642aa890>

In [324]:
# prediction = text_model.predict(test_data)
# print(prediction)

In [46]:
bert_pred = [np.argmax(p) for p in predictions]
bert_gt = [np.argmax(p) for p in y_test]
PRC_matrics(bert_pred, bert_gt)

Precision: 76.768
Recall: 89.412
F-Measure: 82.609
Accuracy score: 81.400
Confustion matrix: 
[[65  0  1  3  1  1  0]
 [ 0 11  1  1  0  1  0]
 [ 1  1 52  2  0  0  0]
 [ 2  1  1 84  4  9  1]
 [ 0  0  0 10 82  4  0]
 [ 8 10  5 12 11 99  2]
 [ 0  0  0  0  0  0 14]]


(76.76767676767676, 89.41176470588236, 82.60869565217392, 81.39999999999999)