In [1]:
import pandas as pd
import nltk
import numpy as np

import tensorflow as tf
physical_devices = tf.config.experimental.list_physical_devices('GPU')
assert len(physical_devices) > 0, "Not enough GPU hardware devices available"
# config = tf.config.experimental.set_memory_growth(physical_devices[0], True)
tf.config.experimental.set_virtual_device_configuration(physical_devices[0], [
tf.config.experimental.VirtualDeviceConfiguration(memory_limit=2000)])

In [2]:
file_name = 'data/Travel-Dataset-5000--master/5000TravelQuestionsDataset.xlsx'
test_id  = 1
col_names = ['questions', 'a', 'b']
dataset = pd.read_excel(file_name, header=None, names=col_names)
dataset['questions'].dropna(inplace=True)
print(dataset.info())
print(dataset['a'].unique())
print(dataset['questions'][test_id])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   questions  5000 non-null   object
 1   a          5000 non-null   object
 2   b          5000 non-null   object
dtypes: object(3)
memory usage: 117.3+ KB
None
['TTD' 'TGU' 'ACM' 'TRS' 'WTH' 'FOD' 'ENT' 'TGU\n' 'TTD\n' '\nENT']
What are the companies which organize shark feeding events for scuba divers?


# PreProcessing

In [3]:
import re

def clean_text(text):
    """
    Applies some pre-processing on the given text.

    Steps :
    - Removing HTML tags
    - Removing punctuation
    - Lowering text
    """
    
    # remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # remove the characters [\], ['] and ["]
    text = re.sub(r"\\", "", text)    
    text = re.sub(r"\'", "", text)    
    text = re.sub(r"\"", "", text)    
    
    # convert text to lowercase
    text = text.strip().lower()
    
    # replace punctuation characters with spaces
    filters='!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    translate_dict = dict((c, " ") for c in filters)
    translate_map = str.maketrans(translate_dict)
    text = text.translate(translate_map)

    return text

dataset['clean_questions'] = [clean_text(question) for question in dataset['questions']]
dataset['clean_questions'][test_id]

'what are the companies which organize shark feeding events for scuba divers '

In [4]:
# def lowercase(x):
#     return x.lower()

# dataset['questions'] = [clean_text(question) for question in dataset['questions']]
# dataset['lc_questions'][test_id]

In [5]:
# import string

# def remove_punctuation(x):
#     return "".join([char for char in x if char not in string.punctuation])

# dataset['questions'] = [remove_punctuation(question) for question in dataset['questions']]
# dataset['questions'][test_id]

In [6]:
# from nltk.corpus import stopwords
# stop_words = stopwords.words('english')
# from nltk import word_tokenize

# def remove_stopwords(x):
#     words = word_tokenize(x)
#     return " ".join([word for word in words if word not in stop_words])

# dataset['questions'] = [remove_stopwords(question) for question in dataset['questions']]
# dataset['questions'][test_id]
# print(stop_words)

In [7]:
# from nltk.stem.porter import PorterStemmer
# porter = PorterStemmer()
# def stemming(x):
#     filtered_words = word_tokenize(x['questions'])
#     stemmed = [porter.stem(word) for word in filtered_words]
#     return " ".join(stemmed)

# dataset['questions'] = [remove_stopwords(question) for question in dataset['questions']]
# dataset['questions'][test_id]

# Create features

## Lemmatize and create BOW

In [8]:
from nltk.stem import WordNetLemmatizer 
  
lemmatizer = WordNetLemmatizer()
def lemmatize(x):
    filtered_words = nltk.word_tokenize(x)
    lemmatized = [lemmatizer.lemmatize(word) for word in filtered_words]
    return " ".join(lemmatized)

dataset['lem_questions'] = [lemmatize(question) for question in dataset['clean_questions']]
dataset['lem_questions'][test_id]

'what are the company which organize shark feeding event for scuba diver'

## POS Tags

In [9]:
from nltk import pos_tag
def pos_tagging(x):
    words = nltk.word_tokenize(x)
    lst = [ r[1] for r in pos_tag(words)] 
    return ' '.join(lst)

dataset['pos_questions'] = [pos_tagging(question) for question in dataset['questions']]
dataset['pos_questions'][test_id]

'WP VBP DT NNS WDT VBP NN NN NNS IN NN NNS .'

## Naming Entities

In [10]:
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

def get_continuous_chunks(text):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    prev = None
    continuous_chunk = []
    current_chunk = []

    for i in chunked:
        if type(i) == Tree:
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
                current_chunk = []
        else:
            continue

    if continuous_chunk:
        named_entity = " ".join(current_chunk)
        if named_entity not in continuous_chunk:
            continuous_chunk.append(named_entity)
    
    def remove_null(x):
        if '' in x:
            x.remove('')
        return x

    lst = remove_null(continuous_chunk)
    return ' '.join(lst)

txt = "Barack Obama is a great person." 
txt2 = "Who is Dulan?"
print (get_continuous_chunks(txt2))



dataset['ne_questions'] = [get_continuous_chunks(question) for question in dataset['questions']]
dataset['ne_questions'][test_id]

Dulan


''

## Count vectorizer (BOW)

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

def get_count_vect(documents):
    vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
    X = vectorizer.fit_transform(documents).toarray()
    print(X.shape)
    return X

print(get_count_vect(dataset['questions']))

(5000, 1090)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


## Head word feature

In [12]:
# Head word tokenizer
import spacy
nlp = spacy.load("en_core_web_sm")
def head_word_tokenizer(text):
    head_words = []
    for token in nlp(text):
        if token.dep_ == "nsubj" or token.dep_ == "nsubjpass":
            head_words.append(token.text)
            head_words.append(token.head.text)
    return head_words

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

head_words_vectorizer = CountVectorizer(tokenizer = head_word_tokenizer,max_features=100,stop_words=stopwords.words('english'))
head_words_vector = head_words_vectorizer.fit_transform(dataset["questions"].values).toarray()



  'stop_words.' % sorted(inconsistent))


# SVM - Normal train

In [30]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

X, y = dataset['lem_questions'],dataset['a']

tfidf = TfidfVectorizer(max_features=5000)
tfidf.fit(X)

le = LabelEncoder()
le.fit(y)


cv = KFold(n_splits=10, random_state=1, shuffle=True)
fold = 0
accuracies = []
for train_index, test_index in cv.split(X):
    fold += 1
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    y_train_e = le.transform(y_train)
    y_test_e = le.transform(y_test)
    
    X_train_tfidf = tfidf.transform(X_train)
    X_test_tfidf = tfidf.transform(X_test)
    print(X_train_tfidf.shape)
    
    SVM = SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
    SVM.fit(X_train_tfidf,y_train_e)
    predictions_SVM = SVM.predict(X_test_tfidf)
    acc = accuracy_score(predictions_SVM, y_test_e)*100
    accuracies.append(acc)
    print("Fold - {} - {} - {:.2f}".format(fold, "SVM Accuracy Score -> ",acc))
    
print("Mean {:.2f} Std {:.2f}".format(np.mean(accuracies), np.std(accuracies)))


(4500, 5000)
Fold - 1 - SVM Accuracy Score ->  - 83.20
(4500, 5000)
Fold - 2 - SVM Accuracy Score ->  - 84.80
(4500, 5000)
Fold - 3 - SVM Accuracy Score ->  - 81.40
(4500, 5000)
Fold - 4 - SVM Accuracy Score ->  - 81.80
(4500, 5000)
Fold - 5 - SVM Accuracy Score ->  - 83.40
(4500, 5000)
Fold - 6 - SVM Accuracy Score ->  - 84.20
(4500, 5000)
Fold - 7 - SVM Accuracy Score ->  - 83.40
(4500, 5000)
Fold - 8 - SVM Accuracy Score ->  - 84.40
(4500, 5000)
Fold - 9 - SVM Accuracy Score ->  - 82.80
(4500, 5000)
Fold - 10 - SVM Accuracy Score ->  - 82.80
Mean 83.22 Std 1.03


In [31]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

X, y = dataset['pos_questions'],dataset['a']

tfidf = TfidfVectorizer(max_features=5000)
tfidf.fit(X)

le = LabelEncoder()
le.fit(y)


cv = KFold(n_splits=10, random_state=1, shuffle=True)
fold = 0
accuracies = []
for train_index, test_index in cv.split(X):
    fold += 1
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    y_train_e = le.transform(y_train)
    y_test_e = le.transform(y_test)
    
    X_train_tfidf = tfidf.transform(X_train)
    X_test_tfidf = tfidf.transform(X_test)
    print(X_train_tfidf.shape)
    
    SVM = SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
    SVM.fit(X_train_tfidf,y_train_e)
    predictions_SVM = SVM.predict(X_test_tfidf)
    acc = accuracy_score(predictions_SVM, y_test_e)*100
    accuracies.append(acc)
    print("Fold - {} - {} - {:.2f}".format(fold, "SVM Accuracy Score -> ",acc))
    
print("Mean {:.2f} Std {:.2f}".format(np.mean(accuracies), np.std(accuracies)))

(4500, 31)
Fold - 1 - SVM Accuracy Score ->  - 38.80
(4500, 31)
Fold - 2 - SVM Accuracy Score ->  - 40.00
(4500, 31)
Fold - 3 - SVM Accuracy Score ->  - 39.80
(4500, 31)
Fold - 4 - SVM Accuracy Score ->  - 42.20
(4500, 31)
Fold - 5 - SVM Accuracy Score ->  - 41.80
(4500, 31)
Fold - 6 - SVM Accuracy Score ->  - 40.00
(4500, 31)
Fold - 7 - SVM Accuracy Score ->  - 39.40
(4500, 31)
Fold - 8 - SVM Accuracy Score ->  - 37.00
(4500, 31)
Fold - 9 - SVM Accuracy Score ->  - 40.20
(4500, 31)
Fold - 10 - SVM Accuracy Score ->  - 33.80
Mean 39.30 Std 2.29


## Accuracy Evauluation on Different Matrices

In [25]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix


def PRC_matrics(y_test, prediction):
    # calculate prediction
    precision = precision_score(y_test, prediction, labels=[1,2], average='micro')*100
    print('Precision: %.3f' % precision)

    # calculate recall
    recall = recall_score(y_test, prediction, labels=[1,2], average='micro')*100
    print('Recall: %.3f' % recall)
    
    # calculate score
#     score = f1_score(y_test, prediction, average='micro')
    f1_score = 2 * (precision * recall) / (precision + recall)
    print('F-Measure: %.3f' % f1_score)
    
    
    acc = accuracy_score(y_test, prediction)*100
#     tn, fp, fn, tp = confusion_matrix([0, 1, 0, 1], [1, 1, 1, 0]).ravel()
#     acc2 = (tn + tp)*100/(tn + fp + fn + tp)
    print('Accuracy score: %.3f' % acc)
    
    
    cm = confusion_matrix(y_test, prediction)
    print("Confustion matrix: \n{}".format(cm))
    
    return precision, recall, f1_score, acc


# Task 1 - A traditional ML classifier s.a. SVM or Logistic Regression with at least 5  of the features mentioned in the paper.

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

from scipy.sparse import coo_matrix, csr_matrix, hstack

le = LabelEncoder()
y = le.fit_transform(dataset['a'])

X_lem = dataset['lem_questions']
tfidf_lem = TfidfVectorizer(max_features=5000)
tfidf_lem.fit(X_lem)

X_pos = dataset['pos_questions']

X_ne = dataset['ne_questions']

XX = csr_matrix(hstack([tfidf_lem.transform(X_lem) ,get_count_vect(X_pos), get_count_vect(X_ne), head_words_vector]))

XX.shape

(5000, 26)
(5000, 322)


(5000, 5448)

## SVM - Train with features

In [50]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


cv = KFold(n_splits=10, random_state=1, shuffle=True)
fold = 0
accuracies = []
for train_index, test_index in cv.split(XX):
    fold += 1
    X_train, X_test = XX[train_index], XX[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    print(X_train.shape)

    
    SVM = SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
    SVM.fit(X_train,y_train)
    predictions_SVM1 = SVM.predict(X_test)
    acc = accuracy_score(predictions_SVM1, y_test)*100
    accuracies.append(acc)
    print("Fold - {} - {} - {:.2f}".format(fold, "SVM Accuracy Score -> ",acc))
    
print("Mean {:.2f} Std {:.2f}".format(np.mean(accuracies), np.std(accuracies)))

(4500, 5448)
Fold - 1 - SVM Accuracy Score ->  - 78.20
(4500, 5448)
Fold - 2 - SVM Accuracy Score ->  - 80.60
(4500, 5448)
Fold - 3 - SVM Accuracy Score ->  - 78.60
(4500, 5448)
Fold - 4 - SVM Accuracy Score ->  - 80.80
(4500, 5448)
Fold - 5 - SVM Accuracy Score ->  - 80.60
(4500, 5448)
Fold - 6 - SVM Accuracy Score ->  - 81.80
(4500, 5448)
Fold - 7 - SVM Accuracy Score ->  - 80.80
(4500, 5448)
Fold - 8 - SVM Accuracy Score ->  - 78.60
(4500, 5448)
Fold - 9 - SVM Accuracy Score ->  - 79.80
(4500, 5448)
Fold - 10 - SVM Accuracy Score ->  - 77.20
Mean 79.70 Std 1.39


In [52]:
PRC_matrics(y_test, predictions_SVM1)

Precision: 85.882
Recall: 73.737
F-Measure: 79.348
Accuracy score: 77.200
Confustion matrix: 
[[61  0  2  6  0  7  0]
 [ 0 12  1  2  0  8  0]
 [ 3  1 47  4  0  5  0]
 [ 3  1  1 81  9 17  0]
 [ 2  1  0 10 74 11  0]
 [ 1  0  0 13  4 96  0]
 [ 0  0  0  2  0  0 15]]


(85.88235294117646, 73.73737373737373, 79.34782608695652, 77.2)

# Word embeddings

## Gensim Doc2Vec - Using Sentence Vectors

In [40]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(dataset['questions'])]
model = Doc2Vec(documents, vector_size=1000, window=2, min_count=1, workers=4)

def doc2vec(x):
    return np.array(model.infer_vector(x.split(' ')))

In [41]:
doc2vec('hello world').shape

(1000,)

In [42]:
# _id = 3
# vector = model.infer_vector(dataset['questions'][_id].split(' '))
# print(vector)
# print(dataset['questions'][_id])

dataset['doc2vec_questions'] = [doc2vec(question) for question in dataset['questions']]
dataset['doc2vec_questions'][test_id]
dataset['doc2vec_questions'].shape

(5000,)

In [53]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

X_doc2vec = np.array([doc2vec(question) for question in dataset['questions']])
le = LabelEncoder()
y_doc2vec = le.fit_transform(dataset['a'])


cv = KFold(n_splits=10, random_state=1, shuffle=True)
fold = 0
accuracies = []
for train_index, test_index in cv.split(X_doc2vec):
    fold += 1
    X_train, X_test = X_doc2vec[train_index], X_doc2vec[test_index]
    y_train, y_test = y_doc2vec[train_index], y_doc2vec[test_index]
    
    print(X_train.shape)
    print(y_train.shape)
    
    SVM = SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
    SVM.fit(X_train,y_train)
    predictions_SVM = SVM.predict(X_test)
    acc = accuracy_score(predictions_SVM, y_test)*100
    accuracies.append(acc)
    print("Fold - {} - {} - {:.2f}".format(fold, "SVM Accuracy Score -> ",acc))
    PRC_matrics(y_test, predictions_SVM)
    
print("Mean {:.2f} Std {:.2f}".format(np.mean(accuracies), np.std(accuracies)))

(4500, 1000)
(4500,)
Fold - 1 - SVM Accuracy Score ->  - 21.20
Precision: 0.000
Recall: 0.000
F-Measure: nan
Accuracy score: 21.200
Confustion matrix: 
[[  0   0   0  81   0   0   0]
 [  0   0   0  25   0   0   0]
 [  0   0   0  56   0   0   0]
 [  0   0   0 106   0   0   0]
 [  0   0   0  96   0   0   0]
 [  0   0   0 118   0   0   0]
 [  0   0   0  18   0   0   0]]
(4500, 1000)
(4500,)


  _warn_prf(average, modifier, msg_start, len(result))


Fold - 2 - SVM Accuracy Score ->  - 27.00
Precision: 0.000
Recall: 0.000
F-Measure: nan
Accuracy score: 27.000
Confustion matrix: 
[[ 0  0  0 18  0 64  0  0]
 [ 0  0  0  4  0 21  0  0]
 [ 0  0  0 10  0 29  0  0]
 [ 0  0  0 45  0 79  0  0]
 [ 0  0  0 16  0 78  0  0]
 [ 0  0  0 20  0 90  0  0]
 [ 0  0  0  0  0  1  0  0]
 [ 0  0  0  2  0 23  0  0]]
(4500, 1000)
(4500,)


  _warn_prf(average, modifier, msg_start, len(result))


Fold - 3 - SVM Accuracy Score ->  - 26.00
Precision: 0.000
Recall: 0.000
F-Measure: nan
Accuracy score: 26.000
Confustion matrix: 
[[ 0  0  0 13  0  0 59  0]
 [ 0  0  0  3  0  0 13  0]
 [ 0  0  0  7  0  0 45  0]
 [ 0  0  0 31  0  0 94  0]
 [ 0  0  0  0  0  0  1  0]
 [ 0  0  0 13  0  0 85  0]
 [ 0  0  0 21  0  0 99  0]
 [ 0  0  0  1  0  0 15  0]]
(4500, 1000)
(4500,)


  _warn_prf(average, modifier, msg_start, len(result))


Fold - 4 - SVM Accuracy Score ->  - 25.80
Precision: 0.000
Recall: 0.000
F-Measure: nan
Accuracy score: 25.800
Confustion matrix: 
[[ 0  0  0 10  0 60  0]
 [ 0  0  0  3  0 21  0]
 [ 0  0  0  8  0 49  0]
 [ 0  0  0 45  0 79  0]
 [ 0  0  0 24  0 78  0]
 [ 0  0  0 22  0 84  0]
 [ 0  0  0  1  0 16  0]]
(4500, 1000)
(4500,)


  _warn_prf(average, modifier, msg_start, len(result))


Fold - 5 - SVM Accuracy Score ->  - 23.80
Precision: 0.000
Recall: 0.000
F-Measure: nan
Accuracy score: 23.800
Confustion matrix: 
[[ 0  0  0  8  0 49  0]
 [ 0  0  0  5  0 25  0]
 [ 0  0  0 15  0 34  0]
 [ 0  0  0 33  0 92  0]
 [ 0  0  0 25  0 85  0]
 [ 0  0  0 28  0 86  0]
 [ 0  0  0  1  0 14  0]]
(4500, 1000)
(4500,)


  _warn_prf(average, modifier, msg_start, len(result))


Fold - 6 - SVM Accuracy Score ->  - 23.60
Precision: 0.000
Recall: 0.000
F-Measure: nan
Accuracy score: 23.600
Confustion matrix: 
[[  0   0   0  13   0  46   0]
 [  0   0   0   4   0  10   0]
 [  0   0   0   9   0  41   0]
 [  0   0   0  35   0 113   0]
 [  0   0   0  22   0  88   0]
 [  0   0   0  18   0  83   0]
 [  0   0   0   4   0  14   0]]
(4500, 1000)
(4500,)


  _warn_prf(average, modifier, msg_start, len(result))


Fold - 7 - SVM Accuracy Score ->  - 22.00
Precision: 0.000
Recall: 0.000
F-Measure: nan
Accuracy score: 22.000
Confustion matrix: 
[[  0   0   0   0   1   0   0   0]
 [  0   0   0   0  89   0   0   0]
 [  0   0   0   0  24   0   0   0]
 [  0   0   0   0  43   0   0   0]
 [  0   0   0   0 110   0   0   0]
 [  0   0   0   0  98   0   0   0]
 [  0   0   0   0 120   0   0   0]
 [  0   0   0   0  15   0   0   0]]
(4500, 1000)
(4500,)


  _warn_prf(average, modifier, msg_start, len(result))


Fold - 8 - SVM Accuracy Score ->  - 24.40
Precision: 0.000
Recall: 0.000
F-Measure: nan
Accuracy score: 24.400
Confustion matrix: 
[[ 0  0  0 16  0  0 47  0]
 [ 0  0  0  2  0  0 14  0]
 [ 0  0  0 11  0  0 50  0]
 [ 0  0  0 30  0  0 87  0]
 [ 0  0  0  0  0  0  1  0]
 [ 0  0  0 23  0  0 87  0]
 [ 0  0  0 23  0  0 92  0]
 [ 0  0  0  0  0  0 17  0]]
(4500, 1000)
(4500,)


  _warn_prf(average, modifier, msg_start, len(result))


Fold - 9 - SVM Accuracy Score ->  - 26.00
Precision: 0.000
Recall: 0.000
F-Measure: nan
Accuracy score: 26.000
Confustion matrix: 
[[ 0  0  0  0  0  0  0  1  0]
 [ 0  0  0  0 13  0  0 58  0]
 [ 0  0  0  0  2  0  0 15  0]
 [ 0  0  0  0  6  0  0 48  0]
 [ 0  0  0  0 31  0  0 95  0]
 [ 0  0  0  0  0  0  0  1  0]
 [ 0  0  0  0 17  0  0 78  0]
 [ 0  0  0  0 22  0  0 99  0]
 [ 0  0  0  0  2  0  0 12  0]]
(4500, 1000)
(4500,)


  _warn_prf(average, modifier, msg_start, len(result))


Fold - 10 - SVM Accuracy Score ->  - 23.40
Precision: 0.000
Recall: 0.000
F-Measure: nan
Accuracy score: 23.400
Confustion matrix: 
[[ 0  0  0 20  0 56  0]
 [ 0  0  0  5  0 18  0]
 [ 0  0  0 11  0 49  0]
 [ 0  0  0 23  0 89  0]
 [ 0  0  0 19  0 79  0]
 [ 0  0  0 20  0 94  0]
 [ 0  0  0  1  0 16  0]]
Mean 24.32 Std 1.78


  _warn_prf(average, modifier, msg_start, len(result))


In [92]:
y_train[1:10]

array([7, 4, 7, 1, 6, 1, 4, 9, 7])

## FastText

In [54]:
import fasttext
import os
model_name='fasttext_skipgram_travel_questions.bin'

if not os.path.exists(model_name):
    dataset['questions'].to_csv('questions.txt', sep='.', header=False, index=False)
    model_fasttext = fasttext.train_unsupervised('questions.txt', model='skipgram')
    model_fasttext.save_model(model_name)
    print("Model saved as {}".format(model_name))
else:
    print("\"{}\" - model loaded".format(model_name))
    model_fasttext = fasttext.load_model(model_name)

"fasttext_skipgram_travel_questions.bin" - model loaded


In [55]:
model_fasttext.words

len(model_fasttext.get_word_vector("the"))

100

In [56]:
def fast_text_mean_transform(X):
    words = X.split(' ')    
    return np.mean([model_fasttext.get_word_vector(w) for w in words if w in model_fasttext.words]
                    or [np.zeros(100)], axis=0)

In [57]:
def fast_text_first_x_words(X, length):
    from tensorflow.keras.preprocessing.sequence import pad_sequences

    X_lstm = pad_sequences(X, maxlen=length)
    
    words = X.split(' ')    
    return np.mean([model_fasttext.get_word_vector(w) for w in words if w in model_fasttext.words]
                    or [np.zeros(100)], axis=0)

In [59]:
# inp = 'the jaya'
# length = 5
# pad_sequences(inp.split(' '), maxlen=length)

In [None]:
# len(fast_text_mean_transform('dulan jaya'))
# fast_text_mean_transform('the jaya')

In [60]:
dataset['doc2fast_questions'] = [fast_text_mean_transform(question) for question in dataset['questions']]
dataset['doc2fast_questions'][test_id]
dataset['doc2fast_questions'].shape

(5000,)

In [61]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

X_doc2fast = np.array([fast_text_mean_transform(question) for question in dataset['questions']])
le = LabelEncoder()
y_doc2fast = le.fit_transform(dataset['a'])

print(X_doc2fast.shape)

cv = KFold(n_splits=10, random_state=1, shuffle=True)
fold = 0
accuracies = []
for train_index, test_index in cv.split(X_doc2fast):
    fold += 1
    X_train, X_test = X_doc2fast[train_index], X_doc2fast[test_index]
    y_train, y_test = y_doc2fast[train_index], y_doc2fast[test_index]
    
    print(X_train.shape)
    print(y_train.shape)
    
    SVM = SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
    SVM.fit(X_train,y_train)
    predictions_SVM = SVM.predict(X_test)
    acc = accuracy_score(predictions_SVM, y_test)*100
    accuracies.append(acc)
    print("Fold - {} - {} - {:.2f}".format(fold, "SVM Accuracy Score -> ",acc))
    PRC_matrics(y_test, predictions_SVM)
    
print("Mean {:.2f} Std {:.2f}".format(np.mean(accuracies), np.std(accuracies)))

(5000, 100)
(4500, 100)
(4500,)
Fold - 1 - SVM Accuracy Score ->  - 21.20
Precision: 0.000
Recall: 0.000
F-Measure: nan
Accuracy score: 21.200
Confustion matrix: 
[[  0   0   0  81   0   0   0]
 [  0   0   0  25   0   0   0]
 [  0   0   0  56   0   0   0]
 [  0   0   0 105   1   0   0]
 [  0   0   0  95   1   0   0]
 [  0   0   0 118   0   0   0]
 [  0   0   0  18   0   0   0]]
(4500, 100)
(4500,)


  _warn_prf(average, modifier, msg_start, len(result))


Fold - 2 - SVM Accuracy Score ->  - 24.80
Precision: 0.000
Recall: 0.000
F-Measure: nan
Accuracy score: 24.800
Confustion matrix: 
[[  0   0   0  82   0   0   0   0]
 [  0   0   0  25   0   0   0   0]
 [  0   0   0  39   0   0   0   0]
 [  0   0   0 124   0   0   0   0]
 [  0   0   0  94   0   0   0   0]
 [  0   0   0 110   0   0   0   0]
 [  0   0   0   1   0   0   0   0]
 [  0   0   0  25   0   0   0   0]]
(4500, 100)
(4500,)


  _warn_prf(average, modifier, msg_start, len(result))


Fold - 3 - SVM Accuracy Score ->  - 24.80
Precision: 0.000
Recall: 0.000
F-Measure: nan
Accuracy score: 24.800
Confustion matrix: 
[[  0   0   0  72   0   0   0   0]
 [  0   0   0  16   0   0   0   0]
 [  0   0   0  52   0   0   0   0]
 [  0   0   0 124   0   1   0   0]
 [  0   0   0   1   0   0   0   0]
 [  0   0   0  98   0   0   0   0]
 [  0   0   0 120   0   0   0   0]
 [  0   0   0  16   0   0   0   0]]
(4500, 100)
(4500,)


  _warn_prf(average, modifier, msg_start, len(result))


Fold - 4 - SVM Accuracy Score ->  - 24.80
Precision: 0.000
Recall: 0.000
F-Measure: nan
Accuracy score: 24.800
Confustion matrix: 
[[  0   0   0  70   0   0   0]
 [  0   0   0  24   0   0   0]
 [  0   0   0  57   0   0   0]
 [  0   0   0 124   0   0   0]
 [  0   0   0 102   0   0   0]
 [  0   0   0 106   0   0   0]
 [  0   0   0  17   0   0   0]]
(4500, 100)
(4500,)


  _warn_prf(average, modifier, msg_start, len(result))


Fold - 5 - SVM Accuracy Score ->  - 25.00
Precision: 0.000
Recall: 0.000
F-Measure: nan
Accuracy score: 25.000
Confustion matrix: 
[[  0   0   0  57   0   0   0]
 [  0   0   0  30   0   0   0]
 [  0   0   0  49   0   0   0]
 [  0   0   0 125   0   0   0]
 [  0   0   0 110   0   0   0]
 [  0   0   0 114   0   0   0]
 [  0   0   0  15   0   0   0]]
(4500, 100)
(4500,)


  _warn_prf(average, modifier, msg_start, len(result))


Fold - 6 - SVM Accuracy Score ->  - 29.60
Precision: 0.000
Recall: 0.000
F-Measure: nan
Accuracy score: 29.600
Confustion matrix: 
[[  0   0   0  59   0   0   0]
 [  0   0   0  14   0   0   0]
 [  0   0   0  50   0   0   0]
 [  0   0   0 148   0   0   0]
 [  0   0   0 110   0   0   0]
 [  0   0   0 101   0   0   0]
 [  0   0   0  18   0   0   0]]
(4500, 100)
(4500,)


  _warn_prf(average, modifier, msg_start, len(result))


Fold - 7 - SVM Accuracy Score ->  - 22.00
Precision: 0.000
Recall: 0.000
F-Measure: nan
Accuracy score: 22.000
Confustion matrix: 
[[  0   0   0   0   1   0   0   0]
 [  0   0   0   0  89   0   0   0]
 [  0   0   0   0  24   0   0   0]
 [  0   0   0   0  43   0   0   0]
 [  0   0   0   0 110   0   0   0]
 [  0   0   0   0  98   0   0   0]
 [  0   0   0   0 120   0   0   0]
 [  0   0   0   0  15   0   0   0]]
(4500, 100)
(4500,)


  _warn_prf(average, modifier, msg_start, len(result))


Fold - 8 - SVM Accuracy Score ->  - 23.40
Precision: 0.000
Recall: 0.000
F-Measure: nan
Accuracy score: 23.400
Confustion matrix: 
[[  0   0   0  62   0   1   0   0]
 [  0   0   0  16   0   0   0   0]
 [  0   0   0  61   0   0   0   0]
 [  0   0   0 117   0   0   0   0]
 [  0   0   0   1   0   0   0   0]
 [  0   0   0 110   0   0   0   0]
 [  0   0   0 115   0   0   0   0]
 [  0   0   0  17   0   0   0   0]]
(4500, 100)
(4500,)


  _warn_prf(average, modifier, msg_start, len(result))


Fold - 9 - SVM Accuracy Score ->  - 25.20
Precision: 0.000
Recall: 0.000
F-Measure: nan
Accuracy score: 25.200
Confustion matrix: 
[[  0   0   0   0   1   0   0   0   0]
 [  0   0   0   0  71   0   0   0   0]
 [  0   0   0   0  17   0   0   0   0]
 [  0   0   0   0  54   0   0   0   0]
 [  0   0   0   0 126   0   0   0   0]
 [  0   0   0   0   1   0   0   0   0]
 [  0   0   0   0  95   0   0   0   0]
 [  0   0   0   0 121   0   0   0   0]
 [  0   0   0   0  14   0   0   0   0]]
(4500, 100)
(4500,)


  _warn_prf(average, modifier, msg_start, len(result))


Fold - 10 - SVM Accuracy Score ->  - 22.40
Precision: 0.000
Recall: 0.000
F-Measure: nan
Accuracy score: 22.400
Confustion matrix: 
[[  0   0   0  74   2   0   0]
 [  0   0   0  23   0   0   0]
 [  0   0   0  60   0   0   0]
 [  0   0   0 112   0   0   0]
 [  0   0   0  98   0   0   0]
 [  0   0   0 114   0   0   0]
 [  0   0   0  17   0   0   0]]
Mean 24.32 Std 2.22


  _warn_prf(average, modifier, msg_start, len(result))


In [116]:
predictions_SVM[1:10]

array([4, 4, 4, 4, 4, 4, 4, 4, 4])

In [None]:
PRC_matrics(y_test, predictions_SVM)

# Train a word Embedding Layer

# Question 3 - A NN classifier s.a. an LSTM for classification

In [23]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 5000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 25
# This is fixed.
EMBEDDING_DIM = 160
epochs = 10
batch_size = 64

In [15]:
from tensorflow.keras.preprocessing.text import Tokenizer

# tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, split=' ')
tokenizer.fit_on_texts(dataset['questions'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 5582 unique tokens.


In [16]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

X = tokenizer.texts_to_sequences(dataset['questions'].values)
print(X[0])
X_lstm = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X_lstm.shape)

y_nn = pd.get_dummies(dataset['a']).values
print('Shape of label tensor:', y_nn.shape)

X[0]

[4, 7, 2, 321, 105, 31, 1837, 17, 68, 9, 20, 71, 6, 194, 48, 32, 22, 376, 111]
Shape of data tensor: (5000, 25)
Shape of label tensor: (5000, 10)


[4, 7, 2, 321, 105, 31, 1837, 17, 68, 9, 20, 71, 6, 194, 48, 32, 22, 376, 111]

In [17]:
from tensorflow.keras.layers import Embedding, SpatialDropout1D, LSTM, Dense
from tensorflow.keras import Sequential

def get_lstm_model(verbose=0):
    model = Sequential()
    model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X_lstm.shape[1]))
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(196, dropout=0.2, recurrent_dropout=0.2))
    # model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(10, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    epochs = 5
    batch_size = 64
    
    if verbose == 1:
        print(model.summary())
    return model

get_lstm_model(1)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 25, 160)           800000    
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 25, 160)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 196)               279888    
_________________________________________________________________
dense (Dense)                (None, 10)                1970      
Total params: 1,081,858
Trainable params: 1,081,858
Non-trainable params: 0
_________________________________________________________________
None


<tensorflow.python.keras.engine.sequential.Sequential at 0x7f3e40471790>

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X_lstm,y_nn, test_size = 0.10, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(4500, 25) (4500, 10)
(500, 25) (500, 10)


In [26]:
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import KFold

cv = KFold(n_splits=10, random_state=1, shuffle=True)
fold = 0
accuracies = []
for train_index, test_index in cv.split(X):
    fold += 1
    X_train, X_test = X_lstm[train_index], X_lstm[test_index]
    y_train, y_test = y_nn[train_index], y_nn[test_index]
    
    model = get_lstm_model()
    
    history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])
    predictions = model.predict(X_test)
    
    fine_pred = [np.argmax(p) for p in predictions]
    fine_gt = [np.argmax(p) for p in y_test]

    PRC_matrics(fine_pred, fine_gt)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Precision: 82.075
Recall: 72.500
F-Measure: 76.991
Accuracy score: 81.800
Confustion matrix: 
[[69  0  3  4  0  7  1]
 [ 1 18  6  0  0 10  1]
 [ 1  1 43  1  0  0  0]
 [ 5  1  3 92  4 12  1]
 [ 3  1  1  3 90  5  0]
 [ 2  4  0  4  1 84  2]
 [ 0  0  0  2  1  0 13]]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Precision: 84.112
Recall: 85.714
F-Measure: 84.906
Accuracy score: 83.200
Confustion matrix: 
[[ 75   2   3   3   2   5   0   0]
 [  0  15   0   0   0   0   0   0]
 [  1   0  34   1   0   1   0   0]
 [  5   0   1 108  14  10   0   4]
 [  0   0   0   6  74   2   0   0]
 [  1   8   1   6   4  91   1   2]
 [  0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   1   0  19]]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Precision: 77.273
Recall: 77.273
F-Measure: 77.273
Accuracy score: 79.600
Confustion matrix: 
[[56  0  0  5  

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Precision: 83.562
Recall: 72.619
F-Measure: 77.707
Accuracy score: 83.400
Confustion matrix: 
[[ 51   1   0   9   1   5   1]
 [  2  10   0   0   1   3   0]
 [  2   0  48   2   2   4   0]
 [  3   0   0 115   5  10   0]
 [  0   0   0   8  99   1   1]
 [  1   3   2  14   2  78   0]
 [  0   0   0   0   0   0  16]]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Precision: 73.451
Recall: 87.368
F-Measure: 79.808
Accuracy score: 80.600
Confustion matrix: 
[[ 0  0  0  0  0  0  0  0]
 [ 0 72  0  0  5  1  2  0]
 [ 1  0 11  0  0  0  3  0]
 [ 0  0  6 41  3  0  5  0]
 [ 0  7  3  1 88  6 15  3]
 [ 0  2  0  1  6 88  2  1]
 [ 0  8  4  0  8  3 93  1]
 [ 0  0  0  0  0  0  0 10]]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Precision: 87.342
Recall: 75.000
F-Measure: 80.702
Accuracy score: 80.400
Confustion matrix: 
[[57  0  0  7  1  2  5  0]
 [ 0 12  1  0 

In [72]:
fine_pred = [np.argmax(p) for p in predictions]
fine_gt = [np.argmax(p) for p in y_test]
PRC_matrics(fine_pred, fine_gt)

Precision: 32.323
Recall: 34.409
F-Measure: 3333.333
Accuracy score: 25.000
Confustion matrix: 
[[28  4  6 10  8 12  2]
 [ 1  4  7  5  3  2  1]
 [ 5  2 12 14  9  9  1]
 [15  5 12 34 28 27  2]
 [13  3  9 17 26 25  1]
 [13  4 14 31 20 35  9]
 [ 1  1  0  1  4  4  1]]


# Part 4. BONUS - experiment with a BERT-based classifier

In [27]:
import tensorflow as tf

from tensorflow.keras import layers
import bert

In [28]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(dataset['a'])

In [29]:
# Create a bert tockenizer
import tensorflow_hub as hub
import os
os.environ['TFHUB_DOWNLOAD_PROGRESS'] = "1"

BertTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

Downloaded https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1, Total size: 423.26MB



In [30]:
def tokenize_reviews(text_reviews):
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text_reviews))

In [31]:
tokenized_questions = [tokenize_reviews(qu) for qu in dataset['questions'].values]

In [32]:
reviews_with_len = [[question, y[i], len(question)] for i, question in enumerate(tokenized_questions)]

In [33]:
# reviews_with_len[0]

In [34]:
import random
random.shuffle(reviews_with_len)
sorted_reviews_labels = [(review_lab[0], review_lab[1]) for review_lab in reviews_with_len]


In [35]:
import tensorflow as tf

processed_dataset = tf.data.Dataset.from_generator(lambda: sorted_reviews_labels, output_types=(tf.int32, tf.int32))

In [36]:
BATCH_SIZE = 32
batched_dataset = processed_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))

In [37]:
# next(iter(batched_dataset))

In [38]:
class TEXT_MODEL(tf.keras.Model):
    
    def __init__(self,
                 vocabulary_size,
                 embedding_dimensions=128,
                 cnn_filters=50,
                 dnn_units=512,
                 model_output_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="text_model"):
        super(TEXT_MODEL, self).__init__(name=name)
        
        self.embedding = layers.Embedding(vocabulary_size,
                                          embedding_dimensions)
        self.cnn_layer1 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=2,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer2 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=3,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer3 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=4,
                                        padding="valid",
                                        activation="relu")
        self.pool = layers.GlobalMaxPool1D()
        
        self.dense_1 = layers.Dense(units=dnn_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if model_output_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=model_output_classes,
                                           activation="softmax")
    
    def call(self, inputs, training):
        l = self.embedding(inputs)
        l_1 = self.cnn_layer1(l) 
        l_1 = self.pool(l_1) 
        l_2 = self.cnn_layer2(l) 
        l_2 = self.pool(l_2)
        l_3 = self.cnn_layer3(l)
        l_3 = self.pool(l_3) 
        
        concatenated = tf.concat([l_1, l_2, l_3], axis=-1) # (batch_size, 3 * cnn_filters)
        concatenated = self.dense_1(concatenated)
        concatenated = self.dropout(concatenated, training)
        model_output = self.last_dense(concatenated)
        
        return model_output

In [39]:
VOCAB_LENGTH = len(tokenizer.vocab)
EMB_DIM = 200
CNN_FILTERS = 100
DNN_UNITS = 256
OUTPUT_CLASSES = 10

DROPOUT_RATE = 0.2

NB_EPOCHS = 10

In [40]:
# text_model = TEXT_MODEL(vocabulary_size=VOCAB_LENGTH,
#                         embedding_dimensions=EMB_DIM,
#                         cnn_filters=CNN_FILTERS,
#                         dnn_units=DNN_UNITS,
#                         model_output_classes=OUTPUT_CLASSES,
#                         dropout_rate=DROPOUT_RATE)

def create_and_compile_bert():
    text_model = TEXT_MODEL(vocabulary_size=VOCAB_LENGTH,
                            embedding_dimensions=EMB_DIM,
                            cnn_filters=CNN_FILTERS,
                            dnn_units=DNN_UNITS,
                            model_output_classes=OUTPUT_CLASSES,
                            dropout_rate=DROPOUT_RATE)
    text_model.compile(loss="sparse_categorical_crossentropy",
                       optimizer="adam",
                       metrics=["sparse_categorical_accuracy"])
    
    return text_model


In [41]:
create_and_compile_bert()

<__main__.TEXT_MODEL at 0x7f3d1e794910>

In [42]:
type(batched_dataset)

tensorflow.python.data.ops.dataset_ops.PaddedBatchDataset

In [43]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

import math
TOTAL_BATCHES = math.ceil(len(sorted_reviews_labels) / BATCH_SIZE)
TEST_BATCHES = TOTAL_BATCHES // 10
batched_dataset.shuffle(TOTAL_BATCHES)
    
cv = KFold(n_splits=10, random_state=1, shuffle=True)
fold = 0
accuracies = []
for i in range(10):
    fold += 1
    
    text_model = create_and_compile_bert()
    
    # shuffel and take 10 batches
    batched_dataset.shuffle(TOTAL_BATCHES)
    test_data = batched_dataset.take(TEST_BATCHES)
    train_data = batched_dataset.skip(TEST_BATCHES)
    

    text_model.fit(train_data, epochs=NB_EPOCHS)
    results = text_model.evaluate(test_data)
    print(results)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[0.9278545379638672, 0.8166666626930237]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[0.852890133857727, 0.8208333253860474]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[0.9204831719398499, 0.8104166388511658]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[0.9666312336921692, 0.8145833611488342]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[0.8528225421905518, 0.8145833611488342]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[0.896523654460907, 0.8041666746139526]
Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[0.9065108895301819, 0.8125]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[0.9411752223968506, 0.8166666626930237]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[0.8994573950767517, 0.8145833611488342]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[0.8370484709739685, 0.8187500238418579]


In [44]:
np.array(test_data)[0]

IndexError: too many indices for array

In [45]:
prediction = text_model.predict(test_data)
print(prediction)

[[1.3041423e-03 4.4251319e-06 4.9158145e-05 ... 1.1670458e-01
  2.1933261e-05 3.3491149e-05]
 [1.8490636e-04 6.0688013e-05 3.4000166e-02 ... 6.0006059e-06
  1.0600809e-07 1.7102753e-08]
 [1.0674372e-03 1.1785355e-04 3.4212761e-03 ... 2.1440899e-03
  4.7785823e-05 8.0390555e-06]
 ...
 [4.4226599e-07 5.0554455e-08 1.8028316e-09 ... 1.2207782e-06
  1.9355628e-08 2.3310758e-09]
 [9.5180400e-07 9.9906546e-01 6.4753972e-06 ... 9.4981534e-09
  6.7950418e-10 1.4547613e-10]
 [4.3045085e-07 7.5500552e-08 2.4676950e-07 ... 9.9999833e-01
  4.7554341e-08 3.7406920e-07]]


In [46]:
bert_pred = [np.argmax(p) for p in predictions]
bert_gt = [np.argmax(p) for p in y_test]
PRC_matrics(bert_pred, bert_gt)



Precision: 76.768
Recall: 89.412
F-Measure: 82.609
Accuracy score: 81.400
Confustion matrix: 
[[65  0  1  3  1  1  0]
 [ 0 11  1  1  0  1  0]
 [ 1  1 52  2  0  0  0]
 [ 2  1  1 84  4  9  1]
 [ 0  0  0 10 82  4  0]
 [ 8 10  5 12 11 99  2]
 [ 0  0  0  0  0  0 14]]


(76.76767676767676, 89.41176470588236, 82.60869565217392, 81.39999999999999)