In [0]:
import pandas as pd
import numpy as np
import logging
from keras import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.metrics import classification_report

from dictionary import Dictionary
from text_util import Text_Util

# Feature-pipeline switch read by the later cells: when True, the data cell
# builds TF-IDF features and the model cell trains the dense classifier.
tfidf = True

# Notebook-wide logger; INFO level so the '###' progress messages are shown.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [33]:
# Load the DSL train and dev splits, keep one language group, and turn them into
# the TF-IDF feature matrices and one-hot targets used by the classifier cell.
# Variety codes seen in this notebook: es-AR es-ES es-PE fr-CA fr-FR
DSL_LABELS = ('fr-FR', 'fr-CA')  # e.g. ('es-AR', 'es-ES', 'es-PE') for the Spanish group
SPLIT_SEED = 42  # fixed so the train/validation split is identical on every run


def load_dsl_split(path, labels=DSL_LABELS):
    """Read a tab-separated DSL file and keep only the rows labelled with `labels`.

    Args:
        path: Path of a two-column, tab-separated file (sentence, label).
        labels: Collection of label codes to keep.

    Returns:
        A DataFrame with columns `sentence` and `c`, restricted to the requested
        labels. The row index of the file is preserved (not reset).
    """
    frame = pd.read_csv(path, sep='\t', names=['sentence', 'c'])
    return frame[frame.c.isin(labels)]


dsl_train = load_dsl_split('./DSL-TRAIN.txt')
dsl_x_train = dsl_train.sentence.values
dsl_test = load_dsl_split('./DSL-DEV.txt')
dsl_x_test = dsl_test.sentence.values

# Map label codes to integers; the encoder is fitted on the training labels only.
le = LabelEncoder()
dsl_y_train = le.fit_transform(dsl_train.c.values)
dsl_y_test = le.transform(dsl_test.c.values)
# *********************************************************************************************
logger.info('### Creating dictionary...')
sorted_labels = np.unique(dsl_y_train)
dic = Dictionary(sorted_labels)

logger.info('### Preprocessing text...')
text_util = Text_Util()
# The result is consumed below as one iterable of tokens per training sentence.
dsl_x_train = text_util.get_preprocessed_tokenized_sentences_dsl(dsl_x_train)

# NOTE(review): only the tfidf=True path is implemented. With tfidf=False the
# validation split (dsl_x_val / dsl_y_val) is never created, so the one-hot
# encoding at the bottom of this cell fails on a fresh kernel.
if tfidf:
    logger.info('### Updating dictionary...')
    for tokens, label in zip(dsl_x_train, dsl_y_train):
        dic.update_tokenized(tokens, label)

    # Vocabulary to keep: for every label, the union of the two word lists the
    # dictionary returns (presumably up to 1000 label-unique words and up to
    # 5000 top words per label -- confirm against Dictionary).
    selected_words = set()
    for label in sorted_labels:
        selected_words.update(dic.get_n_words_unique_to_label(label, 1000))
        selected_words.update(dic.get_n_top_words_given_label(label, 5000))

    print(f"Size of selected words set {len(selected_words)}")

    # Drop every token outside the selected vocabulary and rebuild plain-text
    # documents, since TfidfVectorizer expects strings.
    dsl_x_train = [
        " ".join(word for word in tokens if word in selected_words)
        for tokens in dsl_x_train
    ]
    # *********************************************************************************************
    vectorizer = TfidfVectorizer()
    dsl_x_train = vectorizer.fit_transform(dsl_x_train)
    # NOTE(review): the dev sentences are vectorized raw, without the
    # Text_Util preprocessing applied to the training sentences -- confirm
    # this asymmetry is intended.
    dsl_x_test = vectorizer.transform(dsl_x_test)
    dsl_x_train, dsl_x_val, dsl_y_train, dsl_y_val = train_test_split(
        dsl_x_train, dsl_y_train, test_size=.2, shuffle=True,
        stratify=dsl_y_train, random_state=SPLIT_SEED)
    logger.info(f'### dsl_x_train.shape {dsl_x_train.shape}')
    logger.info(f'### dsl_x_val.shape {dsl_x_val.shape}')

# One-hot encode the integer labels. Each label is wrapped in its own list so
# that MultiLabelBinarizer sees exactly one class per sample; passing the label
# as a bare string would split a multi-digit label such as "10" into the
# characters "1" and "0". Integer classes also keep column i aligned with
# encoded label i, which the argmax in the evaluation step relies on.
binarizer = MultiLabelBinarizer()
dsl_y_train = binarizer.fit_transform([[int(label)] for label in dsl_y_train])
dsl_y_val = binarizer.transform([[int(label)] for label in dsl_y_val])
# dsl_y_test deliberately stays integer-encoded: the evaluation step passes it
# to le.inverse_transform for the classification report.


INFO:__main__:### Creating dictionary...
INFO:__main__:### Preprocessing text...
INFO:__main__:### Updating dictionary...


Size of selected words set 7945


INFO:__main__:### dsl_x_train.shape (28137, 7920)
INFO:__main__:### dsl_x_train.shape (7035, 7920)


In [37]:
if tfidf:
    # Feed-forward classifier over the TF-IDF features: one hidden layer with
    # dropout, followed by a softmax over the language varieties.
    # The output width follows the one-hot targets instead of a hard-coded 2,
    # so switching the selected language group needs no change here.
    n_classes = dsl_y_train.shape[1]

    classifier = Sequential()
    # Hidden layer; input width is the TF-IDF vocabulary size
    classifier.add(Dense(32, activation='relu', kernel_initializer='random_normal',
                         input_dim=dsl_x_train.shape[1]))
    classifier.add(Dropout(rate=0.1))
    # Output layer: one unit per class
    classifier.add(Dense(n_classes, activation='softmax', kernel_initializer='random_normal'))

    opt = Adam(lr=0.001, beta_1=0.9, beta_2=0.999)
    classifier.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
    # Train on the training split, reporting loss/accuracy on the validation split
    classifier.fit(dsl_x_train, dsl_y_train, batch_size=64, epochs=4,
                   validation_data=(dsl_x_val, dsl_y_val))

# Evaluate on the dev set: take the most probable class per sentence and map
# both predictions and ground truth back to their label codes for the report.
# NOTE(review): `classifier` is only defined when tfidf is True, so this part
# raises NameError on a fresh kernel with tfidf=False.
dsl_y_prob = classifier.predict(dsl_x_test)
dsl_y_pred = np.argmax(dsl_y_prob, axis=1)
print(classification_report(le.inverse_transform(dsl_y_test), le.inverse_transform(dsl_y_pred)))

Train on 28137 samples, validate on 7035 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
              precision    recall  f1-score   support

       fr-CA       0.90      0.86      0.88      2000
       fr-FR       0.87      0.90      0.88      1990

    accuracy                           0.88      3990
   macro avg       0.88      0.88      0.88      3990
weighted avg       0.88      0.88      0.88      3990

