In [121]:
import nltk
import json
import gensim
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from matplotlib import pyplot as plt
from sklearn import preprocessing

def get_df(oos=True, domains=False, data_path='data_full.json', domains_path='domains.json'):
    """Load the intent dataset and return ``(train_df, test_df)``.

    The ``train`` and ``val`` splits are merged into one training frame; the
    ``test`` split is returned on its own. Both frames have a fresh, unique
    integer index.

    Parameters
    ----------
    oos : bool, default True
        When true, the ``oos_train`` / ``oos_val`` / ``oos_test`` splits are
        appended to the matching split so that out-of-scope queries are
        handled as one more intent. The ``oos_*`` keys are only read in this
        case.
    domains : bool, default False
        When true, a ``domain`` column is added to both frames by looking up
        each row's intent in the domain file.
    data_path : str, default 'data_full.json'
        JSON file holding the splits; each split is loaded into the two
        columns ``query`` and ``intent``.
    domains_path : str, default 'domains.json'
        JSON file mapping a domain name to the list of its intents. Only read
        when ``domains`` is true.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)`` with columns ``query``, ``intent`` and,
        when requested, ``domain``.

    Raises
    ------
    KeyError
        If a required split is missing from the data file, or an intent has
        no entry in the domain mapping.
    """
    with open(data_path) as json_file:
        data_dict = json.load(json_file)

    columns = ['query', 'intent']

    def _load(*split_names):
        # ignore_index=True: the splits each start at row label 0, so keeping
        # their labels would leave duplicates in the combined frame.
        frames = [pd.DataFrame(data_dict[name], columns=columns) for name in split_names]
        return pd.concat(frames, ignore_index=True)

    if oos:
        # Same row order as before: in-scope rows followed by the oos rows of
        # each split, with the validation rows after the training rows.
        train_df = _load('train', 'oos_train', 'val', 'oos_val')
        test_df = _load('test', 'oos_test')
    else:
        train_df = _load('train', 'val')
        test_df = _load('test')

    if domains:
        with open(domains_path) as json_file:
            domain_dict = json.load(json_file)

        # Invert {domain: [intent, ...]} into {intent: domain}.
        inv_domain_dict = {
            intent: domain
            for domain, intents in domain_dict.items()
            for intent in intents
        }
        if oos:
            inv_domain_dict['oos'] = 'oos'

        # Direct dict indexing keeps the KeyError for an unmapped intent.
        train_df['domain'] = [inv_domain_dict[intent] for intent in train_df['intent']]
        test_df['domain'] = [inv_domain_dict[intent] for intent in test_df['intent']]

    return train_df, test_df


# Build the merged train+val frame and the test frame: no out-of-scope rows, with a 'domain' column.
df_train, df_test = get_df(domains=True, oos=False)




In [122]:
# Preprocessing

def utils_preprocess_text(text, flg_stemm=True, flg_lemm=False, lst_stopwords=None):
    """Tokenise ``text`` on whitespace, optionally filter/normalise, and re-join.

    Parameters
    ----------
    text : str
        Raw input string.
    flg_stemm : bool, default True
        When truthy, every token is passed through NLTK's ``PorterStemmer``.
    flg_lemm : bool, default False
        When truthy, every token is passed through NLTK's
        ``WordNetLemmatizer``. If both flags are set, stemming runs first.
    lst_stopwords : iterable of str, optional
        Tokens to drop (exact match). ``None`` disables stopword removal.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    ## Tokenize (convert from string to list)
    tokens = text.split()

    ## Remove stopwords
    if lst_stopwords is not None:
        # Build the set once: constant-time lookups per token, and a one-shot
        # iterable (e.g. a generator) is not consumed by repeated `in` tests.
        stopword_set = set(lst_stopwords)
        tokens = [token for token in tokens if token not in stopword_set]

    ## Stemming
    if flg_stemm:
        stem = nltk.stem.porter.PorterStemmer().stem
        tokens = [stem(token) for token in tokens]

    ## Lemmatisation
    if flg_lemm:
        lemmatize = nltk.stem.wordnet.WordNetLemmatizer().lemmatize
        tokens = [lemmatize(token) for token in tokens]

    ## Back to string from list
    return " ".join(tokens)

# NLTK's English stopword list; can be handed to utils_preprocess_text via `lst_stopwords`.
lst_stopwords = stopwords.words("english")

In [123]:
# Add a lemmatised (not stemmed) copy of each query as the 'query_clean' column.
# NOTE(review): lst_stopwords is not passed here, so stopwords are kept — confirm this is intended.
df_train["query_clean"] = df_train["query"].apply(utils_preprocess_text, flg_stemm=False, flg_lemm=True)
df_test["query_clean"] = df_test["query"].apply(utils_preprocess_text, flg_stemm=False, flg_lemm=True)

In [124]:
def corpus_as_lst(corpus):
    """Split every string of ``corpus`` into its whitespace-separated unigrams.

    Returns a list with one entry per input string, each entry being the list
    of that string's tokens (an empty list for an empty string).
    """
    return [sentence.split() for sentence in corpus]

# Tokenised training queries: the list of token lists that Word2Vec is trained on
train_corpus = corpus_as_lst(df_train["query_clean"])


In [125]:
# Train the Word2Vec embeddings on the tokenised training queries
wc_model = gensim.models.word2vec.Word2Vec(
    train_corpus,
    vector_size=300,  # dimensionality of each word vector
    window=8,         # context window size
    min_count=1,      # keep every word, even those seen once
    sg=1,             # 1 selects skip-gram
    epochs=30,        # passes over the corpus
)


In [116]:
def text_to_mean_vector(embeddings, text):
    """Average the embedding vectors of the whitespace-separated tokens of ``text``.

    Each token is looked up with ``embeddings.get_vector``; tokens for which
    that call raises ``KeyError`` are skipped. Returns a list holding the
    per-component mean of the vectors found, or an empty list when no token
    could be looked up.
    """
    found = []
    for token in text.split():
        try:
            found.append(embeddings.get_vector(token))
        except KeyError:
            continue   # simply ignore out-of-vocabulary tokens

    if len(found) == 0:
        return []

    count = len(found)
    width = len(found[0])
    return [sum(vector[j] for vector in found) / count for j in range(width)]

def get_word_embdeddings(lst_corpus, model):
    """Build one mean word-vector per text in ``lst_corpus``.

    Each text is averaged with ``text_to_mean_vector`` over ``model.wv``. A
    text with no known token gets a zero vector of length
    ``model.wv.vector_size`` instead. The rows are returned stacked in a
    NumPy array, in input order.
    """
    keyed_vectors = model.wv
    rows = []
    for sentence in lst_corpus:
        averaged = text_to_mean_vector(keyed_vectors, sentence)
        if len(averaged) == 0:
            # No token of this text is in the vocabulary: fall back to zeros.
            averaged = np.zeros(keyed_vectors.vector_size,)
        rows.append(averaged)
    return np.array(rows)


In [117]:
# Turn every cleaned query into a single vector (mean of its word embeddings)
X_train = get_word_embdeddings(df_train["query_clean"], wc_model)
X_test = get_word_embdeddings(df_test["query_clean"], wc_model)

# Show the two feature-matrix shapes, one per line
print(X_train.shape, X_test.shape, sep="\n")


(18200, 300)
(5500, 300)


In [118]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

def train_model(model, X_train, df_train, df_test, domains=False, X_test=None):
    """Fit ``model`` on the training features and predict the test set.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. This is the object
        that is both fitted and used for prediction.
    X_train : array-like
        Training feature matrix passed to ``model.fit``.
    df_train, df_test : pandas.DataFrame
        Frames providing the label column (``'domain'`` when ``domains`` is
        true, otherwise ``'intent'``).
    domains : bool, default False
        Selects which label column is used.
    X_test : array-like, optional
        Test feature matrix passed to ``model.predict``. When omitted, the
        module-level variable ``X_test`` is used, which is what earlier
        versions of this function always read.

    Returns
    -------
    tuple
        ``(y_pred, y_test)``: the model's predictions and the true labels of
        ``df_test``.

    Raises
    ------
    NameError
        If ``X_test`` is not given and no module-level ``X_test`` exists.
    """
    # Getting labels
    label_column = 'domain' if domains else 'intent'
    y_train = df_train[label_column].values
    y_test = df_test[label_column].values

    if X_test is None:
        # Backward compatibility for callers that rely on the global.
        try:
            X_test = globals()['X_test']
        except KeyError:
            raise NameError(
                "name 'X_test' is not defined; pass X_test=... explicitly"
            ) from None

    model.fit(X_train, y_train)
    # Predict with the estimator that was just fitted, not with a global.
    y_pred = model.predict(X_test)

    return y_pred, y_test

# Fit a multinomial logistic regression on the embeddings, using the domain labels
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases — confirm against the installed version.
lr = LogisticRegression(max_iter=300, multi_class='multinomial')
y_pred, y_test = train_model(lr, X_train, df_train, df_test, domains=True)


In [119]:
# Size of the confusion matrix
print('Confusion matrix shape:', confusion_matrix(y_test, y_pred).shape)

# Accuracy, plus macro-averaged precision / recall / F1
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='macro')
    for scorer in (precision_score, recall_score, f1_score)
)

print('Accuracy', accuracy)
print('Precision', precision)
print('Recall', recall)
print('F1', f1)


Confusion matrix shape: (11, 11)
Accuracy 0.7796363636363637
Precision 0.796376966149679
Recall 0.8502626262626264
F1 0.789790700868814
