In [1]:
import pandas as pd
import json
import re
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report,accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier

In [2]:
# Load the dataset file and pull out its three predefined splits.
# The encoding is given explicitly: JSON is UTF-8, and relying on the platform
# default (e.g. cp1252 on Windows) can mis-decode or fail on non-ASCII text.
with open('data_full.json', encoding='utf-8') as json_file:
    data_dict = json.load(json_file)

train_data = data_dict['train']
val_data = data_dict['val']
test_data = data_dict['test']

In [3]:
# Build one two-column frame ('query', 'intent') per split, in the same
# train / val / test order in which the splits were read.
train_df, val_df, test_df = (
    pd.DataFrame(split_rows, columns=['query', 'intent'])
    for split_rows in (train_data, val_data, test_data)
)

In [4]:
# Stack the three splits into a single frame (train, then val, then test).
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. As with the chained append, each frame keeps its own
# row labels, so the combined index contains repeated labels.
df = pd.concat([train_df, val_df, test_df])
df.shape

(22500, 2)

In [5]:
# Pull the two columns out as plain, position-aligned Python lists:
# sents[i] is the query text and intents[i] is its label.
sents, intents = (list(df[column]) for column in ("query", "intent"))

In [6]:
# Normalise every query into a list of lemmatised, lower-cased tokens:
#   1. replace every character that is not a letter, digit or space with a space,
#   2. tokenise with NLTK's word tokenizer,
#   3. lower-case each token and run it through the WordNet lemmatizer.
lemmatizer = WordNetLemmatizer()
cleaned_sents = [
    [
        lemmatizer.lemmatize(token.lower())
        for token in nltk.tokenize.word_tokenize(
            re.sub(r'[^ a-z A-Z 0-9]', " ", raw_query)
        )
    ]
    for raw_query in sents
]

In [7]:
# Spot-check the first five token lists produced by the cleaning step.
preview = cleaned_sents[:5]
print(preview)

[['what', 'expression', 'would', 'i', 'use', 'to', 'say', 'i', 'love', 'you', 'if', 'i', 'were', 'an', 'italian'], ['can', 'you', 'tell', 'me', 'how', 'to', 'say', 'i', 'do', 'not', 'speak', 'much', 'spanish', 'in', 'spanish'], ['what', 'is', 'the', 'equivalent', 'of', 'life', 'is', 'good', 'in', 'french'], ['tell', 'me', 'how', 'to', 'say', 'it', 'is', 'a', 'beautiful', 'morning', 'in', 'italian'], ['if', 'i', 'were', 'mongolian', 'how', 'would', 'i', 'say', 'that', 'i', 'am', 'a', 'tourist']]


In [30]:
def _identity_tokenizer(tokens):
    """Return an already-tokenised query unchanged.

    The queries reach the vectorizer as lists of tokens, so no further
    tokenisation is wanted. A named function is used instead of a lambda so
    the fitted vectorizer can be pickled.
    """
    return tokens


# TF-IDF over unigrams and bigrams, capped at 15000 features.
# lowercase=False because the tokens were lower-cased during cleaning and the
# inputs are lists, not strings.
# token_pattern=None because the pattern is ignored when a tokenizer is
# supplied; leaving it at its default makes scikit-learn emit a UserWarning.
tfidf = TfidfVectorizer(
    max_features=15000,
    lowercase=False,
    tokenizer=_identity_tokenizer,
    token_pattern=None,
    ngram_range=(1, 2),
)

# NOTE(review): .toarray() materialises a dense (n_queries x n_features) matrix,
# which is memory-hungry. It is kept so the variable's type stays the same for
# later cells; the sparse result of fit_transform could likely be used directly.
# NOTE(review): the vectorizer is fitted on all rows before the train/test split,
# so IDF statistics include the held-out rows — consider fitting on the training
# portion only.
cleaned_sents_tfidf = tfidf.fit_transform(cleaned_sents).toarray()

In [31]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    cleaned_sents_tfidf,
    intents,
    test_size=0.30,
    random_state=42,
)

In [32]:
# Multi-layer perceptron intent classifier.
# random_state is fixed so weight initialisation and batch shuffling are
# reproducible; without it the reported metrics change on every run.
MLP = MLPClassifier(random_state=42)
# Previously a second estimator was created here and never fitted. The name is
# kept as an alias of the single model so any reference to it still resolves.
mlp_classifier = MLP

# fit() returns the estimator itself, so trained_classifier is MLP.
trained_classifier = MLP.fit(X_train, y_train)
y_pred = MLP.predict(X_test)

print('classification report of the multi layer perceptron model : \n',classification_report(y_test, y_pred))
print('Accuracy of the multi layer perceptron model : ',accuracy_score(y_test, y_pred))

classification report of the multi layer perceptron model : 
                            precision    recall  f1-score   support

      accept_reservations       0.97      0.94      0.95        33
          account_blocked       0.90      0.90      0.90        49
                    alarm       0.95      1.00      0.98        41
       application_status       0.97      0.95      0.96        40
                      apr       0.95      1.00      0.98        40
            are_you_a_bot       0.98      0.94      0.96        54
                  balance       0.94      0.96      0.95        52
             bill_balance       0.88      0.83      0.86        36
                 bill_due       0.92      0.92      0.92        52
              book_flight       1.00      1.00      1.00        48
               book_hotel       0.95      0.95      0.95        40
               calculator       0.93      0.95      0.94        44
                 calendar       0.85      0.80      0.83        41

# Bonus: Multinomial Naive Bayes baseline

In [33]:
# Multinomial Naive Bayes baseline, trained and evaluated on the same split.
NB = MultinomialNB()
# Previously a second estimator was created here and never fitted. The name is
# kept as an alias of the single model so any reference to it still resolves.
nb_classifier = NB

# NOTE: trained_classifier and y_pred are rebound here, replacing the values
# left by the MLP cell; re-run that cell if its predictions are needed again.
trained_classifier = NB.fit(X_train, y_train)
y_pred = NB.predict(X_test)

print('classification report of the naive bayes model : \n',classification_report(y_test, y_pred))
print('Accuracy of the naive bayes model  : ',accuracy_score(y_test, y_pred))

classification report of the naive bayes model : 
                            precision    recall  f1-score   support

      accept_reservations       0.97      0.94      0.95        33
          account_blocked       0.88      0.92      0.90        49
                    alarm       0.98      1.00      0.99        41
       application_status       0.88      0.93      0.90        40
                      apr       0.87      1.00      0.93        40
            are_you_a_bot       0.91      0.96      0.94        54
                  balance       0.98      0.88      0.93        52
             bill_balance       0.68      0.89      0.77        36
                 bill_due       1.00      0.81      0.89        52
              book_flight       1.00      1.00      1.00        48
               book_hotel       0.90      0.93      0.91        40
               calculator       0.94      0.75      0.84        44
                 calendar       0.80      0.78      0.79        41
          