In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import ast
%matplotlib inline


In [19]:
df = pd.read_csv('data/atis_intents.csv')
df.head()


Unnamed: 0,intent,text
0,atis_flight,i want to fly from boston at 838 am and arriv...
1,atis_flight,what flights are available from pittsburgh to...
2,atis_flight_time,what is the arrival time in san francisco for...
3,atis_airfare,cheapest airfare from tacoma to orlando
4,atis_airfare,round trip fares from pittsburgh to philadelp...


In [20]:
train = df.sample(frac = 0.8)
test = df.drop(train.index)

In [21]:
labels = list(set(sum([s.split('#') for s in list(train.intent.drop_duplicates())], [])))
labels


['atis_flight_no',
 'atis_city',
 'atis_ground_fare',
 'atis_meal',
 'atis_flight',
 'atis_abbreviation',
 'atis_aircraft',
 'atis_flight_time',
 'atis_restriction',
 'atis_quantity',
 'atis_capacity',
 'atis_airport',
 'atis_airfare',
 'atis_distance',
 'atis_ground_service',
 'atis_airline']

In [22]:
tmp = pd.DataFrame({'intent': ['atis_aircraft'], 'text': ['abc']})


In [23]:
tmp.loc[0, 'intent']


'atis_aircraft'

In [24]:
for i in range(len(tmp)):
    d = {}
    intents = tmp.loc[i, 'intent']
    for label in labels:
        d[label] = (label in intents.split('#'))


In [25]:
d


{'atis_flight_no': False,
 'atis_city': False,
 'atis_ground_fare': False,
 'atis_meal': False,
 'atis_flight': False,
 'atis_abbreviation': False,
 'atis_aircraft': True,
 'atis_flight_time': False,
 'atis_restriction': False,
 'atis_quantity': False,
 'atis_capacity': False,
 'atis_airport': False,
 'atis_airfare': False,
 'atis_distance': False,
 'atis_ground_service': False,
 'atis_airline': False}

In [26]:
import spacy
import random
from spacy.training import Example
from spacy.pipeline.textcat_multilabel import DEFAULT_MULTI_TEXTCAT_MODEL


In [27]:
nlp = spacy.load("en_core_web_md")

config = {
   "threshold": 0.5,
   "model": DEFAULT_MULTI_TEXTCAT_MODEL
}

textcat = nlp.add_pipe("textcat_multilabel", config=config)


In [28]:
list(train.iterrows())[0]


(2698,
 intent                                          atis_flight
 text       okay what flights are there us air from orlan...
 Name: 2698, dtype: object)

In [40]:
list(train.iterrows())[0][0]

2698

In [47]:
train_examples = []

for index, row in train.iterrows():
    text = row["text"]
    d_labels = {}
    intents = train.loc[index, 'intent']
    for label in labels:
        d_labels[label] = (label in intents.split('#'))
    train_examples.append(Example.from_dict(nlp.make_doc(text), {"cats": d_labels}))


In [48]:
train_examples[0]


{'doc_annotation': {'cats': {'atis_flight_no': False, 'atis_city': False, 'atis_ground_fare': False, 'atis_meal': False, 'atis_flight': True, 'atis_abbreviation': False, 'atis_aircraft': False, 'atis_flight_time': False, 'atis_restriction': False, 'atis_quantity': False, 'atis_capacity': False, 'atis_airport': False, 'atis_airfare': False, 'atis_distance': False, 'atis_ground_service': False, 'atis_airline': False}, 'entities': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], 'spans': {}, 'links': {}}, 'token_annotation': {'ORTH': [' ', 'okay', 'what', 'flights', 'are', 'there', 'us', 'air', 'from', 'orlando', 'to', 'cleveland', 'leaving', 'in', 'the', 'afternoon'], 'SPACY': [False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False], 'TAG': ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], 'LEMMA': ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], 'POS': ['', '', '', '', '', '', '',

In [49]:
for label in labels:
    textcat.add_label(label)


In [50]:
textcat.initialize(lambda: train_examples, nlp=nlp)


In [51]:
epochs = 2
from tqdm.notebook import tqdm

with nlp.select_pipes(enable="textcat_multilabel"):

  optimizer = nlp.resume_training()

  for i in tqdm(range(epochs)):

    random.shuffle(train_examples)

    for example in tqdm(train_examples):

      nlp.update([example], sgd=optimizer)


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3982 [00:00<?, ?it/s]

  0%|          | 0/3982 [00:00<?, ?it/s]

In [52]:
train_examples = []
y_test = []
y_pred = []
for i, row in test.iterrows():
    test_text = row['text']
    y_test.append(row['intent'])
    pred_cats = nlp(test_text).cats
    pred_intent = '#'.join([item[0] for item in pred_cats.items() if item[1] > 0.7])
    y_pred.append(pred_intent)
    

In [53]:
from sklearn.metrics import accuracy_score
score = accuracy_score(y_test, y_pred)

In [54]:
score

0.9327309236947792