# Fetch bill data from HuggingFace repo

In [None]:
!pip3 install datasets

In [None]:
!pip3 install tqdm

In [1]:
import pandas as pd
from datasets import load_dataset
train_dataset = load_dataset("AmazonScience/massive", "en-US", split='train')
test_dataset = load_dataset("AmazonScience/massive", "en-US", split='test')
train = pd.DataFrame.from_dict(train_dataset)
test = pd.DataFrame.from_dict(test_dataset)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(train_dataset)

Dataset({
    features: ['id', 'locale', 'partition', 'scenario', 'intent', 'utt', 'annot_utt', 'worker_id', 'slot_method', 'judgments'],
    num_rows: 11514
})


In [3]:
train.head(3)

Unnamed: 0,id,locale,partition,scenario,intent,utt,annot_utt,worker_id,slot_method,judgments
0,1,en-US,train,16,48,wake me up at nine am on friday,wake me up at [time : nine am] on [date : friday],1,"{'slot': [], 'method': []}","{'worker_id': [], 'intent_score': [], 'slots_s..."
1,2,en-US,train,16,48,set an alarm for two hours from now,set an alarm for [time : two hours from now],1,"{'slot': [], 'method': []}","{'worker_id': [], 'intent_score': [], 'slots_s..."
2,4,en-US,train,10,46,olly quiet,olly quiet,1,"{'slot': [], 'method': []}","{'worker_id': [], 'intent_score': [], 'slots_s..."


# Explore the data

## Length of dataset

In [15]:
train_len = len(train)
test_len = len(test)
print(f"The train set is {train_len} rows long, the test dataset is {test_len} long. \
The test dataset represents {(test_len/train_len)*100}% of the train dataset.")

The train set is 11514 rows long, the test dataset is 2974 long. The test dataset represents 25.829425047767934% of the train dataset.


In [16]:
classes = set(train["intent"])
print(f"The train dataset contains {len(classes)} classes.")

The train dataset contains 60 classes.


## What's the distribution of utteraces per intent 

### In the training dataset

In [9]:
utt_per_intent = train.groupby('intent')['id'].nunique().reset_index()
utt_per_intent.columns = ['intent', 'unique_utt']
utt_per_intent.sort_values(by=["unique_utt"])
total_utts = sum(utt_per_intent["unique_utt"])
utt_per_intent["percentage"] = (utt_per_intent["unique_utt"]/total_utts) * 100
utt_per_intent.sort_values(by=["percentage"], ascending=False)

Unnamed: 0,intent,unique_utt,percentage
50,50,810,7.034914
45,45,639,5.549766
13,13,573,4.97655
32,32,566,4.915755
12,12,555,4.820219
49,49,544,4.724683
22,22,503,4.368595
44,44,418,3.630363
33,33,354,3.074518
0,0,350,3.039778


### In the test dataset

In [10]:
utt_per_intent_test = test.groupby('intent')['id'].nunique().reset_index()
utt_per_intent_test.columns = ['intent', 'unique_utt']
utt_per_intent_test.sort_values(by=["unique_utt"])
total_utts_test = sum(utt_per_intent["unique_utt"])
utt_per_intent_test["percentage"] = (utt_per_intent["unique_utt"]/total_utts_test) * 100
utt_per_intent_test.sort_values(by=["percentage"], ascending=False)

Unnamed: 0,intent,unique_utt,percentage
50,51,35,7.034914
45,46,32,5.549766
13,13,156,4.97655
32,32,126,4.915755
12,12,169,4.820219
49,50,209,4.724683
22,22,124,4.368595
44,45,176,3.630363
33,33,114,3.074518
0,0,88,3.039778


## Length of a utterance

In [23]:
from tqdm import tqdm
utt_length_words = []
utt_length_chars = []

for text in tqdm(train["utt"]):
    try:
        utt_length_words.append(len(text.split(" ")))
        utt_length_chars.append(len(text))
    except AttributeError:
        utt_length_words.append(0)
        utt_length_chars.append(0)

train["utt_length_words"] = utt_length_words
train["utt_length_chars"] = utt_length_chars
train.head(3)

100%|██████████████████████████████████████████████████████████████████████████████████| 11514/11514 [00:00<00:00, 858301.93it/s]


Unnamed: 0,id,locale,partition,scenario,intent,utt,annot_utt,worker_id,slot_method,judgments,utt_length_words,utt_length_chars
0,1,en-US,train,16,48,wake me up at nine am on friday,wake me up at [time : nine am] on [date : friday],1,"{'slot': [], 'method': []}","{'worker_id': [], 'intent_score': [], 'slots_s...",8,31
1,2,en-US,train,16,48,set an alarm for two hours from now,set an alarm for [time : two hours from now],1,"{'slot': [], 'method': []}","{'worker_id': [], 'intent_score': [], 'slots_s...",8,35
2,4,en-US,train,10,46,olly quiet,olly quiet,1,"{'slot': [], 'method': []}","{'worker_id': [], 'intent_score': [], 'slots_s...",2,10


In [24]:
utt_len_mean_words = train["utt_length_words"].mean()

In [25]:
utt_char_length_mean = train["utt_length_chars"].mean()

In [26]:
print(f"An utterance is on average {utt_len_mean_words} words long and {utt_char_length_mean} characters long")

An utterance is on average 6.924787215563661 words long and 35.03873545249262 characters long


In [29]:
train[train["utt_length_words"]>= 200]

Unnamed: 0,id,locale,partition,scenario,intent,utt,annot_utt,worker_id,slot_method,judgments,utt_length_words,utt_length_chars
