# LSAP Data Inspection & Retrieval
---
This notebook contains code for loading and inspecting the SNIPS dataset used in our NLP project, and for exporting a cleaned, combined copy of it (`data.csv`).

## Import Libraries & Configure Settings

In [8]:
import numpy as np
import pandas as pd

# Output-mode flag. As documented by the author: True combines all data into one
# file; False keeps separate "train.csv", "val.csv", "test.csv" files.
# NOTE(review): no cell visible in this notebook reads SPLIT — confirm where
# (or whether) it is consumed.
SPLIT = False

# Fractions of the data assigned to the train, validation, and test splits
# (0.6 + 0.2 + 0.2 = 1.0).
# NOTE(review): these are likewise not referenced by any cell visible here.
TRAIN_SPLIT = 0.6
VAL_SPLIT   = 0.2
TEST_SPLIT  = 0.2

In [9]:

# Read the three dataset splits from CSV files in the working directory.
train, val, test = (
    pd.read_csv( csv_path )
    for csv_path in ( "train.csv", "validation.csv", "test.csv" )
)

# Preview ten rows of the training split in shuffled order.
train.sample( frac = 1 ).head( 10 )

Unnamed: 0,label,text
9485,SearchCreativeWork,find the song The Fourth Man
2714,BookRestaurant,book a pub for pepperoni near their house and ...
11291,SearchScreeningEvent,Find the movie schedules for Consolidated Thea...
3035,BookRestaurant,I want to book a restaurant within walking dis...
4753,GetWeather,In Bon Secour National Wildlife Refuge at twel...
9082,RateBook,Rate the current book 5 of 6 points
8909,RateBook,I want to rate A Long Short War series a four ...
12804,SearchScreeningEvent,Find time for Hendthighelbedi at the nearest c...
4206,GetWeather,"Does the forecast show a storm in Maquoketa , ..."
4623,GetWeather,What will the weather be like in Port Clinton ...


In [2]:
#plot label distribution for train, val, and test on the same plot
import plotly.express as px

def get_label_count( df ):
    """Count the non-null entries of every column of `df` per "label" value.

    Returns a new frame with one row per distinct label: "label" is a regular
    column and every other column holds its non-null count for that label.
    """
    per_label = df.groupby( "label" ).count()
    # Move "label" out of the index so it can be used as a plot axis column.
    return per_label.reset_index()

# Count utterances per intent once for each split so the grouped frames are not
# recomputed separately for the x and y arguments.
train_counts = get_label_count( train )
val_counts   = get_label_count( val )
test_counts  = get_label_count( test )

#Add train, val, and test data to plot
fig = px.bar(
    data_frame = train_counts,
    x = "label",
    y = "text",
    title = "Intent Label Distribution"
)

# px.bar leaves its single, colorless trace unnamed and hidden from the legend.
# Name it explicitly (only the train trace exists at this point) so "Train"
# appears in the legend alongside the traces added below.
fig.update_traces( name = "Train", showlegend = True )

fig.add_bar(
    x = val_counts[ "label" ],
    y = val_counts[ "text" ],
    name = "Validation"
)

fig.add_bar(
    x = test_counts[ "label" ],
    y = test_counts[ "text" ],
    name = "Test"
)

#Update Axis & Legend
fig.update_layout( legend_title_text = "Dataset" )
fig.update_xaxes( title_text = "Intent" )
fig.update_yaxes( title_text = "Intent Frequency" )

fig.show()

## Preprocess SNIPS Data
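Split each CamelCase intent label into space-separated words followed by a period (e.g. `BookRestaurant` → `Book Restaurant.`), then combine the train, validation, and test sets and rename the columns to `intent` and `utterance`.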

In [6]:
# Maps each raw CamelCase label to its space-separated form.
label_mapping = {
    "AddToPlaylist":        "Add To Playlist",
    "BookRestaurant":       "Book Restaurant",
    "GetWeather":           "Get Weather",
    "PlayMusic":            "Play Music",
    "RateBook":             "Rate Book",
    "SearchCreativeWork":   "Search Creative Work",
    "SearchScreeningEvent": "Search Screening Event",
}

def clean_label( label ):
    """Return the space-separated form of `label` with a trailing period.

    Raises KeyError when `label` is not a key of `label_mapping`.
    """
    readable = label_mapping[ label ]
    return readable.strip() + "."

#Combine train, val, and test data, clean the labels, and
#rename text to utterance and label to intent.
# ignore_index = True gives the combined frame a fresh 0..N-1 index; without it
# each split keeps its own 0-based row labels from read_csv, so the same label
# would repeat across splits (and be written out that way on export).
combined_df = (
    pd.concat( [ train, val, test ], ignore_index = True )
    .assign( label = lambda frame: frame[ "label" ].apply( clean_label ) )
    .rename( columns = { "text": "utterance", "label": "intent" } )
)

# Preview ten rows of the combined frame in shuffled order.
combined_df.sample( frac = 1 ).head( 10 )

Unnamed: 0,intent,utterance
3437,Book Restaurant.,book a table this evening in Saint Vincent and...
6352,Play Music.,Please play good music by Will Oldham .
2071,Book Restaurant.,I need a table in East Timor for a party of ni...
7454,Play Music.,play 1962 music on Netflix
4965,Get Weather.,what is the forecast for here and now
8839,Rate Book.,rate Orion in the Dying Time four stars
11597,Search Screening Event.,Is The Happy Hooker Goes Hollywood at the movi...
371,Play Music.,Play a track by Yui on Vimeo
6653,Play Music.,Use Deezer service to play Opera
238,Add To Playlist.,Add inconfundible to the piano in the backgrou...


In [7]:
# Export the combined frame to "data.csv" in the working directory. to_csv
# writes the DataFrame index by default, so the file gets an extra unnamed
# first column in addition to the frame's own columns.
combined_df.to_csv( "data.csv" )