## Train a real ML model

In [44]:
import os
import re

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed

In [29]:
# Load the dataset.
# The CSV location can be overridden with the VOICE_SEARCH_CSV environment
# variable so the notebook also runs on machines without the F: drive layout;
# when the variable is unset the original local path is used unchanged.
DATA_PATH = os.environ.get(
    "VOICE_SEARCH_CSV",
    "F:/ML_DATASETS/NLP/Voice Search AI Conversational Queries 2025/voice_search_query_captures.csv",
)
df = pd.read_csv(DATA_PATH)


In [30]:
# Quick look at the first rows, then at the available column names
for preview in (df.head(), df.columns):
    print(preview)

   query_id                               user_id            timestamp  \
0         1  bdd640fb-0667-4ad1-9c80-317fa3b1799d  2025-04-17 19:27:32   
1         2  bc8960a9-23b8-41e9-b924-56de3eb13b90  2025-02-09 19:19:27   
2         3  8b9d2434-e465-4150-bd9c-66b3ad3c2d6d  2025-02-03 18:19:43   
3         4  07a0ca6e-0822-48f3-ac03-1199972a8469  2025-02-06 09:18:10   
4         5  9a1de644-815e-46d1-bb8f-aa1837f8a88b  2025-01-11 07:19:59   

     device_type                      query_text language       intent  \
0     smartphone   How tall is the Eiffel Tower?  Spanish  information   
1     smartphone           Track my Amazon order  English     shopping   
2  car assistant  Define artificial intelligence  Spanish  information   
3  car assistant           Set an alarm for 7 AM  Spanish      command   
4  smart speaker           Set an alarm for 7 AM  English      command   

      location  query_duration_sec  num_words is_successful  confidence_score  \
0  Los Angeles               

## 🎬 Step 2a: Clean and Tokenize the Text
I'm starting with the query_text column:

* Lowercase everything

* Remove punctuation

* Tokenize into words

In [31]:
# Basic text cleaning
def clean_text(text):
    """Lowercase a query and strip punctuation.

    Parameters
    ----------
    text : str or scalar
        A single raw query. Missing values (None / NaN / pd.NA), which
        pandas produces for empty CSV cells, are treated as an empty query
        instead of raising AttributeError; other non-string scalars are
        converted with str() first.

    Returns
    -------
    str
        The lowercased text with every character that is neither a word
        character nor whitespace removed.
    """
    if not isinstance(text, str):
        if text is None or pd.isna(text):
            return ""
        text = str(text)
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    return text

# Normalize every raw query and keep the result next to the original text
raw_queries = df['query_text']
df['clean_query'] = raw_queries.apply(clean_text)

In [32]:
# Compare the raw and the normalized queries side by side
preview_cols = ['query_text', 'clean_query']
print(df[preview_cols].head())

                       query_text                     clean_query
0   How tall is the Eiffel Tower?    how tall is the eiffel tower
1           Track my Amazon order           track my amazon order
2  Define artificial intelligence  define artificial intelligence
3           Set an alarm for 7 AM           set an alarm for 7 am
4           Set an alarm for 7 AM           set an alarm for 7 am


## 🎬 Step 2b: Encode the Text
Converting words into numbers using tokenization and padding, with the `Tokenizer` class from Keras.

In [33]:
# Build the word index from the cleaned queries. num_words=1000 caps the
# vocabulary used when encoding, and words outside it map to "<OOV>".
# NOTE(review): the tokenizer is fitted on the full frame, i.e. before the
# train/test split that happens later in the notebook.
query_texts = df['clean_query']
tokenizer = Tokenizer(oov_token="<OOV>", num_words=1000)
tokenizer.fit_on_texts(query_texts)

# Encode each query as a list of word ids, then pad at the end
# (padding='post') to a fixed length of 20
sequences = tokenizer.texts_to_sequences(query_texts)
padded_sequences = pad_sequences(sequences, maxlen=20, padding='post')

In [34]:
# Show the first five encoded, padded queries
first_rows = padded_sequences[:5]
print(first_rows)

[[ 7 54 14  3 55 56  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [57 11 58 10  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [81 82 83  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [45 46 47 48 49 50  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [45 46 47 48 49 50  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]


## 🎬 Step 2c: Encode the Labels (intent)
Turning the intent column into numerical targets:

In [35]:
# Learn the intent vocabulary and store each row's integer class id
label_encoder = LabelEncoder()
intent_ids = label_encoder.fit_transform(df['intent'])
df['intent_encoded'] = intent_ids

In [36]:
# One row per distinct (intent, encoded id) pair
intent_mapping = df[['intent', 'intent_encoded']].drop_duplicates()
print(intent_mapping)

          intent  intent_encoded
0    information               2
1       shopping               4
3        command               0
7     navigation               3
8  entertainment               1


## 🎬 Step 3: Train/Test Split
Now splitting the data so I can train and evaluate:

In [37]:
# Features are the padded id sequences; targets are the encoded intents
X, y = padded_sequences, df['intent_encoded'].values

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split repeatable.
# NOTE(review): the data preview shows repeated query strings (e.g. rows 3
# and 4), so identical queries can end up in both splits — consider a
# group-aware split if held-out accuracy should reflect unseen queries.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

## 🎬 Step 4a: Define the Model Architecture and Compile
I'll use TensorFlow/Keras to build a simple feedforward neural network. It’ll take my padded query sequences as input and predict the intent class.

In [39]:
# Parameters
vocab_size = 1000  # same as tokenizer num_words
embedding_dim = 16
max_length = 20    # same as pad_sequences maxlen
num_classes = len(df['intent'].unique())

# Seed the Python, NumPy and TensorFlow random generators before any layer is
# created, so weight initialisation and the shuffling done during training
# are the same on every "Restart & Run All".
set_random_seed(42)

# Model definition: word ids -> 16-d embeddings -> average over the sequence
# -> small hidden layer -> one softmax probability per intent class
model = Sequential([
    Embedding(vocab_size, embedding_dim),
    GlobalAveragePooling1D(),
    Dense(24, activation='relu'),
    Dense(num_classes, activation='softmax')
])

# The targets are integer class ids (not one-hot vectors), hence the
# "sparse" variant of categorical cross-entropy.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()


## 🎬 Step 4b: Train the Model
Now feeding in my training data:

This will:
* Train for 10 epochs

* Show accuracy and loss for both train and test sets

* Give you a history object for plotting later

In [40]:
# Train for 10 epochs, scoring the held-out split after each one;
# verbose=2 prints a single summary line per epoch.
fit_options = dict(epochs=10, validation_data=(X_test, y_test), verbose=2)
history = model.fit(X_train, y_train, **fit_options)

Epoch 1/10
39/39 - 1s - 31ms/step - accuracy: 0.2178 - loss: 1.6004 - val_accuracy: 0.2283 - val_loss: 1.5840
Epoch 2/10
39/39 - 0s - 5ms/step - accuracy: 0.3248 - loss: 1.5613 - val_accuracy: 0.3248 - val_loss: 1.5288
Epoch 3/10
39/39 - 0s - 3ms/step - accuracy: 0.5691 - loss: 1.4720 - val_accuracy: 0.7653 - val_loss: 1.3940
Epoch 4/10
39/39 - 0s - 3ms/step - accuracy: 0.7605 - loss: 1.3048 - val_accuracy: 0.8714 - val_loss: 1.1889
Epoch 5/10
39/39 - 0s - 4ms/step - accuracy: 0.8569 - loss: 1.0763 - val_accuracy: 0.9260 - val_loss: 0.9386
Epoch 6/10
39/39 - 0s - 4ms/step - accuracy: 0.9735 - loss: 0.8246 - val_accuracy: 1.0000 - val_loss: 0.6956
Epoch 7/10
39/39 - 0s - 4ms/step - accuracy: 0.9839 - loss: 0.5989 - val_accuracy: 1.0000 - val_loss: 0.4966
Epoch 8/10
39/39 - 0s - 4ms/step - accuracy: 1.0000 - loss: 0.4216 - val_accuracy: 1.0000 - val_loss: 0.3500
Epoch 9/10
39/39 - 0s - 4ms/step - accuracy: 1.0000 - loss: 0.2951 - val_accuracy: 1.0000 - val_loss: 0.2440
Epoch 10/10
39/39 

## 🎬 Step 5a: Prepare Unlabeled Queries

Let’s simulate a few new voice search inputs. You can either:

* Pull real examples from your dataset (without the intent)

* Or manually define a few test queries like:

In [41]:
# Hand-written example queries (plain strings, no intent labels attached)
# that are run through the trained model in the following steps.
new_queries = [
    "Play jazz music on Spotify",
    "What's the weather in Tokyo tomorrow?",
    "Turn off the living room lights",
    "Order me a pizza from Domino's",
    "How far is the moon from Earth?"
]

## 🎬 Step 5b: Preprocess Like Training
We need to clean and tokenize these new queries using the same pipeline:

In [42]:
# Clean the text with the clean_text() already defined in Step 2a.
# Re-declaring the function here would silently shadow that definition and
# let training and inference preprocessing drift apart.
cleaned_queries = [clean_text(q) for q in new_queries]

# Tokenize with the fitted tokenizer and pad to the same width as the
# training matrix, so both always share one sequence length.
new_sequences = tokenizer.texts_to_sequences(cleaned_queries)
new_padded = pad_sequences(new_sequences, padding='post',
                           maxlen=padded_sequences.shape[1])


## 🎬 Step 5c: Make Predictions (TEST on unlabeled data)
Now we feed the preprocessed queries into the trained model:

In [45]:
# Score the new queries, pick the most probable class in each row, and map
# the class ids back to their intent names
predictions = model.predict(new_padded)
predicted_ids = np.argmax(predictions, axis=1)
predicted_labels = list(label_encoder.inverse_transform(predicted_ids))

for query, label in zip(new_queries, predicted_labels):
    print(f"Query: '{query}' → Predicted Intent: '{label}'")


1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 33ms/step
Query: 'Play jazz music on Spotify' → Predicted Intent: 'entertainment'
Query: 'What's the weather in Tokyo tomorrow?' → Predicted Intent: 'information'
Query: 'Turn off the living room lights' → Predicted Intent: 'command'
Query: 'Order me a pizza from Domino's' → Predicted Intent: 'shopping'
Query: 'How far is the moon from Earth?' → Predicted Intent: 'information'
