In [None]:
import pandas as pd
import re

# Simple text preprocessing function
def preprocess_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'[^a-z0-9\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

# Apply preprocessing to the 'query' column
df['processed_query'] = df['query'].astype(str).apply(preprocess_text)

# Print 5 samples
print(df[['query', 'processed_query']].sample(5))


                                      query  \
4525  Give me the latest news on AMD stock.   
1329             Show me the RSI for Tesla.   
2466                Is NVDA a good buy now?   
2058  Display the moving average for GOOGL.   
5514  Give me the latest news on AMD stock.   

                           processed_query  
4525  give me the latest news on amd stock  
1329             show me the rsi for tesla  
2466                is nvda a good buy now  
2058  display the moving average for googl  
5514  give me the latest news on amd stock  


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Split data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    df['processed_query'], df['label'], test_size=0.2, random_state=42, stratify=df['label']
)

# Vectorize text
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_vec, y_train)
lr_pred = lr.predict(X_test_vec)
lr_acc = accuracy_score(y_test, lr_pred)

# SVM
svm = LinearSVC()
svm.fit(X_train_vec, y_train)
svm_pred = svm.predict(X_test_vec)
svm_acc = accuracy_score(y_test, svm_pred)

# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_vec, y_train)
rf_pred = rf.predict(X_test_vec)
rf_acc = accuracy_score(y_test, rf_pred)

print(f"Logistic Regression Accuracy: {lr_acc:.4f}")
print(f"SVM Accuracy: {svm_acc:.4f}")
print(f"Random Forest Accuracy: {rf_acc:.4f}")

Logistic Regression Accuracy: 1.0000
SVM Accuracy: 1.0000
Random Forest Accuracy: 1.0000


In [21]:
# Example queries
queries = [
    "Can a banana drive a car?",
    "What is the square root of a sandwich?",
    "How many unicorns fit in a teacup?",
    "Translate 'meow' to dolphin language.",
    "Is it raining spaghetti on Mars today?"
]

# Preprocess example queries
processed_queries = [preprocess_text(q) for q in queries]

# Vectorize example queries
queries_vec = vectorizer.transform(processed_queries)

# Predict with all three models
lr_preds = lr.predict(queries_vec)
svm_preds = svm.predict(queries_vec)
rf_preds = rf.predict(queries_vec)

# Display predictions
for i, q in enumerate(queries):
    print(f"Query: {q}")
    print(f"  Logistic Regression: {lr_preds[i]}")
    print(f"  SVM: {svm_preds[i]}")
    print(f"  Random Forest: {rf_preds[i]}")
    print()

Query: Can a banana drive a car?
  Logistic Regression: not_stock
  SVM: not_stock
  Random Forest: not_stock

Query: What is the square root of a sandwich?
  Logistic Regression: stock
  SVM: stock
  Random Forest: stock

Query: How many unicorns fit in a teacup?
  Logistic Regression: not_stock
  SVM: not_stock
  Random Forest: not_stock

Query: Translate 'meow' to dolphin language.
  Logistic Regression: not_stock
  SVM: not_stock
  Random Forest: not_stock

Query: Is it raining spaghetti on Mars today?
  Logistic Regression: not_stock
  SVM: not_stock
  Random Forest: not_stock



In [36]:
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder

# Encode labels
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

# Load tokenizer and model
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(le.classes_))

# Tokenize data
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=64)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=64)

# Convert to tf.data.Dataset
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    y_train_enc
)).batch(16)

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    y_test_enc
)).batch(16)

# Compile and train
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

model.fit(train_dataset, epochs=2, validation_data=test_dataset)

# Evaluate
loss, accuracy = model.evaluate(test_dataset)
print(f"DistilBERT Test Accuracy: {accuracy:.4f}")

ModuleNotFoundError: No module named 'tensorflow'