In [4]:
import pandas as pd
import re

# Read the labelled query dataset from disk
df = pd.read_csv('stock_vs_nonstock_queries.csv')

# Build the duplicate-row mask once and reuse it for both the count and the preview
duplicate_mask = df.duplicated()
duplicate_count = duplicate_mask.sum()
print("Total duplicate rows:", duplicate_count)
if duplicate_count > 0:
    print("Example duplicate rows:")
    print(df[duplicate_mask].head())
else:
    print("No duplicate rows found.")

# Drop the repeated rows, then re-count to confirm that none are left
df = df.drop_duplicates()
print("After removing duplicates:", df.duplicated().sum())
# Simple text preprocessing function
def preprocess_text(text):
    """Normalize a raw query into lowercase alphanumeric words.

    The text is lowercased, every character other than ``a-z``, ``0-9`` and
    whitespace is removed, runs of whitespace are collapsed to a single
    space, and leading/trailing whitespace is stripped.

    Args:
        text: The raw query. ``None`` and float NaN are treated as an empty
            query; any other non-string value is converted with ``str()``.

    Returns:
        The normalized string (possibly empty).
    """
    # Missing values become an empty query instead of raising AttributeError
    # on .lower() or turning into the literal words "none" / "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    # Punctuation is deleted rather than replaced by a space, so "what's" -> "whats"
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

# Apply preprocessing to the 'query' column
df['processed_query'] = df['query'].astype(str).apply(preprocess_text)

# Queries that differ only in case, punctuation or spacing collapse to the same
# processed text. Drop those repeats (per label) so that an identical model
# input cannot end up on both sides of a later train/test split.
df = df.drop_duplicates(subset=['processed_query', 'label'])

# Print up to 5 samples; seeded so the preview is the same on every run
print(df[['query', 'processed_query']].sample(min(5, len(df)), random_state=42))


Total duplicate rows: 99890
Example duplicate rows:
                                     query     label
4                 How do I cook spaghetti?  nonstock
12                How do I cook spaghetti?  nonstock
14          What is the capital of Canada?  nonstock
15          Can you recommend a good book?  nonstock
20  What's the best way to clean a carpet?  nonstock
After removing duplicates: 0
                                          query  \
207    When is a good time to invest in Amazon?   
5      How is Netflix performing in the market?   
754      When is a good time to invest in Meta?   
216     How is Google performing in the market?   
225  What is the forecast for Microsoft shares?   

                               processed_query  
207    when is a good time to invest in amazon  
5      how is netflix performing in the market  
754      when is a good time to invest in meta  
216     how is google performing in the market  
225  what is the forecast for microsoft shares  


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Split data (80% train, 20% test), stratified on the label column
X_train, X_test, y_train, y_test = train_test_split(
    df['processed_query'], df['label'], test_size=0.2, random_state=42, stratify=df['label']
)

# Vectorize text: the TF-IDF vocabulary is fitted on the training split only
# and then reused, unchanged, to transform the test split
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


def _fit_and_score(model):
    """Fit ``model`` on the training matrix and score it on the test matrix.

    Args:
        model: An unfitted estimator exposing ``fit`` and ``predict``.

    Returns:
        A ``(predictions, accuracy)`` tuple for the test split.
    """
    model.fit(X_train_vec, y_train)
    pred = model.predict(X_test_vec)
    return pred, accuracy_score(y_test, pred)


# Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr_pred, lr_acc = _fit_and_score(lr)

# SVM (seeded like the split and the forest so reruns give the same model)
svm = LinearSVC(random_state=42)
svm_pred, svm_acc = _fit_and_score(svm)

# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_pred, rf_acc = _fit_and_score(rf)

print(f"Logistic Regression Accuracy: {lr_acc:.4f}")
print(f"SVM Accuracy: {svm_acc:.4f}")
print(f"Random Forest Accuracy: {rf_acc:.4f}")

Logistic Regression Accuracy: 1.0000
SVM Accuracy: 1.0000
Random Forest Accuracy: 1.0000


In [7]:
# Example queries
queries = [
    "Can a banana drive a car?",
    "What is the square root of a sandwich?",
    "How many unicorns fit in a teacup?",
    "Translate 'meow' to dolphin language.",
    "Is it raining spaghetti on Mars today?"
]

# Clean each query with preprocess_text, then project it with the fitted vectorizer
processed_queries = list(map(preprocess_text, queries))
queries_vec = vectorizer.transform(processed_queries)

# One prediction array per model, aligned with `queries`
lr_preds = lr.predict(queries_vec)
svm_preds = svm.predict(queries_vec)
rf_preds = rf.predict(queries_vec)

# Show the three predictions side by side for every query
for q, lr_label, svm_label, rf_label in zip(queries, lr_preds, svm_preds, rf_preds):
    print(f"Query: {q}")
    print(f"  Logistic Regression: {lr_label}")
    print(f"  SVM: {svm_label}")
    print(f"  Random Forest: {rf_label}")
    print()

Query: Can a banana drive a car?
  Logistic Regression: not_stock
  SVM: not_stock
  Random Forest: not_stock

Query: What is the square root of a sandwich?
  Logistic Regression: stock
  SVM: stock
  Random Forest: stock

Query: How many unicorns fit in a teacup?
  Logistic Regression: not_stock
  SVM: not_stock
  Random Forest: not_stock

Query: Translate 'meow' to dolphin language.
  Logistic Regression: not_stock
  SVM: not_stock
  Random Forest: not_stock

Query: Is it raining spaghetti on Mars today?
  Logistic Regression: not_stock
  SVM: not_stock
  Random Forest: not_stock



In [10]:
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder

# Turn the string labels into integer class ids; the encoder is fitted on the
# training labels and reused for the test labels
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

# The tokenizer and the classifier are loaded from the same pretrained checkpoint
checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint)
model = TFDistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels=len(le.classes_))


def _tokenize(texts):
    """Tokenize ``texts`` with truncation and padding enabled, capped at 64 tokens."""
    return tokenizer(list(texts), truncation=True, padding=True, max_length=64)


def _batched_dataset(encodings, labels):
    """Pair tokenizer output with labels in a ``tf.data.Dataset`` of 16-example batches."""
    return tf.data.Dataset.from_tensor_slices((dict(encodings), labels)).batch(16)


# Tokenize both splits and wrap them as batched datasets
train_encodings = _tokenize(X_train)
test_encodings = _tokenize(X_test)
train_dataset = _batched_dataset(train_encodings, y_train_enc)
test_dataset = _batched_dataset(test_encodings, y_test_enc)

# The loss is configured with from_logits=True and integer (sparse) labels
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

# Fine-tune for two epochs; the test split is also passed as validation data
model.fit(train_dataset, epochs=2, validation_data=test_dataset)

# Final score on the test split
loss, accuracy = model.evaluate(test_dataset)
print(f"DistilBERT Test Accuracy: {accuracy:.4f}")




Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`





Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

Epoch 1/2


Epoch 2/2
DistilBERT Test Accuracy: 1.0000


In [18]:
# Chat loop to test DistilBERT model
def classify_query(text):
    """Return the label the fine-tuned DistilBERT model predicts for one raw query.

    Args:
        text: The query as typed by the user.

    Returns:
        The predicted label, decoded back to its original string form.
    """
    # Apply the same normalization as the training data before tokenizing
    processed = preprocess_text(text)
    encoding = tokenizer([processed], truncation=True, padding=True, max_length=64, return_tensors='tf')
    logits = model(encoding).logits
    # Highest-scoring class id for the single example in the batch
    pred_label_id = tf.argmax(logits, axis=1).numpy()[0]
    return le.inverse_transform([pred_label_id])[0]


while True:
    try:
        user_input = input("Enter your query (or type 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Interrupting the kernel or closing stdin ends the session without a traceback
        break
    if user_input.lower() == 'exit':
        break
    if not user_input:
        # Nothing was typed, so there is nothing to classify; prompt again
        continue
    print(f"Prediction: {classify_query(user_input)}")

Prediction: not_stock
