In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from indicnlp.tokenize import indic_tokenize

In [14]:
# Load your dataset 
data = pd.read_csv('hindi.csv')

In [75]:
# Load or define a Hindi stop words list
hindi_stopwords = ["यह", "वह", "की", "के", "है","मैं ","करता","हूं"]

In [76]:
# Preprocessing with Hindi stop words and tokenizer
def preprocess_hindi_text(text):
    # Tokenize the text using the Hindi tokenizer
    tokens = indic_tokenize.trivial_tokenize(text)
    
    # Remove stop words
    tokens = [word for word in tokens if word not in hindi_stopwords]
    
    # Join the tokens back into a text string
    processed_text = ' '.join(tokens)
    
    return processed_text

data['query'] = data['query'].apply(preprocess_hindi_text)

In [77]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['query'], data['intent'], test_size=0.2, random_state=42)

In [78]:
# Feature extraction using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [79]:
# Choose a classifier
classifier = MultinomialNB()
# Train the classifier
classifier.fit(X_train_tfidf, y_train)

MultinomialNB()

In [80]:
# Make predictions on the testing data
y_pred = classifier.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)


print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.5384615384615384
Classification Report:
               precision    recall  f1-score   support

          No       0.55      0.86      0.67         7
         Yes       0.50      0.17      0.25         6

    accuracy                           0.54        13
   macro avg       0.52      0.51      0.46        13
weighted avg       0.52      0.54      0.47        13



In [81]:
#Inference Function with "मुझे समझ नहीं आया" Threshold

def predict_intent(input_text, threshold=0.65):
    input_text = input_text
    input_text = tfidf_vectorizer.transform([input_text])
    probabilities = classifier.predict_proba(input_text)[0]
    
    max_prob = max(probabilities)
    if max_prob < threshold:
        return "मुझे समझ नहीं आया"
        
    intent = classifier.classes_[np.argmax(probabilities)]
    return intent

In [85]:
# Testing the predict_intent function with Hindi text
input_text = "आगे बढ़ो"
intent = predict_intent(input_text)
print(f"intent '{input_text}': {intent}")

intent 'आगे बढ़ो': Yes


In [83]:
# Testing the predict_intent function with "खाना"
input_text = "खाना"
intent = predict_intent(input_text)
print(f"Intent for '{input_text}': {intent}")

Intent for 'खाना': मुझे समझ नहीं आया


In [84]:
# Testing the predict_intent function with "गलत"
input_text = "गलत"
intent = predict_intent(input_text)
print(f"Intent for '{input_text}': {intent}")

Intent for 'गलत': No
