In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from indicnlp.tokenize import indic_tokenize

In [15]:
# Load your dataset 
data = pd.read_csv('odia.csv')

In [16]:
# Load or define a odia stop words list
odia_stopwords = ["ଅ","ଆମ","ଆସି","ଏ","ଅଉ","ଓ","ଯେ","କେ","କେବଳ","ଏଥିରୁ"]

In [17]:
# Preprocessing with odia stop words and tokenizer
def preprocess_odia_text(text):
    # Tokenize the text using the odia tokenizer
    tokens = indic_tokenize.trivial_tokenize(text)
    
    # Remove stop words
    tokens = [word for word in tokens if word not in odia_stopwords]
    
    # Join the tokens back into a text string
    processed_text = ' '.join(tokens)
    
    return processed_text

data['query'] = data['query'].apply(preprocess_odia_text)

In [18]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['query'], data['intent'], test_size=0.2, random_state=32)

In [19]:
# Feature extraction using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [20]:
# Choose a classifier
classifier = MultinomialNB()
# Train the classifier
classifier.fit(X_train_tfidf, y_train)

MultinomialNB()

In [21]:
# Make predictions on the testing data
y_pred = classifier.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)


print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.7222222222222222
Classification Report:
               precision    recall  f1-score   support

          No       0.60      0.50      0.55         6
         Yes       0.77      0.83      0.80        12

    accuracy                           0.72        18
   macro avg       0.68      0.67      0.67        18
weighted avg       0.71      0.72      0.72        18



In [31]:
#Inference Function with "ମୁଁ ବୁଝି ପାରୁନି" Threshold

def predict_intent(input_text, threshold=0.5):
    input_text = input_text
    input_text = tfidf_vectorizer.transform([input_text])
    probabilities = classifier.predict_proba(input_text)[0]
    
    max_prob = max(probabilities)
    if max_prob < threshold:
        return "ମୁଁ ବୁଝି ପାରୁନି"
        
    intent = classifier.classes_[np.argmax(probabilities)]
    return intent

In [32]:
# Testing the predict_intent function with odia text
input_text = "ଠିକ"
intent = predict_intent(input_text)
print(f"intent '{input_text}': {intent}")

intent 'ଠିକ': Yes


In [30]:
# Testing the predict_intent function with "ଖାଦ୍ୟ"
input_text = "ଆ"
intent = predict_intent(input_text)
print(f"Intent for '{input_text}': {intent}")

Intent for 'ଆ': ମୁଁ ବୁଝି ପାରୁନି


In [24]:
# Testing the predict_intent function with "ଅନୁଚିତ"
input_text = "ଅନୁଚିତ"
intent = predict_intent(input_text)
print(f"Intent for '{input_text}': {intent}")

Intent for 'ଅନୁଚିତ': No
