In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

In [8]:
df = pd.read_csv("multilingual_seed_dataset.csv")
df.head(20)

Unnamed: 0,text,intent,language
0,FR: The fire alarm is ringing,fire_emergency,fr
1,My friend is bleeding,medical_help,en
2,HI: My brother is unconscious,medical_help,hi
3,ES: My child is missing,lost_person,es
4,BN: Someone threatened me,police_assistance,bn
5,BN: We need a doctor immediately,medical_help,bn
6,ES: He is missing since morning,lost_person,es
7,Nobody is responding,other_emergency,en
8,We need a doctor immediately,medical_help,en
9,BN: Someone is looking for their daughter,lost_person,bn


In [None]:
df["clean_text"] = df["text"].str.replace(r"^(HI|ES|BN|FR): ", "", regex=True)

In [10]:
df.head(20)

Unnamed: 0,text,intent,language,clean_text
0,FR: The fire alarm is ringing,fire_emergency,fr,The fire alarm is ringing
1,My friend is bleeding,medical_help,en,My friend is bleeding
2,HI: My brother is unconscious,medical_help,hi,My brother is unconscious
3,ES: My child is missing,lost_person,es,My child is missing
4,BN: Someone threatened me,police_assistance,bn,Someone threatened me
5,BN: We need a doctor immediately,medical_help,bn,We need a doctor immediately
6,ES: He is missing since morning,lost_person,es,He is missing since morning
7,Nobody is responding,other_emergency,en,Nobody is responding
8,We need a doctor immediately,medical_help,en,We need a doctor immediately
9,BN: Someone is looking for their daughter,lost_person,bn,Someone is looking for their daughter


In [11]:
X = df["clean_text"]
y = df["intent"]

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# TF-IDF vectorizer
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [14]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Linear SVM": LinearSVC(),
    "Multinomial NB": MultinomialNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}

In [15]:
best_model = None
best_accuracy = 0

for name, model in models.items():
    model.fit(X_train_vec, y_train)
    preds = model.predict(X_test_vec)
    acc = accuracy_score(y_test, preds)
    print(f"🔹 {name} Accuracy: {acc:.4f}")
    print(classification_report(y_test, preds))
    
    if acc > best_accuracy:
        best_accuracy = acc
        best_model = model
        best_model_name = name

🔹 Logistic Regression Accuracy: 1.0000
                   precision    recall  f1-score   support

   fire_emergency       1.00      1.00      1.00        11
      lost_person       1.00      1.00      1.00         9
     medical_help       1.00      1.00      1.00        11
 natural_disaster       1.00      1.00      1.00        10
  other_emergency       1.00      1.00      1.00        13
police_assistance       1.00      1.00      1.00         6

         accuracy                           1.00        60
        macro avg       1.00      1.00      1.00        60
     weighted avg       1.00      1.00      1.00        60

🔹 Linear SVM Accuracy: 1.0000
                   precision    recall  f1-score   support

   fire_emergency       1.00      1.00      1.00        11
      lost_person       1.00      1.00      1.00         9
     medical_help       1.00      1.00      1.00        11
 natural_disaster       1.00      1.00      1.00        10
  other_emergency       1.00      1.00    

In [17]:
joblib.dump(best_model, "intent_model.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")
print(f"\n Saved best model: {best_model_name} with {best_accuracy:.4f} accuracy")


 Saved best model: Logistic Regression with 1.0000 accuracy
