In [1]:
import pandas as pd
import re
import pickle
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix
)

In [2]:
data = pd.read_csv("service_intents.csv")

# Drop missing values
data.dropna(inplace=True)

# Remove duplicates
data.drop_duplicates(inplace=True)

In [3]:
def clean_text(text):
    text = text.lower()                     # lowercase
    text = re.sub(r'[^a-z\s]', '', text)    # remove punctuation & numbers
    text = re.sub(r'\s+', ' ', text).strip()
    return text

data["clean_text"] = data["text"].apply(clean_text)

X = data["clean_text"]
y = data["category"]

In [4]:
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Second split: Validation + Test
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print(f"Train size: {len(X_train)}")
print(f"Validation size: {len(X_val)}")
print(f"Test size: {len(X_test)}")


Train size: 339
Validation size: 73
Test size: 73


In [5]:
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),     # unigrams + bigrams
    max_df=0.9,             # remove very common words
    min_df=2,               # remove very rare words
    sublinear_tf=True
)

X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)
X_test_vec = vectorizer.transform(X_test)

In [6]:
model = LogisticRegression(
    max_iter=2000,
    class_weight="balanced",  # handles class imbalance
    solver="lbfgs"
)

model.fit(X_train_vec, y_train)

In [7]:
y_val_pred = model.predict(X_val_vec)

print("\nðŸ”¹ VALIDATION RESULTS")
print("Accuracy:", accuracy_score(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred))


ðŸ”¹ VALIDATION RESULTS
Accuracy: 0.6301369863013698
                      precision    recall  f1-score   support

       ac_technician       1.00      0.80      0.89         5
    appliance_repair       0.60      0.60      0.60         5
 automobile_mechanic       1.00      0.67      0.80         3
           carpenter       0.50      0.20      0.29         5
            cleaning       1.00      0.83      0.91         6
     computer_repair       0.38      0.60      0.46         5
         electrician       0.33      0.40      0.36         5
      gas_technician       0.00      0.00      0.00         1
  general_contractor       0.50      1.00      0.67         6
             glazier       0.50      1.00      0.67         1
     home_automation       0.00      0.00      0.00         3
           locksmith       1.00      0.50      0.67         2
       mobile_repair       0.83      1.00      0.91         5
             painter       1.00      0.60      0.75         5
        pest_co

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
y_test_pred = model.predict(X_test_vec)

print("\nðŸ”¹ TEST RESULTS")
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))


ðŸ”¹ TEST RESULTS
Accuracy: 0.589041095890411
                      precision    recall  f1-score   support

       ac_technician       1.00      0.75      0.86         4
    appliance_repair       0.50      0.60      0.55         5
 automobile_mechanic       1.00      0.67      0.80         3
           carpenter       0.57      0.67      0.62         6
            cleaning       1.00      0.60      0.75         5
     computer_repair       0.43      0.60      0.50         5
         electrician       0.33      0.33      0.33         6
      gas_technician       0.67      1.00      0.80         2
  general_contractor       0.25      0.20      0.22         5
             glazier       1.00      0.50      0.67         2
     home_automation       0.50      0.33      0.40         3
           locksmith       1.00      0.50      0.67         2
       mobile_repair       1.00      1.00      1.00         6
             painter       1.00      0.60      0.75         5
        pest_control  

In [9]:
with open("model.pkl", "wb") as f:
    pickle.dump(model, f)

with open("vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

print("\nâœ… Model & vectorizer saved successfully")


âœ… Model & vectorizer saved successfully
