In [14]:
import pandas as pd

from src.data.preprocessing import clean_text
from src.data.vectorizer import get_vectorizer
from src.models.tfidf_logreg import get_model_v01
from src.models.tfidf_svm import get_model_v01 as get_svm_model_v01
from src.evaluation.metrics import evaluate, format_cm

In [15]:
# read data
DATA_PATH = "../data/raw/all_tickets_processed_improved_v3.csv"

df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Document,Topic_group
0,connection with icon icon dear please setup ic...,Hardware
1,work experience user work experience user hi w...,Access
2,requesting for meeting requesting meeting hi p...,Hardware
3,reset passwords for external accounts re expir...,Access
4,mail verification warning hi has got attached ...,Miscellaneous


In [16]:
# split data
from sklearn.model_selection import train_test_split

X = df["Document"]
y = df["Topic_group"]

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, random_state=2, 
                                                    shuffle=True, stratify=y)

In [17]:
# X transforms
X_train_clean = X_train.apply(clean_text)
X_test_clean = X_test.apply(clean_text)

# vectorize
vectorizer = get_vectorizer()
X_train_vec = vectorizer.fit_transform(X_train_clean)
X_test_vec = vectorizer.transform(X_test_clean)

In [18]:
# lr model
lr_model = get_model_v01()
lr_model.fit(X_train_vec, y_train)

y_pred_lr = lr_model.predict(X_test_vec)

cm_lr = evaluate(y_test, y_pred_lr)

                       precision    recall  f1-score   support

               Access     0.9202    0.8575    0.8878      1425
Administrative rights     0.8889    0.6136    0.7261       352
           HR Support     0.8659    0.8694    0.8677      2183
             Hardware     0.7894    0.9152    0.8477      2724
     Internal Project     0.9263    0.7712    0.8417       424
        Miscellaneous     0.8397    0.8237    0.8316      1412
             Purchase     0.9654    0.8479    0.9028       493
              Storage     0.9496    0.8144    0.8768       555

             accuracy                         0.8559      9568
            macro avg     0.8932    0.8141    0.8478      9568
         weighted avg     0.8618    0.8559    0.8556      9568



In [19]:
# Confusion metrix of lr
format_cm(cm_lr, class_names= list(lr_model.classes_), normalize=True)

Unnamed: 0,Pred: Access,Pred: Administrative rights,Pred: HR Support,Pred: Hardware,Pred: Internal Project,Pred: Miscellaneous,Pred: Purchase,Pred: Storage
True: Access,0.857544,0.000702,0.04,0.072281,0.003509,0.022456,0.000702,0.002807
True: Administrative rights,0.019886,0.613636,0.011364,0.321023,0.0,0.028409,0.005682,0.0
True: HR Support,0.013284,0.00229,0.869446,0.076042,0.00229,0.03344,0.0,0.003207
True: Hardware,0.014317,0.005874,0.032305,0.915198,0.002203,0.024229,0.003671,0.002203
True: Internal Project,0.011792,0.002358,0.087264,0.080189,0.771226,0.044811,0.0,0.002358
True: Miscellaneous,0.012748,0.000708,0.050992,0.101275,0.005666,0.823654,0.001416,0.003541
True: Purchase,0.002028,0.002028,0.016227,0.105477,0.004057,0.020284,0.84787,0.002028
True: Storage,0.012613,0.003604,0.05045,0.097297,0.0,0.021622,0.0,0.814414


In [20]:
# SVM model
svm_model = get_svm_model_v01()
svm_model.fit(X_train_vec, y_train)

y_pred_svm = svm_model.predict(X_test_vec)
cm_svm = evaluate(y_test, y_pred_svm)

                       precision    recall  f1-score   support

               Access     0.9049    0.8744    0.8894      1425
Administrative rights     0.8378    0.7045    0.7654       352
           HR Support     0.8688    0.8644    0.8666      2183
             Hardware     0.8246    0.8855    0.8540      2724
     Internal Project     0.8765    0.8373    0.8565       424
        Miscellaneous     0.8346    0.8329    0.8337      1412
             Purchase     0.9493    0.8742    0.9102       493
              Storage     0.9208    0.8793    0.8995       555

             accuracy                         0.8615      9568
            macro avg     0.8772    0.8441    0.8594      9568
         weighted avg     0.8629    0.8615    0.8615      9568



In [21]:
# Confusion metrix of svm
format_cm(cm_svm,class_names= list(svm_model.classes_), normalize=True)

Unnamed: 0,Pred: Access,Pred: Administrative rights,Pred: HR Support,Pred: Hardware,Pred: Internal Project,Pred: Miscellaneous,Pred: Purchase,Pred: Storage
True: Access,0.874386,0.001404,0.034386,0.061053,0.003509,0.020351,0.002105,0.002807
True: Administrative rights,0.017045,0.704545,0.014205,0.221591,0.005682,0.022727,0.011364,0.002841
True: HR Support,0.015117,0.003207,0.864407,0.065048,0.007787,0.038479,0.000458,0.005497
True: Hardware,0.022394,0.010279,0.038546,0.885463,0.004405,0.029001,0.004038,0.005874
True: Internal Project,0.004717,0.0,0.061321,0.058962,0.837264,0.033019,0.002358,0.002358
True: Miscellaneous,0.016289,0.004249,0.052408,0.07932,0.008499,0.832861,0.002125,0.004249
True: Purchase,0.002028,0.006085,0.010142,0.079108,0.004057,0.020284,0.874239,0.004057
True: Storage,0.009009,0.003604,0.037838,0.054054,0.0,0.016216,0.0,0.879279


In [22]:
# save artifacts
import joblib
joblib.dump(vectorizer, "../artifacts/tfidf_vectorizer_v01.pkl")
joblib.dump(lr_model, "../artifacts/logreg_model_v01.pkl")
joblib.dump(svm_model, "../artifacts/svm_model_v01.pkl")

['../artifacts/svm_model_v01.pkl']