## Import

In [1]:
import sys
import nltk
import numpy as np
import pandas as pd
import pickle
import os

from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn import metrics
from matplotlib import pyplot as plt

# Add the notebook's directory and its parent to the module search path.
sys.path.extend([".", ".."])

## Set Classifier

In [2]:
# Algorithm selector used by the rest of the notebook.
# Supported algorithms:
#   DT
#   NB
#   SVM
classifier = "DT"

## Load datasets

In [3]:
# The input file is chosen by the classifier name set above.
tickets_csv_path = "./generated data/" + classifier + "_output.csv"

# dtype=str: every column is read as text, nothing is type-inferred.
dfTickets = pd.read_csv(tickets_csv_path, dtype=str)

## Split train and test data

In [4]:
# Build the modelling frame: two predictor columns followed by the target.
# reset_index(drop=True) gives the frame a fresh 0..n-1 index.
df = dfTickets[["urgency", "impact", "priority"]].reset_index(drop=True)

variables = df[["urgency", "impact"]]  # predictors
results = df["priority"]               # target

# Hold out 20% of the rows for evaluation.
# random_state is fixed so that Restart & Run All reproduces the same split
# (and therefore the same metrics); without it every run shuffles differently.
# NOTE(review): the classes look imbalanced in the stored report; consider
# stratify=results once every class is confirmed to have at least 2 rows.
train_data, test_data, train_labels, test_labels = train_test_split(
    variables, results, test_size=0.2, random_state=42
)

## Train model

In [5]:
# Instantiate the estimator selected by `classifier`.
# Any value other than "NB" or "SVM" falls back to a decision tree.
if classifier == "NB":
    clf = MultinomialNB()
elif classifier == "SVM":
    clf = SVC(kernel='linear')
else:
    # The tree permutes features at each split, so equally good splits can be
    # chosen differently between runs; a fixed random_state makes the fitted
    # tree reproducible.
    clf = DecisionTreeClassifier(random_state=42)

# fit() returns the estimator itself, so `clf` is the fitted model afterwards.
clf = clf.fit(train_data, train_labels)

## Evaluate Model

In [6]:
# Baseline evaluation on the held-out split (no hyperparameter tuning):
# predict, then report the confusion matrix and the share of exact matches.
predicted = clf.predict(test_data)
is_correct = predicted == test_labels
prediction_acc = np.mean(is_correct)

print("Confusion:")
confusion = metrics.confusion_matrix(test_labels, predicted)
print(confusion)
print("Mean: " + str(prediction_acc))

Confusion:
[[ 30   3   2]
 [  4  49  27]
 [  6  21 174]]
Mean: 0.8006329113924051


## Print the Classification Report

In [7]:
%matplotlib inline
from sklearn.metrics import classification_report
print(classification_report(test_labels, predicted,
                            target_names=np.unique(test_labels)))

              precision    recall  f1-score   support

           1       0.75      0.86      0.80        35
           2       0.67      0.61      0.64        80
           3       0.86      0.87      0.86       201

    accuracy                           0.80       316
   macro avg       0.76      0.78      0.77       316
weighted avg       0.80      0.80      0.80       316



## Save pickle

In [8]:
# Persist the fitted classifier under a name derived from the algorithm.
model_path = './model/' + classifier + '_priority_model.pickle'

# Create the output directory on a fresh checkout instead of failing.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# The context manager closes (and flushes) the file even if dump() raises;
# the bare open() used before left the handle open.
with open(model_path, "wb") as model_file:
    pickle.dump(clf, model_file)