## Imports

In [1]:
import sys
import nltk
import numpy as np
import pandas as pd
import pickle

from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

from sklearn import metrics
import os
from matplotlib import pyplot as plt
# Let Python resolve modules that live in the working directory and in its
# parent, in that order, after the entries already on the search path.
sys.path.extend([".", ".."])

## Set Classifier

In [2]:
# Key selecting which model this notebook trains. The training cell recognises
# "NB", "D3", "SVM" and "KNN"; any other value falls back to a decision tree.
# The same key is embedded in the input CSV name and the output pickle name.
classifier = "SVM"

## Load datasets

In [3]:
# Every classifier key has its own weight file; all columns are read as text.
weights_csv_path = "./test data/xtremax/" + classifier + "_weight.csv"
dfTickets = pd.read_csv(weights_csv_path, dtype=str)

## Split train and test data

In [4]:
# Build the modelling frame: two predictor columns plus the target.
FEATURE_COLUMNS = ["urgency", "impact"]
TARGET_COLUMN = "priority"
# Fixed seed so the split, and therefore every score reported below, is the
# same on each Restart & Run All.
SPLIT_SEED = 42

# Select by column name rather than by position so the predictors/target
# separation does not depend on column order.
df = dfTickets[FEATURE_COLUMNS + [TARGET_COLUMN]].reset_index(drop=True)

# The CSV is loaded with dtype=str, so convert the predictors to numbers here
# instead of relying on the estimator to coerce text to float during fit().
# The target stays as text: it is a class label, not a quantity.
variables = df[FEATURE_COLUMNS].apply(pd.to_numeric)
results = df[TARGET_COLUMN]

# Hold out 20% of the rows for evaluation.
train_data, test_data, train_labels, test_labels = train_test_split(
    variables, results, test_size=0.2, random_state=SPLIT_SEED
)

## Train the classifier

In [5]:
# Map each supported `classifier` key to a factory for its estimator.
# Factories are stored instead of instances so that only the selected model
# is actually constructed.
estimator_factories = {
    "NB": MultinomialNB,
    "D3": DecisionTreeClassifier,
    "SVM": lambda: SVC(kernel='linear'),
    "KNN": lambda: KNeighborsClassifier(n_neighbors=3),
}

# An unrecognised key falls back to a decision tree.
build_estimator = estimator_factories.get(classifier, DecisionTreeClassifier)

# fit() returns the estimator itself, so the fitted model is bound directly.
text_clf = build_estimator().fit(train_data, train_labels)

## Evaluate Model

In [6]:
print("Evaluating model")
# Evaluate the untuned model on the held-out split: predict, then report the
# confusion matrix and the fraction of predictions equal to the true label.
predicted = text_clf.predict(test_data)
is_correct = predicted == test_labels
prediction_acc = np.mean(is_correct)
print("Confusion:")
confusion = metrics.confusion_matrix(test_labels, predicted)
print(confusion)
print(f"Mean: {prediction_acc!s}")

Evaluating model
Confusion:
[[  2  19  17]
 [  0  31  46]
 [  0  15 186]]
Mean: 0.6930379746835443


## Plot the Confusion Matrix

In [None]:
# Ploting confusion matrix with 'seaborn' module
# Use below line only with Jupyter Notebook
%matplotlib inline
import seaborn as sns
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import matplotlib
mat = confusion_matrix(test_labels, predicted)
plt.figure(figsize=(4, 4))
sns.set()
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False,
            xticklabels=np.unique(test_labels),
            yticklabels=np.unique(test_labels))
plt.xlabel('true label')
plt.ylabel('predicted label')
# Save confusion matrix to outputs in Workbench
# plt.savefig(os.path.join('.', 'outputs', 'confusion_matrix.png'))
plt.show()

## Print the Classification Report

In [None]:
from sklearn.metrics import classification_report

# Report on every class seen in either the true or the predicted labels.
# Passing the same list as `labels` and `target_names` keeps the two aligned;
# names taken from test_labels alone make classification_report raise when
# the model predicts a class that does not occur in the test split.
report_labels = np.unique(
    np.concatenate([np.asarray(test_labels), np.asarray(predicted)])
)
print(classification_report(test_labels, predicted,
                            labels=report_labels,
                            target_names=[str(label) for label in report_labels]))

## Save pickle

In [None]:
# Persist the fitted classifier under a name derived from the classifier key.
model_path = './pickle/xtremax/' + classifier + '_priority_model.pickle'
# Create the target directory on first use instead of failing on a fresh checkout.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
# The context manager flushes and closes the file even if pickling fails.
with open(model_path, "wb") as model_file:
    pickle.dump(text_clf, model_file)

## Test by manual input

In [None]:
# Three hand-written tickets, column-wise; values are text like the CSV data.
test_dic = dict(
    urgency=["1", "3", "1"],
    impact=["1", "1", "3"],
)

# Same column names and order as the training predictors.
test_data2 = pd.DataFrame.from_dict(test_dic)
test_pred = text_clf.predict(test_data2)
print(test_pred)