# Imports

In [1]:
import os
import pickle
import sys

import nltk
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
from nltk.stem.snowball import SnowballStemmer
from sklearn import metrics
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Make modules in the notebook's directory and its parent importable
sys.path.extend([".", ".."])

# Set Params

In [2]:
# Input: name of the dfTickets column whose text is used as model input
text_columns = "description"
# Supported columns:
# summary
# description

# Output: name of the dfTickets column the model is trained to predict
column_to_predict = "urgency"
# Supported columns:
# impact
# urgency

# Classifier code selecting which estimator is placed at the end of the pipeline
classifier = "SVM"
# Supported algorithms:
# DT
# NB
# SVM

# Removes stop words from processed text
remove_stop_words = True
stop_words_lang = 'english'

# Word stemming using NLTK
use_stemming = False

# fit_prior: whether to learn class prior probabilities or not.
# If false, a uniform prior will be used.
# Priors are learned only when stemming is disabled.
fit_prior = not use_stemming

# Label encoder instance (requires `from sklearn import preprocessing` in the
# import cell); it is not used by the cells that follow in this section.
le = preprocessing.LabelEncoder()

# Load Data Set

In [3]:
# Load the ticket dump with the pure-Python CSV engine, reading every column
# as a string so label values stay textual.
dfTickets = pd.read_csv('./dataset/tickets_1.csv', dtype=str, engine='python')

# Split Data Set into Train and Test Data

In [4]:
labelData = dfTickets[column_to_predict]
# Missing text cells are loaded as NaN, which the count vectorizer rejects as
# an invalid document; treat them as empty documents instead.
data = dfTickets[text_columns].fillna("")

# Split dataset into training and testing data with 80:20 ratio.
# A fixed random_state keeps the split (and therefore every metric reported
# below) reproducible across Restart & Run All.
train_data, test_data, train_labels, test_labels = train_test_split(
    data,
    labelData,
    test_size=0.2,
    random_state=42,
)

# Extract Features from the Text

In [5]:
class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer variant whose analyzer stems every token it produces."""

    def build_analyzer(self):
        """Return a callable that maps a document to its list of stemmed tokens.

        The parent class's analyzer produces the tokens; each one is then
        passed through an English Snowball stemmer created with
        ``ignore_stopwords=True``.
        """
        base_analyzer = super().build_analyzer()
        snowball = SnowballStemmer("english", ignore_stopwords=True)

        def stemmed_analyzer(doc):
            return [snowball.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer

## Count Vectorizer

In [6]:
# Count Vectorizer = tokenizer
# Convert a collection of text documents to a matrix of token counts.
#
# The two switches are independent of each other:
#   use_stemming      -> which vectorizer class is instantiated
#   remove_stop_words -> whether a stop-word list is handed to that vectorizer
# (Previously stemming was silently skipped whenever stop-word removal was on,
# and stop words were removed whenever stemming was on.)
vectorizer_cls = StemmedCountVectorizer if use_stemming else CountVectorizer
active_stop_words = stop_words_lang if remove_stop_words else None
count_vect = vectorizer_cls(stop_words=active_stop_words)

vectorized_data = count_vect.fit_transform(train_data)

## Use TF-IDF

In [7]:
# Learn IDF weights from the training counts, then re-weight those same counts
tfidf = TfidfTransformer()
features = tfidf.fit(vectorized_data).transform(vectorized_data)

# Use a Pipeline to Preprocess Data and Train the Classifier

In [8]:
# Factories for the final estimator, keyed by the `classifier` code from the
# parameter cell. Only the last pipeline step differs between algorithms.
estimator_factories = {
    "NB": lambda: MultinomialNB(fit_prior=fit_prior),
    "DT": lambda: DecisionTreeClassifier(random_state=42),
    "SVM": lambda: SVC(kernel='linear'),
}

# Fail with a clear message instead of a NameError (or a stale `clf` from an
# earlier run) when the configured code is not one of the supported ones.
if classifier not in estimator_factories:
    raise ValueError(
        "Unsupported classifier %r; expected one of %s"
        % (classifier, sorted(estimator_factories))
    )

# vectorize -> TF-IDF weight -> oversample minority classes -> classify.
# The oversampler is seeded so repeated runs resample identically.
clf = Pipeline([
    ('vect', count_vect),
    ('tfidf', TfidfTransformer()),
    ('samp', RandomOverSampler(random_state=42)),
    ('clf', estimator_factories[classifier]()),
])
clf = clf.fit(train_data, train_labels)

# Use GridSearchCV to Find the Best Params Set

In [9]:
# Hyper-parameter grid; keys are "<pipeline step>__<parameter>"
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False)
}

# `clf` is rebound to the search object below, so on a re-run of this cell it
# already is a GridSearchCV. Unwrap it first; otherwise the search would be
# nested inside itself and the step-prefixed parameter names above would no
# longer match.
base_estimator = clf.estimator if isinstance(clf, GridSearchCV) else clf

# Create GS instance by passing the classifier, parameters and n_jobs=-1 (use multiple cores from user machine)
clf = GridSearchCV(base_estimator, parameters, cv=10, n_jobs=-1)
clf = clf.fit(train_data, train_labels)

# Save Model to Pickle

In [10]:
# File name encodes the algorithm, the input column and the predicted column
model_path = f'./model/{classifier}_{text_columns}_{column_to_predict}_model.pickle'

# Create the output directory on first use so the write cannot fail on a
# fresh checkout.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# The context manager guarantees the file is flushed and closed even if
# pickling raises part-way through.
with open(model_path, "wb") as model_file:
    pickle.dump(clf, model_file)

# Evaluate the Model

In [11]:
# Predict labels for the held-out test split with the fitted model
predicted = clf.predict(test_data)

# Accuracy: fraction of test rows whose predicted label equals the true label
prediction_acc = np.mean(predicted == test_labels)
confusion = metrics.confusion_matrix(test_labels, predicted)

print("Confusion:", confusion, f"Mean: {prediction_acc!s}", sep="\n")

Confusion:
[[ 12   7  21]
 [ 13  21  42]
 [ 29  13 158]]
Mean: 0.6044303797468354


# Print the Classification Report

In [12]:
# Per-class precision / recall / F1 on the test split.
# No target_names are passed: the report derives its rows from the labels
# found in both the true and the predicted values and prints those labels as
# row names. Building target_names from test_labels alone raised a ValueError
# whenever a predicted class was absent from the test labels.
# (classification_report is imported in the import cell at the top.)
print(classification_report(test_labels, predicted))

              precision    recall  f1-score   support

           1       0.22      0.30      0.26        40
           2       0.51      0.28      0.36        76
           3       0.71      0.79      0.75       200

    accuracy                           0.60       316
   macro avg       0.48      0.46      0.45       316
weighted avg       0.60      0.60      0.59       316

