In [None]:
import pickle
import re

import matplotlib
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk import DecisionTreeClassifier
from nltk import MaxentClassifier
from nltk import NaiveBayesClassifier
from nltk import pos_tag
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline


# Single WordNet lemmatizer instance reused by the title/body preprocessing code below.
lemmatizer = WordNetLemmatizer() 
# Disabled alternative: Snowball stemming instead of lemmatization.
# sb = SnowballStemmer('english')
# Hand-maintained English stop-word list applied during tokenization.
# Negations such as "no" / "not" / "nor" are not in the list, so they survive filtering.
stop_words = [
    # first-person pronouns
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'us',
    # second-person pronouns
    'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves',
    # third-person singular pronouns
    'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself',
    # neuter / plural pronouns
    'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
    # interrogatives and demonstratives
    'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those',
    # auxiliary verbs: be / have / do
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
    # articles and conjunctions
    'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
    # prepositions
    'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
    'during', 'before', 'after', 'above', 'below', 'to', 'from',
    # sequencing adverbs and modals
    'again', 'further', 'then', 'might', 'must', 'need', 'shall', 'once',
    # place / time / manner question words
    'here', 'there', 'when', 'where', 'why', 'how',
    # quantifiers
    'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
    # degree words
    'only', 'own', 'same', 'so', 'than', 'too', 'very',
    # contraction fragments and common auxiliaries
    's', 't', 'can', 'will', 'just', 'should', "should've", 'now',
    # leftover pieces of tokenized contractions
    'd', 'll', 'm', 'o', 're', 've', 'y', 'ain',
]

### Read Data

In [None]:
# Load the ticket export; dtype=str reads every column as text so values are
# never coerced to numbers.
dfTickets = pd.read_csv("all_tickets.csv", dtype=str)
# Columns referenced by earlier drafts of this cell: title, body, category,
# business_service, urgency, impact, priority -- TODO confirm against the CSV header.
# Preview only the first rows instead of printing the whole column.
print(dfTickets['title'].head())

### Tokenize + Lemmatize Data + Stopwords Removal

#### Title Data

def preprocess_texts(texts):
    """Lower-case, tokenize, stop-word-filter and lemmatize each text.

    Parameters
    ----------
    texts : iterable
        Raw ticket texts. Any non-string entry (e.g. a missing value read as
        NaN) is treated as an empty string instead of raising.

    Returns
    -------
    list of list of str
        One list of lemmatized tokens per input text, in input order.

    Notes
    -----
    Relies on the module-level ``stop_words`` list and ``lemmatizer``; NLTK's
    tokenizer and WordNet lemmatizer need their data packages downloaded.
    """
    # Set lookup is O(1); the list itself is left untouched.
    stop_set = set(stop_words)
    processed = []
    for text in texts:
        if not isinstance(text, str):
            text = ""
        tokens = word_tokenize(text.lower())
        processed.append(
            [lemmatizer.lemmatize(token) for token in tokens if token not in stop_set]
        )
    return processed


# Derive the column view here so the cell does not rely on leftover kernel state.
tickets_title = dfTickets[['title']]
print("title length:", len(tickets_title))
# Iterate over the actual rows rather than a hard-coded row count.
preprocessed_title = preprocess_texts(tickets_title['title'])

print(preprocessed_title[1])

#### Body Data

tickets_body = dfTickets[['body']]
print("body length:", len(tickets_body))
preprocessed_body = preprocess_texts(tickets_body['body'])

print(preprocessed_body[1])

# Split Data

In [None]:
# Target column to predict and the text column used as model input.
column_to_predict = "urgency"
text_columns = "title"

# Select 1-D Series (single brackets). A one-column DataFrame would be iterated
# by column label by CountVectorizer, yielding a single "document" and a
# sample-count mismatch when the pipeline is fitted.
# Missing text is replaced by an empty string because the vectorizer rejects NaN.
data = dfTickets[text_columns].fillna("")
labelData = dfTickets[column_to_predict]

# Fixed seed so the 80/20 split (and every metric derived from it) is reproducible.
train_data, test_data, train_labels, test_labels = train_test_split(
    data, labelData, test_size=0.2, random_state=42
)

# Feature Set Extraction

#### Title / Urgency Feature

In [None]:
class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer output is run through an English Snowball stemmer."""

    def build_analyzer(self):
        """Return a callable that maps a document to a list of stemmed tokens.

        The parent class's analyzer is built first; every token it produces is
        then passed through ``SnowballStemmer("english", ignore_stopwords=True)``.
        """
        base_analyzer = super().build_analyzer()
        snowball = SnowballStemmer("english", ignore_stopwords=True)

        def stemmed_analyzer(doc):
            return [snowball.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer

In [None]:
# Build the stemming vectorizer with scikit-learn's built-in English stop-word
# list and learn the vocabulary from the training text.
# NOTE(review): fit_transform iterates over its argument to obtain documents, so
# train_data should be a 1-D sequence of strings -- confirm it is not a DataFrame.
count_vect = StemmedCountVectorizer(stop_words="english")
vectorized_data = count_vect.fit_transform(train_data)
# Displayed as the cell output: shape of the document-term count matrix.
vectorized_data.shape

In [None]:
# Re-weight the raw term counts with TF-IDF. This standalone transformer is for
# inspecting the feature matrix; the pipeline cell creates its own TfidfTransformer.
tfidf = TfidfTransformer()
features = tfidf.fit_transform(vectorized_data)
# Displayed as the cell output: shape of the TF-IDF feature matrix.
features.shape

In [None]:
# End-to-end text classifier: stemmed term counts -> TF-IDF weighting ->
# multinomial Naive Bayes. fit() returns the fitted pipeline itself, so
# construction and training are chained into one assignment.
text_clf = Pipeline(steps=[
    ('vect', count_vect),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB(fit_prior=True)),
]).fit(train_data, train_labels)

In [None]:
def _evaluate(model, tag):
    """Score ``model`` on the held-out split and print its confusion matrix and accuracy.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    tag : str
        "without" or "with"; inserted into the printed headings.

    Returns
    -------
    tuple
        ``(predictions, accuracy)`` where accuracy is the fraction of test rows
        whose predicted label equals the true label.
    """
    # np.ravel accepts either a Series or a single-column DataFrame of labels,
    # so the element-wise comparison below is always 1-D against 1-D.
    y_true = np.ravel(test_labels)
    y_pred = model.predict(test_data)
    accuracy = np.mean(y_pred == y_true)
    print("Confusion matrix " + tag + " GridSearch:")
    print(metrics.confusion_matrix(y_true, y_pred))
    print("Mean " + tag + " GridSearch: " + str(accuracy))
    return y_pred, accuracy


print("Evaluating model")
# Score and evaluate model on test data using model without hyperparameter tuning
predicted, prediction_acc = _evaluate(text_clf, "without")

# Score and evaluate model on test data using model WITH hyperparameter tuning.
# No grid search is set up in the cells above, so default to skipping this branch
# unless an earlier cell has defined use_grid_search (and gs_clf).
use_grid_search = globals().get("use_grid_search", False)
if use_grid_search:
    predicted, prediction_acc = _evaluate(gs_clf, "with")

In [None]:
# Ploting confusion matrix with 'seaborn' module
# Use below line only with Jupyter Notebook
%matplotlib inline
import seaborn as sns
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import matplotlib
mat = confusion_matrix(test_labels, predicted)
plt.figure(figsize=(4, 4))
sns.set()
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False,
            xticklabels=np.unique(test_labels),
            yticklabels=np.unique(test_labels))
plt.xlabel('true label')
plt.ylabel('predicted label')
# Save confusion matrix to outputs in Workbench
# plt.savefig(os.path.join('.', 'outputs', 'confusion_matrix.png'))
plt.show()

In [None]:
# Per-class precision / recall / F1 on the held-out split.
# labels and target_names share one ordering covering every class seen in either
# the true or the predicted labels, so their lengths always match.
class_labels = np.unique(np.concatenate([np.ravel(test_labels), np.ravel(predicted)]))
print(classification_report(np.ravel(test_labels), predicted,
                            labels=class_labels,
                            target_names=class_labels))