In [1]:
import pandas as pd
import numpy as np

import re

import nltk
from nltk import DecisionTreeClassifier
from nltk import MaxentClassifier
from nltk import NaiveBayesClassifier
from nltk import pos_tag
from nltk import word_tokenize

from nltk.stem import WordNetLemmatizer 
# from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

import pickle

count_vect = CountVectorizer(stop_words="english")
lemmatizer = WordNetLemmatizer() 
# sb = SnowballStemmer('english')
stop_words = ['i', 'me', 'mine', 'he', 'she', 'it', 'a', 'an', 'the',
             'above', 'below', 'while', 'as', 'until', 'of', 'at',
             'if', 'to', 'or', 'was', 'were', 'itself', 'for',
             'other', 'both', 'any', 'all', 'between', 'do', 'does',
             'did', 'on', 'own', 'who', 'whom', 'this', 'that', 'has',
             'have', 'here', 'some', 'why', 'same',
             'so', 'is', 'be']

### Read Data

In [2]:
data = pd.read_csv("all_tickets.csv")
tickets_title = data[['title']]
tickets_body = data[['body']]
category = data[['category']]
business_service = data[['business_service']]
urgency = data[['urgency']]
impact = data[['impact']]
priority = data[['priority']]

### Tokenize + Lemmatize Data + Stopwords Removal

#### Title Data

In [4]:
print("title length:",len(tickets_title))
preprocessed_title = []
for i in range(0,43106):
    lemmatized = []
    word = tickets_title.loc[i,'title']
    tokenizeWord = word_tokenize(word)
    filtered_sentence = [w for w in tokenizeWord if not w in stop_words]
    filtered_sentence = []
    for w in tokenizeWord: 
        if w not in stop_words: 
            filtered_sentence.append(w)         

    for word in filtered_sentence:
        lemmatized.append(lemmatizer.lemmatize(word))
        
    preprocessed_title.append(lemmatized)    

title length: 43107


In [5]:
print(preprocessed_title[30020])

['call', 'action', 'add', 'user', 'project']


#### Body Data

In [7]:
print("body length:",len(tickets_body))
preprocessed_body = []
for i in range(0,43106):
    lemmatized = []
    word = tickets_body.loc[i,'body']
    tokenizeWord = word_tokenize(word)
    filtered_sentence = [w for w in tokenizeWord if not w in stop_words]
    filtered_sentence = []
    for w in tokenizeWord: 
        if w not in stop_words: 
            filtered_sentence.append(w)         

    for word in filtered_sentence:
        lemmatized.append(lemmatizer.lemmatize(word))
        
    preprocessed_body.append(lemmatized)   

body length: 43107


In [8]:
print(preprocessed_body[30097])

['friday', 'november', 'pm', 're', 'sorry', 'update', 'den', 'plan', 'pet', 'thanks', 'friday', 'november', 'pm', 'dear', 'please', 'add', 'confluence', 'also', 'please', 'let', 'different', 'configuration', 'tower', 'office', 'thanks', 'testing', 'discipline', 'lead']


# Split Data

In [9]:
titleData = data[["title"]]
bodyData = data[["body"]]
urgencyLabelData = urgency
impactLabelData = impact

# print(len(preprocessed_title))
# print(len(preprocessed_body))
# print(len(urgency))
# print(len(impact))

#TU = Title / Urgency
#TI = Title / Impact
#BU = Body / Urgency
#BI = Body / Impact

train_data_TU, test_data_TU, train_labels_TU, test_labels_TU = train_test_split(
    titleData, urgencyLabelData, test_size=0.2
)

train_data_TI, test_data_TI, train_labels_TI, test_labels_TI = train_test_split(
    titleData, impactLabelData, test_size=0.2
) 

train_data_BU, test_data_BU, train_labels_BU, test_labels_BU = train_test_split(
    bodyData, urgencyLabelData, test_size=0.2
) 

train_data_BI, test_data_BI, train_labels_BI, test_labels_BI = train_test_split(
    bodyData, impactLabelData, test_size=0.2
) 

# Feature Set Extraction

#### Title / Urgency Feature

In [10]:
vectorized_data = count_vect.fit_transform(train_data_TU)
vectorized_data.shape
tfidf = TfidfTransformer()
features = tfidf.fit_transform(vectorized_data)
features.shape

(1, 1)

In [11]:
print(len(train_data_TU))
print(len(train_labels_TU))
train_data = train_data_TU
train_labels = train_labels_TU

34485
34485


In [12]:
text_clf = Pipeline([
        ('vect', count_vect),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB(fit_prior=True))
    ])
text_clf = text_clf.fit(train_data_TU, train_labels_TU)

  y = column_or_1d(y, warn=True)


ValueError: Found input variables with inconsistent numbers of samples: [1, 34485]

In [None]:
print("Evaluating model")
# Score and evaluate model on test data using model without hyperparameter tuning
predicted = text_clf.predict(test_data)
prediction_acc = np.mean(predicted == test_labels)
print("Confusion matrix without GridSearch:")
print(metrics.confusion_matrix(test_labels, predicted))
print("Mean without GridSearch: " + str(prediction_acc))

# Score and evaluate model on test data using model WITH hyperparameter tuning
if use_grid_search:
    predicted = gs_clf.predict(test_data)
    prediction_acc = np.mean(predicted == test_labels)
    print("Confusion matrix with GridSearch:")
    print(metrics.confusion_matrix(test_labels, predicted))
    print("Mean with GridSearch: " + str(prediction_acc))

In [None]:
# Ploting confusion matrix with 'seaborn' module
# Use below line only with Jupyter Notebook
%matplotlib inline
import seaborn as sns
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import matplotlib
mat = confusion_matrix(test_labels, predicted)
plt.figure(figsize=(4, 4))
sns.set()
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False,
            xticklabels=np.unique(test_labels),
            yticklabels=np.unique(test_labels))
plt.xlabel('true label')
plt.ylabel('predicted label')
# Save confusion matrix to outputs in Workbench
# plt.savefig(os.path.join('.', 'outputs', 'confusion_matrix.png'))
plt.show()

In [None]:
%matplotlib inline
from sklearn.metrics import classification_report
print(classification_report(test_labels, predicted,
                            target_names=np.unique(test_labels)))