In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np

In [2]:
df=pd.read_excel('Automatic_Ticket_Assignment.xlsx')
df.columns=['ShortDescription','Description', 'Caller', 'AssignmentGroup']

In [3]:
group_counts = df['AssignmentGroup'].value_counts()
to_remove = group_counts[group_counts < 200].index
df = df[~df['AssignmentGroup'].isin(to_remove)]

In [4]:
df['Description'] = df.apply(lambda row: row['ShortDescription'] if pd.isna(row['Description']) else row['Description'], axis=1)
df['ShortDescription'] = df.apply(lambda row: row['Description'] if pd.isna(row['ShortDescription']) else row['ShortDescription'], axis=1)

In [5]:
import contractions

def apply_contractions(text):
    new_phrase = []
    for word in text.split():
        new_phrase.append(contractions.fix(word))
        
    return ' '.join(new_phrase)
        
# Expanding Contractions in the reviews
df['Description']=df['Description'].apply(lambda x:apply_contractions(x))
df['ShortDescription']=df['ShortDescription'].apply(lambda x:apply_contractions(x))

In [6]:
import nltk
from nltk.corpus import stopwords
import re

stop_words = set(stopwords.words('english'))

def text_cleaner(text):
    #converting to lowercase
    newString = text.lower()
    #removing links
    newString = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', newString) 
    #fetching alphabetic characters
    newString = re.sub("[^a-zA-Z]", " ", newString)
    #removing stop words
    tokens = [w for w in newString.split() if not w in stop_words] 
    long_words=[]
    for i in tokens:
        #removing short words
        if len(i)>1:                                                 
            long_words.append(i)   
    return (" ".join(long_words)).strip()

cleaned_text=[]
for i in df['Description']:
    cleaned_text.append(text_cleaner(i))

df['Cleaned_Description'] = cleaned_text

cleaned_text=[]
for i in df['ShortDescription']:
    cleaned_text.append(text_cleaner(i))

df['ShortDescription'] = cleaned_text

In [7]:
df['Consolidated_Text']=df['Cleaned_Description'] + ' ' + df['ShortDescription']

In [8]:
from nltk.tokenize import word_tokenize

# function to genarate word tokens for tokenizers

def tokenization_func(text):
        return word_tokenize(text)

list_for_sentence_word_tokens = []

for sen in df.Consolidated_Text:
    list_for_sentence_word_tokens.append(tokenization_func(sen))

In [9]:
# Part Of Speech Tagging

list_of_sen_with_part_of_speech_tagging = []
for sen_list in list_for_sentence_word_tokens:
    list_of_sen_with_part_of_speech_tagging.append(nltk.pos_tag(sen_list))

In [10]:
# Lemmatization

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Map POS tags to wordnet tags
# This step is necessary because the lemmatizer requires WordNet tags instead of POS tags
wordnet_tags = {'N': wordnet.NOUN, 'V': wordnet.VERB, 'R': wordnet.ADV, 'J': wordnet.ADJ}

lemmatizer = WordNetLemmatizer()

# Normalize the words using lemmatization with the appropriate POS tags
list_of_lemmatized_sen = []
for sen_list in list_of_sen_with_part_of_speech_tagging:
    lemmas = []
    for word, pos in sen_list:
        if pos[0] in wordnet_tags:
            tag = wordnet_tags[pos[0]]
            lemma = lemmatizer.lemmatize(word, tag)
            lemmas.append(lemma)
        else:
            lemmas.append(word)

    # Join the lemmas back into a normalized sentence
    normalized_sentence = " ".join(lemmas)
    # insert the lemmatized(normalized_sentence) sentence in a new list called list_of_lemmatized_sen
    list_of_lemmatized_sen.append(normalized_sentence)

df['Consolidated_Text'] = list_of_lemmatized_sen

In [11]:
new_df = df[['Consolidated_Text','AssignmentGroup']]
new_df.columns=['text','label']

In [12]:
new_df.drop_duplicates(subset='text', keep='first', inplace=True)

In [13]:
from sklearn import preprocessing

def labelencoder(dataframe) : 
  label_encoder = preprocessing.LabelEncoder() 
  dataframe= label_encoder.fit_transform(dataframe)
  
  return dataframe

new_df['label'] = labelencoder(new_df['label'])

In [15]:
from gensim.models import Word2Vec

sentences = [line.split(' ') for line in new_df['text']]
word2vec = Word2Vec(sentences=sentences,min_count=1) # min_count=1 means that we are considering all the words in the corpus
# This file will be used later to load the embeddings into memory for training a neural network
# By default each word will be represented by a 100 dimensional vector
word2vec.wv.save_word2vec_format('word2vec_vector.txt')

# load the whole embedding
embeddings_index = dict()
f = open('word2vec_vector.txt')

for line in f:
	values = line.split()
	word = values[0]
	coefs = np.asarray(values[1:], dtype='float32') # converts the string vectors to float and store in a numpy array
	embeddings_index[word] = coefs # store the word and its corresponding vector in a dictionary
f.close()

print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 10193 word vectors.


In [25]:
from keras.callbacks import ReduceLROnPlateau
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Flatten, Bidirectional, Conv1D, MaxPooling1D, GRU
from keras.models import Model
import tensorflow as tf
from sklearn import metrics
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from imblearn.over_sampling import SMOTE
import pickle

maxlen = 100
numWords=10000
epochs = 20
batch_size = 128

In [26]:
class FeedModelData:
    
    X_test=[]
    y_test=[]
    embedding_matrix=[]

    # 1
    def getData(self, dataframe):

      X,y = self.tokenizeAndEmbedding(dataframe)
      X_train, X_test, y_train, y_test, X_Val, y_Val = self.splitData(X,y)      
 
      return X_train, X_test, y_train, y_test, X_Val, y_Val, self.embedding_matrix
  
    # 2
    def tokenizeAndEmbedding(self,dataframe):

      tokenizer,X = self.wordTokenizer(dataframe['text'])
      y = np.asarray(dataframe['label'])
      X = pad_sequences(X, maxlen = maxlen, padding='post', truncating='post') # ensure that all sequences have the same length                
      self.embedding_matrix = np.zeros((numWords+1, 100))

      for i,word in tokenizer.index_word.items():
        if i<numWords+1: # we are taking only the first 9000 words
          embedding_vector = embeddings_index.get(word)
          if embedding_vector is not None:
              self.embedding_matrix[i] = embedding_vector

      return X,y
    
    # 3
    def wordTokenizer(self, dataframe):
      tokenizer = Tokenizer(num_words=numWords, lower=True, split=' ', char_level=False)
      tokenizer.fit_on_texts(dataframe) # convert each word in the text into a unique integer ID
      dataframe = tokenizer.texts_to_sequences(dataframe) # transform each text in dataframe into a sequence of integer indices
      
      with open('tokenizer.pickle', 'wb') as handle:
          pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
      
      return tokenizer,dataframe

    # 4
    def splitData(self,X,y):

      X_train, self.X_test, y_train, self.y_test = train_test_split(X, y, test_size=0.2, random_state=10)     
      X_train, X_Val, y_train, y_Val = train_test_split(X_train, y_train, test_size=0.1, random_state=10)
      
      #k-fold cross validation
      # check classes count
             
      # smote = SMOTE(random_state=42)
      # X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)     
      # return X_train_resampled, self.X_test, y_train_resampled, self.y_test, X_Val, y_Val
      return X_train, self.X_test, y_train, self.y_test, X_Val, y_Val

In [27]:
class GRUModel:
    
    model= Model()
    
    def train(self, X_train, y_train, X_Val, y_Val, embedding_matrix, batch_size, epochs):
        
      input_layer = Input(shape=(maxlen,), dtype=tf.int64)
      embed = Embedding(numWords+1, output_dim=100, input_length=maxlen, weights=[embedding_matrix], trainable=True)(input_layer)
      
      gru = GRU(128, return_sequences=True)(embed)
      gru = GRU(64)(gru)
      
      dense = Dense(128, activation='relu')(gru)
      drop=Dropout(0.5)(dense)    
      
      out = Dense(len((pd.Series(y_train)).unique()), activation='softmax')(drop) 
      self.model = Model(input_layer, out)    
      self.model.compile(loss='sparse_categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), metrics=['accuracy'])
      self.model.summary()
  
      reduceLoss = ReduceLROnPlateau(monitor='val_loss', factor=0.8, patience=1, min_lr=0.0001)
      
      model_history = self.model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, callbacks=[reduceLoss], validation_data=(X_Val,y_Val))
  
      return model_history, self.model

In [28]:
class Prediction:
    
    def prediction(self, model, X_test, y_test):
      
      pred = model.predict(X_test)
      df_pred = pd.DataFrame(pred, columns=['tech1', 'tech2', 'tech3', 'tech4', 'tech5', 'tech6', 'tech7', 'tech8'])
      pred = [i.argmax() for i in pred]
      
      accuracy = metrics.accuracy_score(y_test, pred)
      precision = metrics.precision_score(y_test, pred, average='weighted')
      recall = metrics.recall_score(y_test, pred, average='weighted')
      f1score = metrics.f1_score(y_test, pred, average='weighted')
      
      print("Precision of Gated Recurrent Unit model: ", precision)
      print("Recall of Gated Recurrent Unit model: ", recall)
      print("F1-Score of Gated Recurrent Unit model: ", f1score)
      print("Accuracy of Gated Recurrent Unit model:", accuracy)
      
      return df_pred

In [29]:
# get x_train, x_test, y_train, y_test, x_Val, y_Val, embedding_matrix to feed the model
FeedModelData = FeedModelData()
x_train, x_test, y_train, y_test, x_Val, y_Val, embedding_matrix = FeedModelData.getData(new_df)

In [30]:
# Define and train the model
GRUModel = GRUModel()
model_history, model = GRUModel.train(x_train, y_train, x_Val, y_Val, embedding_matrix, batch_size, epochs)

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 100)]             0         
                                                                 
 embedding_2 (Embedding)     (None, 100, 100)          1000100   
                                                                 
 gru (GRU)                   (None, 100, 128)          88320     
                                                                 
 gru_1 (GRU)                 (None, 64)                37248     
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 8)                 1032

In [31]:
# test model on unseen data and get prediction array
Prediction = Prediction()
all_predictions = Prediction.prediction(model, x_test, y_test)

Precision of Gated Recurrent Unit model:  0.5478716179935692
Recall of Gated Recurrent Unit model:  0.6923076923076923
F1-Score of Gated Recurrent Unit model:  0.5855577716800268
Accuracy of Gated Recurrent Unit model: 0.6923076923076923


  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
model.save('GRU_model.h5')

In [33]:
all_predictions.to_csv('GRU_model_predictions.csv', index=False)