In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np

In [2]:
df=pd.read_excel('Automatic_Ticket_Assignment.xlsx')
df.columns=['ShortDescription','Description', 'Caller', 'AssignmentGroup']

In [3]:
group_counts = df['AssignmentGroup'].value_counts()
to_remove = group_counts[group_counts < 200].index
df = df[~df['AssignmentGroup'].isin(to_remove)]

In [4]:
df['Description'] = df.apply(lambda row: row['ShortDescription'] if pd.isna(row['Description']) else row['Description'], axis=1)
df['ShortDescription'] = df.apply(lambda row: row['Description'] if pd.isna(row['ShortDescription']) else row['ShortDescription'], axis=1)

In [5]:
import contractions

def apply_contractions(text):
    new_phrase = []
    for word in text.split():
        new_phrase.append(contractions.fix(word))
        
    return ' '.join(new_phrase)
        
# Expanding Contractions in the reviews
df['Description']=df['Description'].apply(lambda x:apply_contractions(x))
df['ShortDescription']=df['ShortDescription'].apply(lambda x:apply_contractions(x))

In [6]:
import nltk
from nltk.corpus import stopwords
import re

stop_words = set(stopwords.words('english'))

def text_cleaner(text):
    #converting to lowercase
    newString = text.lower()
    #removing links
    newString = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', newString) 
    #fetching alphabetic characters
    newString = re.sub("[^a-zA-Z]", " ", newString)
    #removing stop words
    tokens = [w for w in newString.split() if not w in stop_words] 
    long_words=[]
    for i in tokens:
        #removing short words
        if len(i)>1:                                                 
            long_words.append(i)   
    return (" ".join(long_words)).strip()

cleaned_text=[]
for i in df['Description']:
    cleaned_text.append(text_cleaner(i))

df['Cleaned_Description'] = cleaned_text

cleaned_text=[]
for i in df['ShortDescription']:
    cleaned_text.append(text_cleaner(i))

df['ShortDescription'] = cleaned_text

In [7]:
df['Consolidated_Text']=df['Cleaned_Description'] + ' ' + df['ShortDescription']

In [8]:
from nltk.tokenize import word_tokenize

# function to genarate word tokens for tokenizers

def tokenization_func(text):
        return word_tokenize(text)

list_for_sentence_word_tokens = []

for sen in df.Consolidated_Text:
    list_for_sentence_word_tokens.append(tokenization_func(sen))

In [9]:
# Part Of Speech Tagging

list_of_sen_with_part_of_speech_tagging = []
for sen_list in list_for_sentence_word_tokens:
    list_of_sen_with_part_of_speech_tagging.append(nltk.pos_tag(sen_list))

In [10]:
# Lemmatization

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Map POS tags to wordnet tags
# This step is necessary because the lemmatizer requires WordNet tags instead of POS tags
wordnet_tags = {'N': wordnet.NOUN, 'V': wordnet.VERB, 'R': wordnet.ADV, 'J': wordnet.ADJ}

lemmatizer = WordNetLemmatizer()

# Normalize the words using lemmatization with the appropriate POS tags
list_of_lemmatized_sen = []
for sen_list in list_of_sen_with_part_of_speech_tagging:
    lemmas = []
    for word, pos in sen_list:
        if pos[0] in wordnet_tags:
            tag = wordnet_tags[pos[0]]
            lemma = lemmatizer.lemmatize(word, tag)
            lemmas.append(lemma)
        else:
            lemmas.append(word)

    # Join the lemmas back into a normalized sentence
    normalized_sentence = " ".join(lemmas)
    # insert the lemmatized(normalized_sentence) sentence in a new list called list_of_lemmatized_sen
    list_of_lemmatized_sen.append(normalized_sentence)

df['Consolidated_Text'] = list_of_lemmatized_sen

In [11]:
new_df = df[['Consolidated_Text','AssignmentGroup']]
new_df.columns=['text','label']

In [12]:
new_df.drop_duplicates(subset='text', keep='first', inplace=True)

In [12]:
from sklearn import preprocessing

def labelencoder(dataframe) : 
  label_encoder = preprocessing.LabelEncoder() 
  dataframe= label_encoder.fit_transform(dataframe)
  
  return dataframe

new_df['label'] = labelencoder(new_df['label'])

In [23]:
new_df['label'].value_counts()

label
0    3976
6     661
4     289
1     257
7     252
3     241
2     215
5     200
Name: count, dtype: int64

In [13]:
from gensim.models import Word2Vec

sentences = [line.split(' ') for line in new_df['text']]
word2vec = Word2Vec(sentences=sentences,min_count=1) # min_count=1 means that we are considering all the words in the corpus
# This file will be used later to load the embeddings into memory for training a neural network
# By default each word will be represented by a 100 dimensional vector
word2vec.wv.save_word2vec_format('word2vec_vector.txt')

# load the whole embedding
embeddings_index = dict()
f = open('word2vec_vector.txt')

for line in f:
	values = line.split()
	word = values[0]
	coefs = np.asarray(values[1:], dtype='float32') # converts the string vectors to float and store in a numpy array
	embeddings_index[word] = coefs # store the word and its corresponding vector in a dictionary
f.close()

print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 10193 word vectors.


In [14]:
from sklearn.model_selection import train_test_split

_1, X_test, _2, y_test = train_test_split(new_df['text'], new_df['label'], test_size=0.2, random_state=10)     
_1, X_Val, _2, y_Val = train_test_split(_1, _2, test_size=0.1, random_state=10)

In [None]:
# Convert X_train and y_train into DataFrames
train_df = pd.DataFrame({'text': _1, 'label': _2})

In [None]:
import nlpaug.augmenter.word as naw


label_counts = new_df['label'].value_counts()

# Find the majority class (class with the highest count)
majority_class = label_counts.idxmax()

# Calculate the augmentation ratio for each class
augmentation_ratio = {}
for label, count in label_counts.items():
    if label != majority_class:
        ratio = int(label_counts[majority_class] / count)
        augmentation_ratio[label] = ratio
    else:
        continue


# Initialize the augmenter
augmenter = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased', action="insert"
)

import time
count = 0

new_train_df = train_df[~train_df['label'].isin([majority_class])]

# Augment the data
for i, (label, text) in enumerate(zip(new_train_df['label'], new_train_df['text'])):
    start_time = time.time()
    count += 1
    print("we are at index : ", count)
    augmented_text = text
    
    if label in augmentation_ratio and count <= new_train_df.count():
        ratio = augmentation_ratio[label]
        augmented_text = augmenter.augment(augmented_text, n=int(ratio))
        new_row = pd.DataFrame({'text': augmented_text, 'label': label})
        train_df = pd.concat([train_df, new_row], ignore_index=True)
        
    end_time = time.time()
    execution_time = end_time - start_time
    print("Average time per augmented sentence:", execution_time)

In [None]:
train_df.to_csv('Augmented_train_df.csv', index=False)

In [15]:
# We'll use this one for demo because generating augmented text takes too much time
train_df = pd.read_csv('Augmented_train_df.csv')

In [None]:
train_df['label'].value_counts()

In [18]:
import keras
from keras.callbacks import ReduceLROnPlateau
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Flatten, Bidirectional, Conv1D, MaxPooling1D
from keras.models import Model
import tensorflow as tf
from sklearn import metrics
from sklearn.metrics import classification_report
from keras.utils import pad_sequences
from keras.preprocessing.text import Tokenizer
import pickle

maxlen = 128
numWords=10000
epochs = 20
batch_size = 128

In [19]:
tokenizer = Tokenizer(num_words=numWords, lower=True, split=' ', char_level=False)
tokenizer.fit_on_texts(new_df['text'])

with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [20]:
X_train = tokenizer.texts_to_sequences(train_df['text'])
X_test = tokenizer.texts_to_sequences(X_test)
X_Val = tokenizer.texts_to_sequences(X_Val)

In [21]:
X_train = pad_sequences(X_train, maxlen = maxlen, padding='post', truncating='post')
X_test = pad_sequences(X_test, maxlen = maxlen, padding='post', truncating='post')
X_Val = pad_sequences(X_Val, maxlen = maxlen, padding='post', truncating='post')

In [22]:
y_train = np.asarray(train_df['label'])
y_test = np.asarray(y_test)
y_Val = np.asarray(y_Val)

In [24]:
embedding_matrix=[]

embedding_matrix = np.zeros((numWords+1, 100))

for i,word in tokenizer.index_word.items():
  if i<numWords+1: # we are taking only the first 9000 words
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
import optuna

class CNNModel:

    def objective(self, trial):
        # Define the search space for hyperparameters
        learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True)
        dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)
        num_filters = trial.suggest_int('num_filters', 128, 512)
        filters_size = trial.suggest_int('filters_size', 1, 30)
        pool_size = trial.suggest_int('pool_size', 1, 30)
        num_dense_units = trial.suggest_int('num_dense_units', 32, 512)

        # Create the Keras model with the specified hyperparameters       
        input_layer = Input(shape=(maxlen,), dtype=tf.int64)
        embed = Embedding(numWords+1, output_dim=100, input_length=maxlen, weights=[embedding_matrix], trainable=True)(input_layer)
        conv = Conv1D(num_filters, filters_size, activation='relu')(embed)
        max_pool = MaxPooling1D(pool_size=pool_size)(conv)
        flatten = Flatten()(max_pool)
        dense = Dense(num_dense_units, activation='relu')(flatten)
        drop=Dropout(dropout_rate)(dense)    
        out = Dense(len((pd.Series(y_train)).unique()), activation='softmax')(drop)
         
        model = Model(input_layer, out)
            
        model.compile(loss='sparse_categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), metrics=['accuracy'])
  
        model.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_Val, y_Val), verbose=0)
        val_accuracy = model.evaluate(X_Val, y_Val, verbose=0)[1]
        
        return val_accuracy
   
    def train(self, batch_size, epochs):
        # Create an Optuna study and run the hyperparameter search
        study = optuna.create_study(direction='maximize')
        study.optimize(self.objective, n_trials=30)

        # Get the best hyperparameters and train the final model
        best_params = study.best_params
        best_learning_rate = best_params['learning_rate']
        best_dropout_rate = best_params['dropout_rate']
        best_num_filters = best_params['num_filters']
        best_filters_size = best_params['filters_size']
        best_pool_size = best_params['pool_size']
        best_num_dense_units = best_params['num_dense_units']

        input_layer = Input(shape=(maxlen,), dtype=tf.int64)
        embed = Embedding(numWords+1, output_dim=100, input_length=maxlen, weights=[embedding_matrix], trainable=True)(input_layer)
        conv = Conv1D(best_num_filters, best_filters_size, activation='relu')(embed)
        max_pool = MaxPooling1D(pool_size=best_pool_size)(conv)
        flatten = Flatten()(max_pool)
        dense = Dense(best_num_dense_units, activation='relu')(flatten)
        drop=Dropout(best_dropout_rate)(dense)    
        out = Dense(len((pd.Series(y_train)).unique()), activation='softmax')(drop)
        
        model = Model(input_layer, out)
         
        model.compile(loss='sparse_categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=best_learning_rate), metrics=['accuracy'])
        model.summary()
   
        # reduceLoss = ReduceLROnPlateau(monitor='val_loss', factor=0.8, patience=1, min_lr=0.0001)
        reduceLoss = ReduceLROnPlateau(monitor='val_loss', factor=0.8, patience=1)
   
        model_history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, callbacks=[reduceLoss], validation_data=(X_Val,y_Val))

        return model_history, model

In [26]:
class Prediction:
    
    def prediction(self, model, X_test, y_test):
      
      pred = model.predict(X_test)
      df_pred = pd.DataFrame(pred, columns=['tech1', 'tech2', 'tech3', 'tech4', 'tech5', 'tech6', 'tech7', 'tech8'])
      pred = [i.argmax() for i in pred]
      
      accuracy = metrics.accuracy_score(y_test, pred)
      precision = metrics.precision_score(y_test, pred, average='weighted')
      recall = metrics.recall_score(y_test, pred, average='weighted')
      f1score = metrics.f1_score(y_test, pred, average='weighted')
      
      print("Precision of Convolutional Neural Network model: ", precision)
      print("Recall of Convolutional Neural Network model: ", recall)
      print("F1-Score of Convolutional Neural Network model: ", f1score)
      print("Accuracy of Convolutional Neural Network model:", accuracy)
      
      print(classification_report(y_test, pred))
      
      return df_pred

In [27]:
CNNModel = CNNModel()
model_history, model = CNNModel.train(X_train, y_train, X_Val, y_Val, embedding_matrix, batch_size, epochs)

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 128)]             0         
                                                                 
 embedding (Embedding)       (None, 128, 100)          1000100   
                                                                 
 bidirectional (Bidirectiona  (None, 256)              234496    
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense (Dense)               (None, 100)               25700     
                                                                 
 dense_1 (Dense)             (None, 8)                 808       
                                                           

In [28]:
# test model on unseen data and get prediction array
Prediction = Prediction()
all_predictions = Prediction.prediction(model, X_test, y_test)

Precision of Bi-Directional Long Short Term Memory model:  0.9292207646425427
Recall of Bi-Directional Long Short Term Memory model:  0.9302707136997539
F1-Score of Bi-Directional Long Short Term Memory model:  0.9261571231290466
Accuracy of Bi-Directional Long Short Term Memory model: 0.9302707136997539
              precision    recall  f1-score   support

           0       0.97      0.98      0.98       798
           1       0.96      0.90      0.92        48
           2       0.80      0.87      0.84        38
           3       0.84      0.81      0.83        47
           4       1.00      1.00      1.00        59
           5       0.87      0.80      0.84        41
           6       0.79      0.94      0.86       133
           7       0.75      0.38      0.51        55

    accuracy                           0.93      1219
   macro avg       0.87      0.83      0.85      1219
weighted avg       0.93      0.93      0.93      1219



In [32]:
model.save('CNN_model_BERT_OPTUNA.h5')

In [None]:
all_predictions.to_csv('CNN_predictions.csv', index=False)