In [17]:
#importing necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
plt.style.use('fivethirtyeight')

from tqdm import tqdm
import time

import string
import nltk
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from wordcloud import WordCloud, STOPWORDS
import re
from nltk.tokenize import WordPunctTokenizer

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
nltk.download('wordnet') 
from nltk.stem.wordnet import WordNetLemmatizer
import pickle  

#Importing Sklearn function for splitting dataset for training and testing
from sklearn.model_selection  import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import OneHotEncoder
from sklearn.manifold import TSNE

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Amreen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


**Loading the Dataset**

In [6]:
data = pd.read_csv("Input Data Synthetic .csv")

In [7]:
#Have a look at first five rows in dataset
data.head()

Unnamed: 0,Short description,Description,Caller,Assignment group
0,login issue,-verified user details.(employee# & manager na...,spxjnwir pjlcoqds,GRP_0
1,outlook,\r\n\r\nreceived from: hmjdrvpb.komuaywn@gmail...,hmjdrvpb komuaywn,GRP_0
2,cant log in to vpn,\r\n\r\nreceived from: eylqgodm.ybqkwiam@gmail...,eylqgodm ybqkwiam,GRP_0
3,unable to access hr_tool page,unable to access hr_tool page,xbkucsvz gcpydteq,GRP_0
4,skype error,skype error,owlgqjme qhcozdfx,GRP_0


In [8]:
#number of rows and columns
data.shape

(8500, 4)

**Description - There are 8500 observations and 4 features**

In [6]:
#Datatypes of each attribute
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8500 entries, 0 to 8499
Data columns (total 4 columns):
Short description    8498 non-null object
Description          8499 non-null object
Caller               8500 non-null object
Assignment group     8500 non-null object
dtypes: object(4)
memory usage: 265.7+ KB


**Description -All the 4 attributes are of object type**

In [7]:
data.isnull().sum()

Short description    2
Description          1
Caller               0
Assignment group     0
dtype: int64

**Description - There are null values in 'Short description' and 'Description' columns.**

In [8]:
#Dropping the column Caller as it is insignificant for our model
df = data.drop(['Caller'],axis = 1)

In [9]:
#df.head(10)

In [10]:
df['Assignment group'].unique()

array(['GRP_0', 'GRP_1', 'GRP_3', 'GRP_4', 'GRP_5', 'GRP_6', 'GRP_7',
       'GRP_8', 'GRP_9', 'GRP_10', 'GRP_11', 'GRP_12', 'GRP_13', 'GRP_14',
       'GRP_15', 'GRP_16', 'GRP_17', 'GRP_18', 'GRP_19', 'GRP_2',
       'GRP_20', 'GRP_21', 'GRP_22', 'GRP_23', 'GRP_24', 'GRP_25',
       'GRP_26', 'GRP_27', 'GRP_28', 'GRP_29', 'GRP_30', 'GRP_31',
       'GRP_33', 'GRP_34', 'GRP_35', 'GRP_36', 'GRP_37', 'GRP_38',
       'GRP_39', 'GRP_40', 'GRP_41', 'GRP_42', 'GRP_43', 'GRP_44',
       'GRP_45', 'GRP_46', 'GRP_47', 'GRP_48', 'GRP_49', 'GRP_50',
       'GRP_51', 'GRP_52', 'GRP_53', 'GRP_54', 'GRP_55', 'GRP_56',
       'GRP_57', 'GRP_58', 'GRP_59', 'GRP_60', 'GRP_61', 'GRP_32',
       'GRP_62', 'GRP_63', 'GRP_64', 'GRP_65', 'GRP_66', 'GRP_67',
       'GRP_68', 'GRP_69', 'GRP_70', 'GRP_71', 'GRP_72', 'GRP_73'],
      dtype=object)

In [11]:
#Fetch wordcount for each entry in short description
df['word_count_short_dec'] = df['Short description'].apply(lambda x: len(str(x).split(" ")))
df[['Short description','word_count_short_dec']].head()

Unnamed: 0,Short description,word_count_short_dec
0,login issue,2
1,outlook,1
2,cant log in to vpn,5
3,unable to access hr_tool page,5
4,skype error,3


In [12]:
##Descriptive statistics of word counts
df.word_count_short_dec.describe()

count    8500.000000
mean        7.311765
std         4.325907
min         1.000000
25%         4.000000
50%         6.000000
75%        10.000000
max        41.000000
Name: word_count_short_dec, dtype: float64

**Description :
The average word count is about 7 words per short description. The word count ranges from a minimum of 1 to a maximum of 41. The word count is important to give us an indication of the size of the dataset that we are handling as well as the variation in word counts across the rows.**

In [13]:
#maxlen1 = 41

In [14]:
#Fetch wordcount for each entry in short description
df['word_count_description'] = df['Description'].apply(lambda x: len(str(x).split(" ")))
df[['Description','word_count_description']].head()

Unnamed: 0,Description,word_count_description
0,-verified user details.(employee# & manager na...,29
1,\r\n\r\nreceived from: hmjdrvpb.komuaywn@gmail...,23
2,\r\n\r\nreceived from: eylqgodm.ybqkwiam@gmail...,9
3,unable to access hr_tool page,5
4,skype error,3


In [15]:
df.word_count_description.describe()

count    8500.000000
mean       28.877294
std        69.234356
min         1.000000
25%         6.000000
50%        11.000000
75%        26.000000
max      1417.000000
Name: word_count_description, dtype: float64

**Description: The average word count is about 29 words per description. The word count ranges from a minimum of 1 to a maximum of 1417.**

In [16]:
maxlen = 1417

In [17]:
##Creating a list of stop words 
stop_words = set(stopwords.words("english"))



In [18]:
corpus = []
for i in range(0, 8500):
    #Remove punctuations
    text = re.sub('[^a-zA-Z]', ' ', str(df['Description'][i]))
    text = re.sub(r"[!\"#$%&\'()*+,-./:;<=>?@^_`{|}~\[\]]"," ",text) # removing all punctuations
    
    #Convert to lowercase
    text = text.lower()
    
    #remove tags
    text=re.sub("&lt;/?.:-_*?&gt;@"," &lt;&gt; ",text)
    
    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    
    text = re.sub(r"[0-9]","",text) # removing numbers
    
    ##Convert to list from string
    text = text.split()
    

    #Lemmatisation
    lem = WordNetLemmatizer()
    text = [lem.lemmatize(word) for word in text if not word in  
            stop_words] 
    text = " ".join(text)
    corpus.append(text)

In [19]:
corpus[2]

'received eylqgodm ybqkwiam gmail com hi cannot log vpn best'

In [20]:
corpus2 = []
for i in range(0, 8500):
    #Remove punctuations
    text = re.sub('[^a-zA-Z]', ' ', str(df['Short description'][i]))
    #text = re.sub(r"[!\"#$%&\'()*+,-./:;<=>?@^_`{|}~\[\]]"," ",text) # removing all punctuations
    
    #Convert to lowercase
    text = text.lower()
    
    #remove tags
    text=re.sub("&lt;/?.:-_*?&gt;@"," &lt;&gt; ",text)
    
    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    
    text = re.sub(r"[0-9]","",text) # removing numbers
    
    ###Convert to list from string
    #text = text.split()
    corpus2.append(text)

In [21]:
corpus2[2]

'cant log in to vpn'

**Dropping the rows conatining missing values**

In [22]:
data = data.dropna(axis=0, how='any',inplace=False)
print(data.isnull().sum())

Short description    0
Description          0
Caller               0
Assignment group     0
dtype: int64


In [23]:
df_cleaned_text = pd.DataFrame(df)

In [24]:
df_cleaned_text.head()

Unnamed: 0,Short description,Description,Assignment group,word_count_short_dec,word_count_description
0,login issue,-verified user details.(employee# & manager na...,GRP_0,2,29
1,outlook,\r\n\r\nreceived from: hmjdrvpb.komuaywn@gmail...,GRP_0,1,23
2,cant log in to vpn,\r\n\r\nreceived from: eylqgodm.ybqkwiam@gmail...,GRP_0,5,9
3,unable to access hr_tool page,unable to access hr_tool page,GRP_0,5,5
4,skype error,skype error,GRP_0,3,3


In [25]:
df_cleaned_text['Short_Descrption'] = corpus2

In [26]:
df_cleaned_text['DESCRIPTION'] = corpus

In [27]:
df_final = df_cleaned_text.drop(['Short description','Description','word_count_description',
                                        'word_count_short_dec'], axis =1)

In [28]:
df_final.rename(columns = {'Assignment group':'Target'},inplace = True)
 

In [29]:
df_final.head()

Unnamed: 0,Target,Short_Descrption,DESCRIPTION
0,GRP_0,login issue,verified user detail employee manager name che...
1,GRP_0,outlook,received hmjdrvpb komuaywn gmail com hello tea...
2,GRP_0,cant log in to vpn,received eylqgodm ybqkwiam gmail com hi cannot...
3,GRP_0,unable to access hr tool page,unable access hr tool page
4,GRP_0,skype error,skype error


In [31]:
df_final['Description'] =  df_final['Short_Descrption'].str.cat(df_final['DESCRIPTION'], sep =" ") 
df_final.head()


Unnamed: 0,Target,Short_Descrption,DESCRIPTION,Description
0,GRP_0,login issue,verified user detail employee manager name che...,login issue verified user detail employee mana...
1,GRP_0,outlook,received hmjdrvpb komuaywn gmail com hello tea...,outlook received hmjdrvpb komuaywn gmail com h...
2,GRP_0,cant log in to vpn,received eylqgodm ybqkwiam gmail com hi cannot...,cant log in to vpn received eylqgodm ybqkwiam ...
3,GRP_0,unable to access hr tool page,unable access hr tool page,unable to access hr tool page unable access hr...
4,GRP_0,skype error,skype error,skype error skype error


In [32]:
df_final = df_final.drop(['Short_Descrption','DESCRIPTION'],axis = 1)
df_final.head(5)

Unnamed: 0,Target,Description
0,GRP_0,login issue verified user detail employee mana...
1,GRP_0,outlook received hmjdrvpb komuaywn gmail com h...
2,GRP_0,cant log in to vpn received eylqgodm ybqkwiam ...
3,GRP_0,unable to access hr tool page unable access hr...
4,GRP_0,skype error skype error


In [34]:
df_final.word_count2.describe()

count    8500.000000
mean       27.496118
std        48.221067
min         2.000000
25%        11.000000
50%        18.000000
75%        29.000000
max      1266.000000
Name: word_count2, dtype: float64

In [35]:
maxlen = 1266

In [36]:
from nltk.probability import FreqDist
text = str(df_final['Description'])
fdist = FreqDist()
for word in word_tokenize(text):
    fdist[word.lower()] += 1
fdist.most_common(50)

[('...', 45),
 ('to', 29),
 ('unable', 23),
 ('tool', 11),
 ('ticket', 11),
 ('received', 10),
 ('vpn', 10),
 ('hr', 10),
 ('update', 9),
 ('erp', 8),
 ('account', 8),
 ('login', 7),
 ('user', 7),
 ('on', 6),
 ('not', 6),
 ('sid', 6),
 ('locked', 6),
 ('in', 5),
 ('issue', 4),
 ('outlook', 4),
 ('log', 4),
 ('access', 4),
 ('hostname', 4),
 ('inplant', 4),
 ('reset', 4),
 ('password', 4),
 ('pc', 4),
 ('check', 4),
 ('software', 4),
 ('employee', 3),
 ('gmail', 3),
 ('com', 3),
 ('skype', 3),
 ('and', 3),
 ('payslips', 3),
 ('connect', 3),
 ('for', 3),
 ('please', 3),
 ('etime', 3),
 ('telephony', 3),
 (':', 3),
 ('page', 2),
 ('error', 2),
 ('engineering', 2),
 ('company', 2),
 ('the', 2),
 ('no', 2),
 ('status', 2),
 ('open', 2),
 ('payslip', 2)]

In [37]:
# Import label encoder 
from sklearn import preprocessing 
  
# label_encoder object knows how to understand word labels. 
label_encoder = preprocessing.LabelEncoder() 
  
# Encode labels in column 'Target'. 
df_final['Target']= label_encoder.fit_transform(df_final['Target']) 
  
df_final['Target'].unique() 

array([ 0,  1, 23, 34, 45, 56, 67, 72, 73,  2,  3,  4,  5,  6,  7,  8,  9,
       10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 25, 27, 28,
       29, 30, 31, 32, 33, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47,
       48, 49, 50, 51, 52, 53, 54, 55, 57, 58, 26, 59, 60, 61, 62, 63, 64,
       65, 66, 68, 69, 70, 71], dtype=int64)

In [40]:

maxlen = 1266
tokenizer = Tokenizer(num_words=maxlen, split=' ')
tokenizer.fit_on_texts(df_final['Description'].values)
X = tokenizer.texts_to_sequences(df_final['Description'].values)
X = pad_sequences(X)
Y = np.array(df_final['Target']).reshape(-1,1)
print("Number of Samples:", len(X))

print("Number of Labels: ", len(Y))

Number of Samples: 8500
Number of Labels:  8500


In [41]:
# creating one hot encoder object by default 
# entire data passed is one hot encoded 
onehotencoder = OneHotEncoder() 

Y = onehotencoder.fit_transform(Y).toarray()


In [42]:
#Splitting training and testing data
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.15, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(7225, 986) (7225, 74)
(1275, 986) (1275, 74)


 **Weight matrix using GloVe embeddings**

In [16]:
f = 'glove.6B.200d.txt'
#open('glove.6B.100d.txt')
embedd_index = {}
for line in f:
    val = line.split()
    word = val[0]
    coff = np.asarray(val[1:],dtype = 'float')
    embedd_index[word] = coff

#f.close()
print('Found %s word vectors.' % len(embedd_index))

Found 13 word vectors.


In [12]:

#embedd_index['good']

In [None]:

embedding_matrix = np.zeros((len(index_of_words) + 1, embed_num_dims))

tokens = []
labels = []

for word,i in index_of_words.items():
    temp = embedd_index.get(word)
    if temp is not None:
        embedding_matrix[i] = temp
        
#for plotting
        tokens.append(embedding_matrix[i])
        labels.append(word

In [None]:
embedding_matrix.shape

In [None]:
#TSNE algorithm used to visualize word embeddings having huge amount (100) dimensions

def tsne():
    
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens[:200])
    print(new_values.shape)
    
    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])
        
    plt.figure(figsize=(16,16)) 
    for i in range(len(x)):
        plt.scatter(x[i],y[i])
        plt.annotate(labels[i],
                     xy=(x[i], y[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()

    
    
    
    
tsne()