# Import necessary packages

In [236]:
from sklearn import preprocessing
import numpy as np
import pandas as pd

In [237]:
def removeString(data, regex):
    '''
    Lower-case a string Series and replace every match of a regex with a space.

    Parameters
    ----------
    data : pandas.Series
        Text column to clean; it is processed through the ``.str`` accessor.
    regex : str
        Regular-expression pattern. It is lower-cased before use so that it
        lines up with the lower-cased text.

    Returns
    -------
    pandas.Series
        A new Series holding the lower-cased text with each match replaced
        by a single space.
    '''
    # NOTE: lower-casing the pattern also lower-cases escape classes
    # (e.g. \S would become \s), so patterns should avoid upper-case escapes.
    #
    # regex=True is passed explicitly: pandas 2.0 changed the default of
    # Series.str.replace to literal matching, which would silently turn every
    # pattern into a no-op.
    return data.str.lower().str.replace(regex.lower(), ' ', regex=True)

In [238]:
def cleanDataset(dataset, columnsToClean, regexList):
    '''
    Apply every regex in ``regexList`` to every column in ``columnsToClean``.

    Each (column, regex) pair is processed in order: all patterns for the
    first column, then all patterns for the next one, and so on. The cleaned
    column is assigned back into ``dataset`` itself, so the frame passed in is
    modified; the same object is returned for convenience.
    '''
    # Flatten the column x pattern combinations, column-major, before applying them
    steps = [(name, pattern) for name in columnsToClean for pattern in regexList]
    for name, pattern in steps:
        dataset[name] = removeString(dataset[name], pattern)
    return dataset

In [239]:
def getRegexList():
    '''
    Return the ordered list of regex patterns used to flush unnecessary text
    (mail headers, links, addresses, digits, punctuation, ...) from the dataset.

    The order is significant when the patterns are applied one after another:
    each pattern then sees text already processed by the earlier ones.

    Patterns that contain carriage-return / line-feed are written as plain
    (non-raw) strings so the list holds the actual CR/LF characters; all other
    patterns with backslashes are raw strings so Python does not warn about
    invalid escape sequences.
    '''
    return [
        'From:(.*)\r\n',  # from line
        'Sent:(.*)\r\n',  # sent to line
        'received from:(.*)\r\n',  # received data line
        'received',  # any remaining "received"
        'To:(.*)\r\n',  # to line
        'CC:(.*)\r\n',  # cc line
        '(.*)infection',  # footer
        r'\[cid:(.*)]',  # images cid
        'https?:[^\\]\n\r]+',  # https & http (backslash escaped explicitly, real LF/CR kept)
        'Subject:',
        r'[\w\d\-\_\.]+@[\w\d\-\_\.]+',  # emails are not required
        r'[0-9][\-0–90-9 ]+',  # phones are not required
        '[0-9]',  # numbers not needed
        # anything that is not a letter, digit or space
        # (range fixed from the typo A-z, which also spans [ \ ] ^ _ and backtick)
        '[^a-zA-Z 0-9]+',
        '[\r\n]',  # \r\n
        ' [a-zA-Z] ',  # single letters make no sense
        ' [a-zA-Z][a-zA-Z] ',  # two-letter words make no sense
        "  ",  # double spaces

        r'^[_a-z0-9-]+(\.[_a-z0-9-]+)*@[a-z0-9-]+(\.[a-z0-9-]+)*(\.[a-z]{2,4})$',  # whole-string email
        r'[\w\d\-\_\.]+ @ [\w\d\-\_\.]+',  # emails written with spaces around the @
        'Subject:',
        '[^a-zA-Z]',  # final pass: anything that is not a letter
    ]

# Reading and understanding the structure of the dataset

In [240]:
# Load the raw dataset from the Excel workbook into a DataFrame
# (the path is relative to the working directory)
dfIncidents = pd.read_excel('./datasets/input_data.xlsx')

In [241]:
# Inspect the (rows, columns) shape to get a first sense of the dataset's size
dfIncidents.shape

(8500, 4)

In [242]:
# Rename 'Short description' to the shorter, clearer 'Title'.
# The result is rebound instead of using inplace=True, which pandas discourages
# (no performance benefit, and it prevents method chaining).
dfIncidents = dfIncidents.rename(columns={'Short description': 'Title'})

In [243]:
# Preview the first 10 rows after the column rename
dfIncidents.head(10)

Unnamed: 0,Title,Description,Caller,Assignment group
0,login issue,-verified user details.(employee# & manager na...,spxjnwir pjlcoqds,GRP_0
1,outlook,\r\n\r\nreceived from: hmjdrvpb.komuaywn@gmail...,hmjdrvpb komuaywn,GRP_0
2,cant log in to vpn,\r\n\r\nreceived from: eylqgodm.ybqkwiam@gmail...,eylqgodm ybqkwiam,GRP_0
3,unable to access hr_tool page,unable to access hr_tool page,xbkucsvz gcpydteq,GRP_0
4,skype error,skype error,owlgqjme qhcozdfx,GRP_0
5,unable to log in to engineering tool and skype,unable to log in to engineering tool and skype,eflahbxn ltdgrvkz,GRP_0
6,event: critical:HostName_221.company.com the v...,event: critical:HostName_221.company.com the v...,jyoqwxhz clhxsoqy,GRP_1
7,ticket_no1550391- employment status - new non-...,ticket_no1550391- employment status - new non-...,eqzibjhw ymebpoih,GRP_0
8,unable to disable add ins on outlook,unable to disable add ins on outlook,mdbegvct dbvichlg,GRP_0
9,ticket update on inplant_874773,ticket update on inplant_874773,fumkcsji sarmtlhy,GRP_0


# Remove text with Regex

In [244]:
# Select columns for cleaning
columnsToClean = ['Title', 'Description']

# Build the regex list and clean the selected columns to remove sensitive data.
# The cleaning runs on a copy: cleanDataset assigns into the frame it receives,
# so passing dfIncidents directly would overwrite the raw text and leave
# clean_df_tickets as a mere alias of dfIncidents.
clean_df_tickets = cleanDataset(dfIncidents.copy(), columnsToClean, getRegexList())

In [245]:
# Preview the first 10 cleaned rows to spot-check the regex clean-up
clean_df_tickets.head(10)

Unnamed: 0,Title,Description,Caller,Assignment group
0,login issue,verified user details employee manager name ...,spxjnwir pjlcoqds,GRP_0
1,outlook,hello team meetings skype meetings etc are ...,hmjdrvpb komuaywn,GRP_0
2,cant log to vpn,cannot log to vpn best,eylqgodm ybqkwiam,GRP_0
3,unable access tool page,unable access tool page,xbkucsvz gcpydteq,GRP_0
4,skype error,skype error,owlgqjme qhcozdfx,GRP_0
5,unable log to engineering tool and skype,unable log to engineering tool and skype,eflahbxn ltdgrvkz,GRP_0
6,event critical hostname company com the value...,event critical hostname company com the value...,jyoqwxhz clhxsoqy,GRP_1
7,ticket employment status new non employee ent...,ticket employment status new non employee ent...,eqzibjhw ymebpoih,GRP_0
8,unable disable add ins outlook,unable disable add ins outlook,mdbegvct dbvichlg,GRP_0
9,ticket update inplant,ticket update inplant,fumkcsji sarmtlhy,GRP_0


In [246]:
# Print the shape of the cleaned frame for comparison with the raw shape shown earlier
print(clean_df_tickets.shape)

(8500, 4)


In [247]:
# Display the cleaned frame
clean_df_tickets

Unnamed: 0,Title,Description,Caller,Assignment group
0,login issue,verified user details employee manager name ...,spxjnwir pjlcoqds,GRP_0
1,outlook,hello team meetings skype meetings etc are ...,hmjdrvpb komuaywn,GRP_0
2,cant log to vpn,cannot log to vpn best,eylqgodm ybqkwiam,GRP_0
3,unable access tool page,unable access tool page,xbkucsvz gcpydteq,GRP_0
4,skype error,skype error,owlgqjme qhcozdfx,GRP_0
...,...,...,...,...
8495,emails not coming from mail,good afternoon not receiving the emails tha...,avglmrts vhqmtiua,GRP_29
8496,telephony software issue,telephony software issue,rbozivdq gmlhrtvp,GRP_0
8497,vip windows password reset for tifpdchb pedxruyf,vip windows password reset for tifpdchb pedxruyf,oybwdsgx oxyhwrfz,GRP_0
8498,machine o est funcionando,i unable access the machine utilities finish t...,ufawcgob aowhxjky,GRP_62


# Saving the cleansed data for further EDA

In [249]:

# Wrap the cleaned tickets in a DataFrame and write them out as CSV,
# leaving the row index out of the file
dfx = pd.DataFrame(clean_df_tickets)
dfx.to_csv(
    "./datasets/pre-processed-data_v1.csv",
    index_label=False,
    index=False,
)