-
Notifications
You must be signed in to change notification settings - Fork 0
/
NLP-Preprocessing.py
67 lines (51 loc) · 1.31 KB
/
NLP-Preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
'''
Description: Standalone script for text pre-processing in NLP.
'''
import re
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
# Regular-expression patterns used by the normalisation steps in this script.
# All are raw strings so that backslash classes such as \s reach the regex
# engine untouched instead of being parsed as (invalid) string escapes.
html_tag = r'<[^<>]+>'               # '<', one or more non-angle-bracket chars, '>'
number_tag = r'[0-9]+'               # one or more consecutive digits
url_tag = r'(http|https)://[^\s]*'   # http:// or https:// up to the next whitespace
emailaddr_tag = r'[^\s]+@[^\s]+'     # whitespace-free run with an '@' inside it
currency_tag = r'[$]+'               # one or more consecutive '$' characters
alphanumeric_tag = r'[^a-zA-Z0-9]'   # any single character that is not an ASCII letter or digit
# Read the raw e-mail text. The context manager closes the file even if
# reading raises.
with open('emailSample1.txt', 'r') as f:
    email = f.read()

# Lower-case everything so the later steps see a single casing.
email = email.lower()

# Strip all HTML: replace anything that starts with '<', ends with '>' and
# has no '<' or '>' in between with a single space.
email = re.sub(html_tag, ' ', email)

# Handle numbers: every run of digits becomes the token 'number'.
# NOTE: this runs before the URL and e-mail steps, so digits inside URLs or
# addresses are rewritten first; those later patterns still match the result.
email = re.sub(number_tag, 'number', email)

# Handle URLs: anything starting with http:// or https:// up to the next
# whitespace becomes 'httpaddr'.
email = re.sub(url_tag, 'httpaddr', email)

# Handle e-mail addresses: any whitespace-free run with an '@' in the middle
# becomes 'emailaddr'.
email = re.sub(emailaddr_tag, 'emailaddr', email)

# Handle the $ sign: every run of '$' becomes 'dollar'.
email = re.sub(currency_tag, 'dollar', email)

# Split into tokens made of word characters (\w+). Punctuation and whitespace
# are dropped here; underscores are kept because \w includes them.
tokenizer = RegexpTokenizer(r'\w+')
email = tokenizer.tokenize(email)

# Reduce each token to its Porter stem. The stems are kept as str: encoding
# them to bytes would make ' '.join() below raise TypeError on Python 3.
ps = PorterStemmer()
email = [ps.stem(word) for word in email]

# Emit the processed e-mail as one space-separated line.
print(' '.join(email))