-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocessor.py
57 lines (46 loc) · 1.74 KB
/
preprocessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import string
import nltk
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.corpus import stopwords
# Fetch the NLTK "stopwords" corpus when this module is imported;
# Preprocessor.get_english_stopwords reads it via stopwords.words("english").
# NOTE(review): sent_tokenize/word_tokenize presumably need their own NLTK
# tokenizer data (e.g. "punkt"), which is not downloaded here - confirm.
nltk.download('stopwords')
class Preprocessor:
    """Lazily tokenize and clean a piece of text.

    Every ``get_*`` method computes its result on first use and stores it
    on the instance under a public attribute (``sentences``, ``words``,
    ``english_stopwords``, ``punctuations``, ``clean_words``).  Later calls
    return that stored object unchanged, so an attribute assigned in
    advance is used instead of being recomputed.
    """

    def __init__(self, original_text):
        """Store *original_text*; nothing is tokenized until a getter runs."""
        self.original_text = original_text

    def get_sentences(self):
        """Return the sentences of the text as produced by ``sent_tokenize``.

        The result is cached in ``self.sentences``.
        """
        if not hasattr(self, "sentences"):
            self.sentences = sent_tokenize(self.original_text)
        return self.sentences

    def get_words(self):
        """Return one ``word_tokenize`` token list per sentence.

        The outer list follows the order of ``get_sentences()``.  The
        result is cached in ``self.words``.
        """
        if not hasattr(self, "words"):
            self.words = [
                word_tokenize(sentence) for sentence in self.get_sentences()
            ]
        return self.words

    def get_english_stopwords(self):
        """Return ``stopwords.words("english")``, cached in ``self.english_stopwords``."""
        if not hasattr(self, "english_stopwords"):
            self.english_stopwords = stopwords.words("english")
        return self.english_stopwords

    def get_english_punctuations(self):
        """Return the characters of ``string.punctuation`` as a list.

        The result is cached in ``self.punctuations``.
        """
        if not hasattr(self, "punctuations"):
            self.punctuations = list(string.punctuation)
        return self.punctuations

    def get_clean_words(self):
        """Return the lower-cased tokens of each sentence, filtered.

        A token is dropped when its lower-cased form equals an entry of
        ``get_english_stopwords()`` or of ``get_english_punctuations()``.
        Because the punctuation entries are single characters, a
        multi-character token such as ``"..."`` is kept.  Sentences that
        end up with no tokens stay in the result as empty lists.  The
        result is cached in ``self.clean_words``.
        """
        if not hasattr(self, "clean_words"):
            tokenized_sentences = self.get_words()
            # Merge both filters into one set: a single constant-time lookup
            # per token instead of scanning two lists for every word.
            excluded = set(self.get_english_stopwords())
            excluded.update(self.get_english_punctuations())
            self.clean_words = [
                [
                    lowered
                    for lowered in (token.lower() for token in sentence_tokens)
                    if lowered not in excluded
                ]
                for sentence_tokens in tokenized_sentences
            ]
        return self.clean_words