-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess.py
76 lines (57 loc) · 2.12 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# Text preprocessing utilities: cleaning, tokenization, stopword removal and lemmatization
import re
import html
import unicodedata
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Download the NLTK resources used by this module (runs at import time)
for _nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_nltk_resource)

# English stopwords, kept in a set for membership tests
stop_words = set(stopwords.words('english'))

# Start with an empty preprocessed transcript
preprocessed_transcript = ""
# Preprocessing functions
def remove_special_chars(text):
    """Lowercase *text*, repair escape/markup artifacts and collapse spaces.

    The text is lowercased, a fixed list of known artifacts is rewritten
    (HTML entity bodies such as ``amp;`` or ``#39;``, literal ``\\n``
    sequences, ``<br />`` tags, ``<unk>`` markers, `` @.@ `` / `` @-@ ``
    separators, backslashes), any remaining HTML entities are decoded with
    ``html.unescape``, and runs of spaces are squeezed to a single space.

    Entity fix-ups match both the well-formed form (``&amp;``) and the form
    that has lost its leading ampersand (``amp;``), so a well-formed entity
    no longer leaves a stray ``&`` behind.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lowercased string. Note that ``quot;`` is mapped to a
        single quote (``'``), not a double quote.
    """
    # (token, replacement, is_entity) applied strictly in this order; the
    # bare-backslash rule must stay last so it cannot break up '\\n' or '\\"'.
    fixups = (
        ('#39;', "'", True),
        ('amp;', '&', True),
        ('#146;', "'", True),
        ('nbsp;', ' ', True),
        ('#36;', '$', True),
        ('\\n', "\n", False),
        ('quot;', "'", True),
        ('<br />', "\n", False),
        ('\\"', '"', False),
        ('<unk>', 'u_n', False),
        (' @.@ ', '.', False),
        (' @-@ ', '-', False),
        ('\\', ' \\ ', False),
    )

    cleaned = text.lower()
    for token, replacement, is_entity in fixups:
        if is_entity:
            # Consume the leading '&' of a well-formed entity first so it
            # is not left dangling in front of the replacement.
            cleaned = cleaned.replace('&' + token, replacement)
        cleaned = cleaned.replace(token, replacement)

    # Decode whatever entities the table above does not cover, then squeeze
    # runs of spaces (only the space character, not other whitespace).
    return re.sub(r' +', ' ', html.unescape(cleaned))
def remove_non_ascii(text):
    """Return *text* with every non-ASCII character dropped.

    The text is NFKD-normalized first, so a character that decomposes into
    an ASCII base plus combining marks keeps its ASCII base.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    # Characters that cannot be encoded as ASCII are silently discarded.
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')
def to_lowercase(text):
    """Return a lowercased copy of *text*."""
    lowered = text.lower()
    return lowered
def remove_punctuation(text):
    """Return *text* with every character in ``string.punctuation`` deleted.

    Only the ASCII punctuation listed in ``string.punctuation`` is removed;
    all other characters are kept in their original order.
    """
    punctuation_chars = frozenset(string.punctuation)
    return ''.join(char for char in text if char not in punctuation_chars)
def replace_numbers(text):
    """Return *text* with every run of digits removed.

    Despite the name, digits are deleted rather than substituted; the
    surrounding characters (including spaces) are left untouched.
    """
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)
def remove_whitespaces(text):
    """Return *text* without leading or trailing whitespace.

    Whitespace inside the string is left as it is.
    """
    trimmed = text.strip()
    return trimmed
def text2words(text):
    """Tokenize *text* with NLTK's ``word_tokenize`` and return the result."""
    tokens = word_tokenize(text)
    return tokens
# Lemmatization function
def lemmatize_words(words):
    """Return a list with each item of *words* lemmatized, in order.

    A new ``WordNetLemmatizer`` is created on every call and applied with
    its default arguments.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return list(map(lemmatize, words))
# Function to remove stopwords and lemmatize
def preprocess_words(words):
    """Drop stopwords from *words*, then lemmatize what remains.

    Membership is tested against the module-level ``stop_words`` set; the
    surviving words are passed to ``lemmatize_words`` in their original order.
    """
    kept = []
    for word in words:
        if word in stop_words:
            continue
        kept.append(word)
    return lemmatize_words(kept)
# Define the normalization function
def normalize_text(text):
    """Run the full cleaning pipeline on *text* and return one string.

    The string-level steps are applied in a fixed order, the result is
    tokenized with ``text2words``, the tokens go through ``preprocess_words``,
    and the remaining words are joined with single spaces.
    """
    # Order matters: each step receives the output of the previous one.
    cleaning_steps = (
        remove_special_chars,
        remove_non_ascii,
        remove_punctuation,
        to_lowercase,
        replace_numbers,
        remove_whitespaces,
    )
    for step in cleaning_steps:
        text = step(text)

    tokens = text2words(text)
    return ' '.join(preprocess_words(tokens))