In [1]:
%%capture
# uncomment the line below and run this cell if nbformat is not installed
# %pip install nbformat

In [2]:
%%capture
# Execute the setup notebook inside this kernel; %%capture suppresses its output.
# NOTE(review): presumably this is where `df` (used further down) is defined — confirm.
%run 1-setup.ipynb

In [3]:
import nltk, re, string
from nltk.corpus import words
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet

In [4]:
# Lowercase the vocabulary once so that the lookup below is case-insensitive on
# both sides. The NLTK `words` corpus contains capitalized entries (e.g. proper
# nouns); without normalizing them, a lowercased query could never match those.
english_word_set = {entry.lower() for entry in words.words()}


def is_english_word(word):
    """Return True if `word` is in the NLTK `words` corpus, ignoring case.

    Parameters
    ----------
    word : str
        The candidate token.

    Returns
    -------
    bool
        True when the lowercased token is present in `english_word_set`.
    """
    return word.lower() in english_word_set

In [5]:
# Extra stop words, picked from the words that appeared in the final model
# (fillers, contractions that lost their apostrophe, short fragments).
_EXTRA_STOP_WORDS = (
    'hi', 'thanks', 'lot', 'dont', 'article', 'everyone', 'anyone', 'someone', 'nothing',
    'something', 'anything', 'everybody', 'somebody', 'anybody', 'please', 'ask', 'mo', 'mon', 'eh', 'da',
    'th', 'dude', 'sh', 'ra', 'li', 'ce', 'people', 'university', 'dod',
    'question', 'yeah', 'shouldnt', 'theyre', 'thing', 'theyll', 'didnt', 'sorry', 'hey',
    'oh', 'thats', 'thank', 'cannot', 'right',
)

# Names and surnames that appeared in the final models.
_NAME_STOP_WORDS = (
    'kent', 'sandvik', 'spencer', 'peter', 'henry', 'jack', 'frank', 'smith',
    'larry', 'marc', 'miller', 'mike', 'jeff', 'martin', 'bob', 'alan', 'rob',
)

# Translation table deleting every ASCII punctuation character
# (string.punctuation also contains all bracket characters and the underscore).
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

# Filled on the first call to preprocess() and reused afterwards.
_preprocess_resources = {}


def _get_preprocess_resources():
    """Return (tokenizer, lemmatizer, stop-word set), building them only once."""
    if not _preprocess_resources:
        # Build everything before touching the cache so a failure (for example a
        # missing NLTK corpus) does not leave it half-populated.
        stop_words = set(nltk.corpus.stopwords.words('english'))
        stop_words.update(_EXTRA_STOP_WORDS)
        stop_words.update(_NAME_STOP_WORDS)
        tokenizer = nltk.WordPunctTokenizer()
        lemmatizer = WordNetLemmatizer()
        _preprocess_resources['tokenizer'] = tokenizer
        _preprocess_resources['lemmatizer'] = lemmatizer
        _preprocess_resources['stop_words'] = frozenset(stop_words)
    return (
        _preprocess_resources['tokenizer'],
        _preprocess_resources['lemmatizer'],
        _preprocess_resources['stop_words'],
    )


def preprocess(doc):
    """Reduce a raw document to a space-separated string of lemmatized nouns.

    Steps, in order: strip email addresses, delete every character that is
    neither a word character nor whitespace, lowercase and trim, delete any
    remaining ASCII punctuation, keep only words accepted by
    `is_english_word`, tokenize, POS-tag, keep tokens tagged as nouns and
    lemmatize them as nouns, then drop stop words.

    Parameters
    ----------
    doc : str
        The raw document text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (possibly empty).
    """
    tokenizer, lemmatizer, stop_words = _get_preprocess_resources()

    # remove email addresses
    doc = re.sub(r'\b\S*@\S*\.\S*\b', '', doc)

    # delete every character that is neither a word character nor whitespace.
    # Digits and underscores are word characters and survive this step.
    # NOTE(review): characters are deleted, not replaced by a space, so
    # "don't" becomes "dont" and "a/b" becomes "ab".
    doc = re.sub(r'[^\w\s]', '', doc)

    # lowercase and strip
    doc = doc.lower().strip()

    # remove any remaining ASCII punctuation (in practice the underscore, which
    # \w kept above); brackets are covered by this table as well
    doc = doc.translate(_PUNCTUATION_TABLE)

    # retain only English words
    doc = ' '.join(word for word in doc.split() if is_english_word(word))

    # tokenize and determine the POS of each token
    pos_tags = pos_tag(tokenizer.tokenize(doc))

    # keep only nouns, lemmatized as nouns
    noun_lemmas = [
        lemmatizer.lemmatize(token, wordnet.NOUN)
        for token, pos in pos_tags
        if pos.startswith('N')
    ]

    # filter stop words out of the lemmas and recreate the document
    return ' '.join(token for token in noun_lemmas if token not in stop_words)

In [6]:
# Clean every document of the 'Content' column, one element at a time.
df_clean = df['Content'].map(preprocess)

In [7]:
# Spot-check a single cleaned document, selected by position.
df_clean.iloc[1000]

'surface organization research center canada surface topic information surface help surface plane cut cylinder cone plane course surface patch vanishing curvature ie matrix patch surface plane sphere earth way curvature look book geometry book author geometry year publisher note enjoy st canada'

In [12]:
# Select the cleaned documents that are empty once surrounding whitespace is ignored
empty_documents = df_clean[df_clean.str.strip().eq('')]

# How many documents ended up empty
num_empty_documents = len(empty_documents)

# Report the outcome; list the affected index labels when there are any
if num_empty_documents <= 0:
    print("No empty documents found.")
else:
    print(f"Number of empty documents: {num_empty_documents}")
    print("Indices of empty documents:")
    print(empty_documents.index)

No empty documents found.
