In [None]:
%%capture
# Uncomment the next line and run this cell if the nbformat package is not installed
# %pip install nbformat

In [None]:
%%capture
%run 1-setup.ipynb

In [None]:
import re, string
from functools import lru_cache

from nltk import pos_tag, WordPunctTokenizer
from nltk.corpus import words, wordnet, stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [None]:
@lru_cache(maxsize=1)
def _english_word_set():
    """Load the NLTK `words` corpus into a frozenset once and reuse it on later calls."""
    return frozenset(words.words())


def is_english_word(word):
    """Return True if the lower-cased `word` appears in the NLTK `words` corpus.

    The corpus is turned into a set only on the first call; later calls are a
    single hash lookup instead of re-reading and re-hashing the whole word list.

    Parameters
    ----------
    word : str
        Token to look up. It is lower-cased before the lookup.

    Returns
    -------
    bool
        Whether the lower-cased token is present in the corpus word set.
    """
    # NOTE(review): the corpus entries are used as-is while the input is
    # lower-cased, so any capitalised corpus entry can never match — confirm
    # whether that exclusion is intended.
    return word.lower() in _english_word_set()

In [None]:
# Corpus-specific filler words removed in addition to NLTK's English stop words.
_EXTRA_STOP_WORDS = frozenset([
    'hi', 'thanks', 'lot', 'dont', 'article', 'everyone', 'anyone', 'someone', 'nothing',
    'something', 'anything', 'everybody', 'somebody', 'anybody', 'please', 'ask', 'people', 'university',
    'question', 'yeah', 'shouldnt', 'theyre', 'thing', 'theyll', 'didnt', 'sorry', 'hey',
    'oh', 'thats', 'thank', 'cannot', 'right', 'would', 'one', 'get', 'know', 'like', 'use', 'go',
    'think', 'make', 'say', 'see', 'also', 'could', 'well', 'want', 'way', 'take', 'find', 'need', 'try',
    'much', 'come', 'many', 'may', 'give', 'really', 'tell', 'two', 'still', 'read', 'might', 'write',
    'never', 'look', 'sure', 'day', 'even', 'new', 'time', 'good', 'first', 'keep', 'since', 'last',
    'long', 'fact', 'must', 'cant', 'another', 'little', 'without', 'csutexasedu', 'nntppostinghost',
    'im', 'seem', 'replyto', 'let', 'group', 'call',
])

# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


@lru_cache(maxsize=1)
def _preprocess_resources():
    """Build the stop-word set, lemmatizer and POS-tag map once for all preprocess() calls."""
    stop_words = frozenset(stopwords.words('english')) | _EXTRA_STOP_WORDS

    # First letter of the tag returned by pos_tag -> WordNet POS constant
    tag_map = {
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'R': wordnet.ADV,
        'J': wordnet.ADJ,
    }
    return stop_words, WordNetLemmatizer(), tag_map


def preprocess(doc):
    """Clean one raw document and return it as a space-joined string of lemmas.

    Steps, in order:
      1. delete e-mail-like substrings;
      2. delete every character that is neither a word character nor whitespace
         (letters, digits and underscores are kept at this point);
      3. lower-case and strip surrounding whitespace;
      4. delete remaining ASCII punctuation (this is what removes underscores);
      5. tokenize, POS-tag and lemmatize each token (tags without a mapping are
         lemmatized as nouns);
      6. drop tokens found in the NLTK English stop-word list or in
         `_EXTRA_STOP_WORDS`.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces; empty if nothing survives.
    """
    # Shared resources are built on the first call only, not once per document.
    stop_words, lemmatizer, tag_map = _preprocess_resources()

    # Remove email addresses
    doc = re.sub(r'\b\S*@\S*\.\S*\b', '', doc)

    # Remove everything except word characters and whitespace. Digits are NOT
    # removed here; brackets are, so no separate bracket pass is needed.
    doc = re.sub(r'[^\w\s]', '', doc)

    # Lowercase and strip
    doc = doc.lower().strip()

    # Remove punctuation that counts as a word character (the underscore)
    doc = doc.translate(_PUNCTUATION_TABLE)

    # Tokenize, POS-tag and lemmatize
    tokens = word_tokenize(doc)
    lemmatized_tokens = [
        lemmatizer.lemmatize(token, tag_map.get(pos[0], wordnet.NOUN))
        for token, pos in pos_tag(tokens)
    ]

    # Filter stop words (set membership) and recreate the document
    return ' '.join(token for token in lemmatized_tokens if token not in stop_words)


In [None]:
# Run preprocess() on every value of the 'Content' column and store the result
# in a new 'Clean_Content' column.
# NOTE(review): `df` is not defined in this notebook — presumably it comes from
# the `%run 1-setup.ipynb` cell; confirm.
df['Clean_Content'] = df['Content'].apply(preprocess)

In [None]:
# Spot-check the cleaned text at row position 1000 (presumably an arbitrary sample;
# this raises IndexError if the frame has 1000 rows or fewer).
df['Clean_Content'].iloc[1000]

In [None]:
# Select the rows whose cleaned text is blank once surrounding whitespace is stripped
empty_documents = df.loc[df['Clean_Content'].str.strip().eq('')]

# How many such rows there are
num_empty_documents = len(empty_documents)

# Report the result: either the all-clear, or the count plus the affected index labels
if not num_empty_documents:
    print("No empty documents found.")
else:
    print(f"Number of empty documents: {num_empty_documents}")
    print("Indices of empty documents:")
    print(empty_documents.index)