In [1]:
%%capture
# Run this cell only if the nbformat package is not installed yet.
# %pip install nbformat

In [1]:
%%capture
%run setup.ipynb

In [2]:
# Build the lookup table once: a set of every entry returned by words.words(),
# used by is_english_word() for membership tests.
# NOTE(review): entries are stored with their original casing, while
# is_english_word() lowercases its argument -- any capitalised entries in the
# word list can therefore never match. Confirm this is intended.
english_word_set = set(words.words())

def is_english_word(word):
    """Tell whether ``word`` is present in ``english_word_set``.

    The candidate is lowercased before the lookup; the set itself is used
    as-is.

    Args:
        word: The string to look up.

    Returns:
        True if the lowercased word is a member of ``english_word_set``,
        otherwise False.
    """
    candidate = word.lower()
    return candidate in english_word_set

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [3]:
# Resources shared by every preprocess() call. They are built once when this
# cell runs (the same approach as english_word_set) instead of once per
# document, which previously re-read the stopword list on every call.
_TOKENIZER = nltk.WordPunctTokenizer()
_LEMMATIZER = WordNetLemmatizer()
_STOP_WORDS = frozenset(nltk.corpus.stopwords.words('english'))

# Boilerplate markers: the start marker is removed only when it opens the
# document, the end marker only when it closes it (patterns are anchored).
_CHAT_START_RE = re.compile(rf'^\s*{re.escape("Chat Conversation Start")}\s*')
_CHAT_END_RE = re.compile(rf'\s*{re.escape("Chat Conversation End")}\s*$')

# Anything that contains an '@' followed later by a '.', bounded by word edges.
_EMAIL_RE = re.compile(r'\b\S*@\S*\.\S*\b')

# Every character that is neither a word character nor whitespace, plus the
# underscore (which \w would otherwise keep). This single pattern covers what
# the separate bracket and string.punctuation passes used to remove.
_NON_WORD_RE = re.compile(r'[^\w\s]|_')


def preprocess(doc):
    """Normalise one raw document into a space-separated string of lemmas.

    Steps, in order:
      1. strip the leading "Chat Conversation Start" and trailing
         "Chat Conversation End" markers (only at the very start / end);
      2. remove email addresses;
      3. delete punctuation, brackets, underscores and other non-word
         characters (characters are deleted, not replaced by a space, so
         e.g. "well-known" becomes "wellknown"); digits are NOT removed here;
      4. lowercase;
      5. keep only whitespace-separated words accepted by is_english_word();
      6. tokenize, lemmatize, then drop stopwords.

    NOTE(review): the English-word filter runs before lemmatization, so an
    inflected form that is missing from the word list is discarded before it
    could be reduced to its lemma -- confirm this ordering is intended.

    Args:
        doc: The raw document text. Must be a str; non-string values
            (e.g. missing values) are not handled here.

    Returns:
        The cleaned document as a single string of lemmas joined by spaces
        (empty string if nothing survives the filters).
    """
    # 1-2: boilerplate markers and email addresses
    doc = _CHAT_START_RE.sub('', doc)
    doc = _CHAT_END_RE.sub('', doc)
    doc = _EMAIL_RE.sub('', doc)

    # 3-4: drop non-word characters, then lowercase. No explicit strip() is
    # needed because split() below ignores leading/trailing whitespace.
    doc = _NON_WORD_RE.sub('', doc).lower()

    # 5: retain only English words
    english_only = ' '.join(word for word in doc.split() if is_english_word(word))

    # 6: tokenize, lemmatize, and filter stopwords out of the lemmas
    lemmas = (_LEMMATIZER.lemmatize(token) for token in _TOKENIZER.tokenize(english_only))
    return ' '.join(lemma for lemma in lemmas if lemma not in _STOP_WORDS)



In [4]:
# Apply preprocess() to every entry of the 'Content' column.
# NOTE(review): despite the df_ prefix the result of a column .apply() is a
# Series of cleaned strings, not a DataFrame; `df` presumably comes from
# setup.ipynb -- confirm.
df_clean = df['Content'].apply(preprocess)

In [5]:
# Spot-check a single cleaned document by position (1000 looks like an
# arbitrary sample index -- TODO confirm).
df_clean.iloc[1000]

'subject developable surface organization university hong tin version hi currently developable surface anyone familiar topic give information allow find developable surface thanks help'

In [6]:
# Fit a TF-IDF model on the cleaned documents.
# min_df / max_df are given as floats, i.e. as document-frequency proportions.
tv = TfidfVectorizer(
    min_df=0.,
    max_df=1.,
    norm="l2",
    use_idf=True,
    smooth_idf=True,
)

# fit_transform returns a sparse matrix; it is converted to a dense array
# here, which can be large for a big vocabulary.
tv_matrix = tv.fit_transform(df_clean).toarray()
vocab = tv.get_feature_names_out()

# Show the document-term weights rounded to two decimals, one column per term.
pd.DataFrame(tv_matrix.round(2), columns=vocab)

Unnamed: 0,aa,abacus,abandon,abandoned,abandonment,abate,abatement,abbey,abbot,abbreviation,...,zombie,zone,zoning,zoo,zooid,zoological,zoology,zoom,zorro,zucchini
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11331,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11332,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11334,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
