In [1]:
%%capture
# run this cell if this package is not installed
# !pip install nbformat

In [1]:
%%capture
%run setup.ipynb

In [2]:
english_word_set = set(words.words())

def is_english_word(word):
    return word.lower() in english_word_set

In [3]:
def preprocess(doc):
    wpt = nltk.WordPunctTokenizer()
    lemmatizer = WordNetLemmatizer()
    stop_words = set(nltk.corpus.stopwords.words('english'))

    # sentences to be removed
    p1 = "Chat Conversation Start"
    p2 = "Chat Conversation End"
    
    # Remove p1 only the first time it appears
    doc = re.sub(rf'^\s*{re.escape(p1)}\s*', '', doc)

    # Remove p2 only the last time it appears
    doc = re.sub(rf'\s*{re.escape(p2)}\s*$', '', doc)
    
    # remove email addresses
    doc = re.sub(r'\b\S*@\S*\.\S*\b', '', doc)
    
    # remove special characters and digits, retaining only words with letters
    doc = re.sub(r'[^\w\s]', '', doc)
    
    # lowercase and strip
    doc = doc.lower()
    doc = doc.strip()
    
    # remove brackets of any kind
    doc = re.sub(r'[(){}[\]]', '', doc)
    
    # remove punctuation
    doc = doc.translate(str.maketrans("", "", string.punctuation))

    # retain only English words
    doc = ' '.join(word for word in doc.split() if is_english_word(word))
    
    # tokenize document
    tokens = wpt.tokenize(doc)
    
    # determine POS of the tokens
    pos_tags = pos_tag(tokens)
    
    # map POS tags to WordNet POS tags
    tag_map = {
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'R': wordnet.ADV,
        'J': wordnet.ADJ
    }

    # lemmatize the tokens
    lemmatized_tokens = [lemmatizer.lemmatize(token, tag_map.get(pos[0], wordnet.NOUN)) for token, pos in pos_tags]
    
    # filter stopwords out of lemmatized tokens 
    filtered_tokens = [token for token in lemmatized_tokens if token not in stop_words]
    
    # recreate the document
    doc = ' '.join(filtered_tokens)
    
    return doc

In [4]:
df_clean = df['Content'].apply(preprocess)

In [5]:
df_clean.iloc[1000]

'subject developable surface organization university hong tin version hi currently developable surface anyone familiar topic give information allow find developable surface thanks help'

In [6]:
tv = TfidfVectorizer(min_df=0., max_df=1., norm="l2",
                     use_idf=True, smooth_idf=True)
tv_matrix = tv.fit_transform(df_clean)
tv_matrix = tv_matrix.toarray()
vocab = tv.get_feature_names_out()
pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)

Unnamed: 0,aa,abacus,abandon,abandoned,abandonment,abate,abatement,abbey,abbot,abbreviation,...,zombie,zone,zoning,zoo,zooid,zoological,zoology,zoom,zorro,zucchini
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11331,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11332,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11334,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# which entries are non zero?
tfidf = pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)
tfdidf_nonzero = tfidf[tfidf != 0].stack()
tfdidf_nonzero

0      ask        0.08
       back       0.06
       bed        0.14
       cat        0.12
       city       0.09
                  ... 
11335  without    0.06
       work       0.02
       would      0.02
       write      0.03
       yep        0.06
Length: 756273, dtype: float64

In [8]:
tfdidf_nonzero[0]

ask           0.08
back          0.06
bed           0.14
cat           0.12
city          0.09
come          0.06
crazy         0.23
enjoy         0.11
everything    0.16
excite        0.14
fantastic     0.14
fine          0.09
friend        0.36
fun           0.10
go            0.05
good          0.29
great         0.14
happy         0.10
home          0.08
hope          0.16
house         0.09
ill           0.08
jersey        0.12
last          0.06
later         0.09
like          0.04
long          0.13
lot           0.20
lovely        0.15
maybe         0.07
meet          0.10
miss          0.19
new           0.11
news          0.08
one           0.04
plan          0.10
really        0.12
remember      0.08
repeat        0.11
sad           0.12
safe          0.10
see           0.05
since         0.06
smile         0.14
soon          0.18
spend         0.09
spent         0.14
still         0.06
time          0.24
travel        0.23
try           0.06
want          0.06
week        