In [None]:
from bs4 import BeautifulSoup
import nltk
nltk.download("punkt")
from nltk.corpus import stopwords
nltk.download("stopwords")
import pandas as pd
from tqdm.notebook import tqdm
tqdm.pandas()

In [None]:
# Load the merged StackSample export; the file is ';'-separated.
input_csv = 'StackSample_merged.csv'
print('PRE-PROCESSING ON: ' + input_csv)
stacksample = pd.read_csv(input_csv, sep=';')
stacksample.head()

In [None]:
# Strip HTML markup from every post body, keeping only the text content.
# NOTE(review): this overwrites "Body" in place, so re-running the cell parses
# already-stripped text again -- presumably harmless, but confirm for bodies
# whose escaped markup (e.g. code samples) was decoded by the first pass.
print("Removing HTML tags...")


def _html_to_text(markup):
    """Parse ``markup`` with the lxml parser and return its text content."""
    return BeautifulSoup(markup, 'lxml').text


stacksample["Body"] = stacksample["Body"].progress_apply(_html_to_text)

In [None]:
# Lower-case both text columns before tokenizing.
print("Converting to lower case...")
stacksample["Body"] = stacksample["Body"].str.lower()
stacksample["Title"] = stacksample["Title"].str.lower()

print("Tokenizing using regular expressions...")
# Regex alternatives are tried left to right at each position. Because \w also
# matches digits, the generic word rule used to consume "82" from "82%" and the
# percent sign was silently dropped. A percent-only rule is therefore placed
# *before* the word rule; everything else tokenizes exactly as before.
pattern = r'''(?x)          # set flag to allow verbose regexps
    \w+[+#]+                # ending with pluses or hashes
    | \$?\d+(?:\.\d+)?%     # percentages, e.g. 82%, 12.5%
    | \w+(?:[-.']+\w+)*     # words with optional internal special characters
    | \$?\d+(?:\.\d+)?%?    # currency, e.g. $12.40
    '''


def _regexp_tokens(text):
    """Split ``text`` into tokens according to the verbose ``pattern`` above."""
    return nltk.regexp_tokenize(text, pattern)


stacksample["Tokenized Body"] = stacksample["Body"].progress_apply(_regexp_tokens)
stacksample["Tokenized Title"] = stacksample["Title"].progress_apply(_regexp_tokens)

In [None]:
print("Removing useless stop words...")
# Keep the English stop words in a set so filter_stopwords can test membership quickly.
english_stopwords = stopwords.words("english")
stop_words = set(english_stopwords)

def filter_stopwords(words):
    """Return the tokens from ``words`` that are not stop words.

    The order and any duplicates of the kept tokens are preserved, and the
    input is left untouched. ``stop_words`` is read from the notebook's global
    namespace at call time.
    """
    return [token for token in words if token not in stop_words]

# Drop stop words from both token columns (body first, then title) and preview.
for token_column in ("Tokenized Body", "Tokenized Title"):
    stacksample[token_column] = stacksample[token_column].progress_apply(filter_stopwords)
stacksample.head()

In [None]:
print("Converting to CSV format...")
# Persist only the identifier, token and tag columns; the stemming and
# lemmatization cells read this file back.
export_columns = ["ID", "Tokenized Title", "Tokenized Body", "Tags", "Tag Count"]
stacksample[export_columns].to_csv("StackSample_Pre.csv", sep=";", index=False)

# Stemming and Lemmatization

In [None]:
# Reload the ';'-separated pre-processed export as the input for stemming/lemmatization.
preprocessed_csv = 'StackSample_Pre.csv'
df = pd.read_csv(preprocessed_csv, sep=';')
df.head()

In [None]:
import ast
from nltk.stem import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('wordnet')

# Shared normalizers used by the stemming and lemmatization cells.
stemmer = SnowballStemmer(language='english')
lemma = WordNetLemmatizer()

In [None]:
# stemming
def _stem_serialized(serialized_tokens):
    """Parse a token list stored as a Python-literal string and stem every token."""
    return [stemmer.stem(token) for token in ast.literal_eval(serialized_tokens)]


# Body first, then title, so the new columns are appended in the same order as before.
for source_column, stemmed_column in (("Tokenized Body", "Stemmed Body"),
                                      ("Tokenized Title", "Stemmed Title")):
    df[stemmed_column] = df[source_column].progress_apply(_stem_serialized)

stemmed_export = ["ID", "Stemmed Title", "Stemmed Body", "Tags", "Tag Count"]
df[stemmed_export].to_csv("StackSample_Stemmed.csv", sep=";", index=False)

In [None]:
# lemmatization
def _lemmatize_serialized(serialized_tokens):
    """Parse a token list stored as a Python-literal string and lemmatize every token as a verb."""
    parsed_tokens = ast.literal_eval(serialized_tokens)
    return [lemma.lemmatize(token, pos="v") for token in parsed_tokens]


# Body first, then title, so the new columns are appended in the same order as before.
for source_column, lemmatized_column in (("Tokenized Body", "Lemmatized Body"),
                                         ("Tokenized Title", "Lemmatized Title")):
    df[lemmatized_column] = df[source_column].progress_apply(_lemmatize_serialized)

lemmatized_export = ["ID", "Lemmatized Title", "Lemmatized Body", "Tags", "Tag Count"]
df[lemmatized_export].to_csv("StackSample_Lemmatized.csv", sep=";", index=False)

In [None]:
# Preview the first rows of df to inspect the columns added by the stemming
# and lemmatization cells (assumes those cells were run before this one).
df.head()