In [None]:
# Notebook setup: third-party imports plus one-time NLTK resource downloads.
# The statements are order-sensitive (a name must be imported before it is called),
# so imports and their side-effecting calls are kept interleaved as written.
from bs4 import BeautifulSoup
import nltk
# Download the 'punkt' resource into the local NLTK data directory.
# NOTE(review): the cells visible here tokenize with nltk.regexp_tokenize only;
# confirm whether 'punkt' is still needed.
nltk.download('punkt')
from nltk.corpus import stopwords
# Download the stop-word lists read later through stopwords.words("english").
nltk.download("stopwords")
import pandas as pd
# NOTE(review): sys, csv, os, re and the plain `tqdm` name are not referenced in
# the cells visible here — confirm against the rest of the notebook before removing.
import sys
import csv
import os
from tqdm import tqdm, tqdm_notebook
# Registers `progress_apply` on pandas objects; every later cell relies on it.
# NOTE(review): recent tqdm releases reportedly deprecate `tqdm_notebook` in favour
# of `tqdm.notebook.tqdm` — confirm against the installed version.
tqdm_notebook().pandas()
import re

In [None]:
# Pre-process StackSample1.csv .. StackSample4.csv.
# For each file: strip HTML from "Body", lower-case "Body" and "Title",
# regex-tokenize both columns, drop English stop words and write the result
# to StackSample<N>_Pre.csv.

# The tokenizer pattern, the stop-word set and the helpers below do not depend
# on which file is being processed, so they are built once, before the loop,
# instead of being rebuilt on every iteration.
pattern = r'''(?x)       # set flag to allow verbose regexps
        \w+[+#]+             # ending with pluses or hashes
        | \w+(?:[-.']+\w+)*  # words with optional internal special characters
        | \$?\d+(?:\.\d+)?%? # currency and percentages, e.g. $12.40, 82%
        '''
# NOTE(review): alternatives are tried left to right and \w also matches
# digits, so a bare "82%" is consumed by the word alternative and comes out
# as "82" (the percent sign is dropped). Only "$"-prefixed amounts reach the
# last alternative. Left untouched here because reordering would change the
# tokenization of the existing output files.

# A set gives constant-time membership tests in filter_stopwords.
stop_words = set(stopwords.words("english"))


def _tokenize(text):
    """Split ``text`` into tokens with ``nltk.regexp_tokenize`` and ``pattern``."""
    return nltk.regexp_tokenize(text, pattern)


def filter_stopwords(words):
    """Return the tokens of ``words`` that are not in ``stop_words``.

    Order and duplicates are preserved.
    """
    return [word for word in words if word not in stop_words]


for count in range(1,5):
    print(f'PRE-PROCESSING ON: StackSample{count}.csv')
    stacksample = pd.read_csv(f'StackSample{count}.csv', sep=';')
    display(stacksample.head())

    print("Removing HTML tags...")
    # BeautifulSoup parses the HTML; .text keeps only the text content.
    stacksample["Body"] = stacksample["Body"].progress_apply(
        lambda text: BeautifulSoup(text, 'lxml').text
    )

    print("Converting to lower case...")
    stacksample["Body"] = stacksample["Body"].str.lower()
    stacksample["Title"] = stacksample["Title"].str.lower()

    print("Tokenizing using regular expressions...")
    stacksample["Tokenized Body"] = stacksample["Body"].progress_apply(_tokenize)
    stacksample["Tokenized Title"] = stacksample["Title"].progress_apply(_tokenize)

    print("Removing useless stop words...")
    stacksample["Tokenized Body"] = stacksample["Tokenized Body"].progress_apply(filter_stopwords)
    stacksample["Tokenized Title"] = stacksample["Tokenized Title"].progress_apply(filter_stopwords)
    display(stacksample.head())

    print("Converting to CSV format...")
    stacksample[["ID", "Tokenized Title", "Tokenized Body", "Tags", "Tag Count"]].to_csv(
        f"StackSample{count}_Pre.csv", sep=";", index=False
    )
    # Read the file straight back to preview exactly what was written.
    pp = pd.read_csv(f"StackSample{count}_Pre.csv", sep=";")
    display(pp.head())

# Stemming and Lemmatization

In [None]:
# Load the pre-processed output for the first sample file.
# The token-list columns are read back as plain strings; the stemming and
# lemmatization cells parse them into lists with ast.literal_eval.
df = pd.read_csv('StackSample1_Pre.csv', sep=';')
df.head()

In [None]:
# Setup for the stemming / lemmatization cells.
# ast.literal_eval is used to turn the stringified token lists back into lists.
import ast
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
# WordNet data is downloaded here, before the lemmatizer is first used.
nltk.download('wordnet')

# Shared instances used by the "... stem" and "... lemm" column computations.
stemmer = PorterStemmer()
lemma = WordNetLemmatizer()

In [None]:
def _stem_tokens(serialized_tokens):
    """Parse a stringified token list and return the Porter stem of each token."""
    return [stemmer.stem(token) for token in ast.literal_eval(serialized_tokens)]


# Title first, then Body — each source column gets a sibling "<name> stem" column.
for source_column in ('Tokenized Title', 'Tokenized Body'):
    df[source_column + ' stem'] = df[source_column].progress_apply(_stem_tokens)
df.head()

In [None]:
# Persist the stemmed variant: identifiers, stemmed token lists and tag info only.
stem_columns = ["ID", "Tokenized Title stem", "Tokenized Body stem", "Tags", "Tag Count"]
df[stem_columns].to_csv("StackSample1_Pre_stem.csv", sep=";", index=False)

In [None]:
def _lemmatize_tokens(serialized_tokens):
    """Parse a stringified token list and lemmatize each token as a verb (pos="v")."""
    return [lemma.lemmatize(token, pos="v") for token in ast.literal_eval(serialized_tokens)]


# Both columns must exist: the export cell selects "Tokenized Title lemm" AND
# "Tokenized Body lemm", so leaving the body column out makes that export
# fail with a KeyError on a fresh Restart & Run All.
df["Tokenized Title lemm"] = df["Tokenized Title"].progress_apply(_lemmatize_tokens)
df["Tokenized Body lemm"] = df["Tokenized Body"].progress_apply(_lemmatize_tokens)
df.head()

In [None]:
# Persist the lemmatized variant: identifiers, lemmatized token lists and tag info only.
# NOTE(review): requires both "... lemm" columns to be present on df — verify
# that the lemmatization cell creates "Tokenized Body lemm" as well as the title one.
lemm_columns = ["ID", "Tokenized Title lemm", "Tokenized Body lemm", "Tags", "Tag Count"]
df[lemm_columns].to_csv("StackSample1_Pre_lemm.csv", sep=";", index=False)