In [12]:
from bs4 import BeautifulSoup
import nltk
nltk.download("punkt")
from nltk.corpus import stopwords
nltk.download("stopwords")
import pandas as pd
from tqdm.notebook import tqdm
tqdm.pandas()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
# Load the merged StackSample dump (semicolon-delimited) and preview the first rows.
print("PRE-PROCESSING ON: StackSample_merged.csv")
stacksample = pd.read_csv("StackSample_merged.csv", sep=";")
stacksample.head()

PRE-PROCESSING ON: StackSample_merged.csv


Unnamed: 0,ID,Title,Body,Tags,Tag Count
0,0,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...,"['asp.net', 'sitemap', 'sql']",3
1,1,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,"['algorithm', 'colors', 'language-agnostic', '...",4
2,2,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,"['c#', 'compiler-construction', '.net', 'scrip...",4
3,3,Should I use nested classes in this case?,<p>I am working on a collection of classes use...,"['nested-class', 'class', 'c++', 'oop']",4
4,4,Homegrown consumption of web services,<p>I've been writing a few web services for a ...,"['.net', 'web-services']",2


In [14]:
# Replace each HTML question body with its plain-text content.
def _html_to_text(markup):
    """Parse `markup` with the lxml backend and return only its text."""
    return BeautifulSoup(markup, 'lxml').text


print("Removing HTML tags...")
# NOTE(review): "Body" is overwritten in place, so re-running this cell feeds the
# already-extracted text back through the parser -- confirm that is harmless
# for bodies that contain literal angle brackets.
stacksample["Body"] = stacksample["Body"].progress_apply(_html_to_text)

Removing HTML tags...


  0%|          | 0/71762 [00:00<?, ?it/s]

In [15]:
# Case-fold both free-text columns so tokens compare equal regardless of casing.
print("Converting to lower case...")
for text_column in ("Body", "Title"):
    stacksample[text_column] = stacksample[text_column].str.lower()

print("Tokenizing using regular expressions...")
# Token grammar handed to nltk.regexp_tokenize. Alternatives are tried left to
# right and the first one that matches at a position wins, so ordering matters:
# a bare number such as "82" is already consumed by the general word rule (\w
# covers digits), which left the trailing "%" behind. The explicit percentage
# rule therefore sits above the word rule; it requires the "%" so every other
# digit-leading token (64-bit, 1.2.3, 3d) still falls through to the word rule
# exactly as before.
pattern = r'''(?x)          # set flag to allow verbose regexps
    \w+[+#]+                # ending with pluses or hashes, e.g. c++, c#
    | \$?\d+(?:\.\d+)?%     # percentages, e.g. 82%, 12.5%
    | \w+(?:[-.']+\w+)*     # words with optional internal special characters
    | \$?\d+(?:\.\d+)?%?    # currency (reached only when a $ leads), e.g. $12.40
    '''
def _regexp_tokens(text):
    """Split `text` into tokens using the notebook-level `pattern` grammar."""
    return nltk.regexp_tokenize(text, pattern)


# Tokenize the body first, then the title; each call shows its own progress bar.
stacksample["Tokenized Body"] = stacksample["Body"].progress_apply(_regexp_tokens)
stacksample["Tokenized Title"] = stacksample["Title"].progress_apply(_regexp_tokens)

Converting to lower case...
Tokenizing using regular expressions...


  0%|          | 0/71762 [00:00<?, ?it/s]

  0%|          | 0/71762 [00:00<?, ?it/s]

In [16]:
print("Removing useless stop words...")
# Materialise NLTK's English stop-word list as a set so the per-token
# membership test in filter_stopwords is a hash lookup rather than a list scan.
english_stop_word_list = stopwords.words("english")
stop_words = set(english_stop_word_list)

def filter_stopwords(words):
    """Return a new list holding the items of `words` that are not stop words.

    Order and duplicates are preserved. Membership is tested against the
    notebook-level `stop_words` collection, looked up at call time.
    """
    return [token for token in words if token not in stop_words]

# Drop stop words from both token columns (body first, then title), then preview.
for token_column in ("Tokenized Body", "Tokenized Title"):
    stacksample[token_column] = stacksample[token_column].progress_apply(filter_stopwords)
stacksample.head()

Removing useless stop words...


  0%|          | 0/71762 [00:00<?, ?it/s]

  0%|          | 0/71762 [00:00<?, ?it/s]

Unnamed: 0,ID,Title,Body,Tags,Tag Count,Tokenized Body,Tokenized Title
0,0,asp.net site maps,has anyone got experience creating sql-based a...,"['asp.net', 'sitemap', 'sql']",3,"[anyone, got, experience, creating, sql-based,...","[asp.net, site, maps]"
1,1,function for creating color wheels,this is something i've pseudo-solved many time...,"['algorithm', 'colors', 'language-agnostic', '...",4,"[something, i've, pseudo-solved, many, times, ...","[function, creating, color, wheels]"
2,2,adding scripting functionality to .net applica...,i have a little game written in c#. it uses a ...,"['c#', 'compiler-construction', '.net', 'scrip...",4,"[little, game, written, c#, uses, database, ba...","[adding, scripting, functionality, net, applic..."
3,3,should i use nested classes in this case?,i am working on a collection of classes used f...,"['nested-class', 'class', 'c++', 'oop']",4,"[working, collection, classes, used, video, pl...","[use, nested, classes, case]"
4,4,homegrown consumption of web services,i've been writing a few web services for a .ne...,"['.net', 'web-services']",2,"[i've, writing, web, services, net, app, i'm, ...","[homegrown, consumption, web, services]"


In [17]:
print("Converting to CSV format...")
# Persist only the columns the later stages read; the raw Title/Body text is
# left out. The index is not written.
tokenized_export_columns = ["ID", "Tokenized Title", "Tokenized Body", "Tags", "Tag Count"]
stacksample[tokenized_export_columns].to_csv("StackSample_Pre.csv", sep=";", index=False)

Converting to CSV format...


# Stemming and Lemmatization

In [18]:
# Reload the tokenized export written earlier in this notebook and preview it.
# The token-list columns come back as text (see the quoted lists in the
# preview), which is why the cells below parse them before use.
df = pd.read_csv("StackSample_Pre.csv", sep=";")
df.head()

Unnamed: 0,ID,Tokenized Title,Tokenized Body,Tags,Tag Count
0,0,"['asp.net', 'site', 'maps']","['anyone', 'got', 'experience', 'creating', 's...","['asp.net', 'sitemap', 'sql']",3
1,1,"['function', 'creating', 'color', 'wheels']","['something', ""i've"", 'pseudo-solved', 'many',...","['algorithm', 'colors', 'language-agnostic', '...",4
2,2,"['adding', 'scripting', 'functionality', 'net'...","['little', 'game', 'written', 'c#', 'uses', 'd...","['c#', 'compiler-construction', '.net', 'scrip...",4
3,3,"['use', 'nested', 'classes', 'case']","['working', 'collection', 'classes', 'used', '...","['nested-class', 'class', 'c++', 'oop']",4
4,4,"['homegrown', 'consumption', 'web', 'services']","[""i've"", 'writing', 'web', 'services', 'net', ...","['.net', 'web-services']",2


In [19]:
import ast
from nltk.stem import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
# Make sure the WordNet corpus is available locally before any lemmatizing.
nltk.download("wordnet")

# One shared instance of each normalizer, reused by the stemming and
# lemmatization cells.
stemmer = SnowballStemmer("english")
lemma = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [20]:
# stemming
def _stem_serialized(token_repr):
    """Parse a stringified token list and return the stem of every token."""
    return [stemmer.stem(token) for token in ast.literal_eval(token_repr)]


# The "Tokenized ..." columns hold list literals as text, hence the parse step.
df['Stemmed Body'] = df['Tokenized Body'].progress_apply(_stem_serialized)
df['Stemmed Title'] = df['Tokenized Title'].progress_apply(_stem_serialized)

# Write the stemmed variant of the dataset next to the other exports.
stemmed_export_columns = ["ID", "Stemmed Title", "Stemmed Body", "Tags", "Tag Count"]
df[stemmed_export_columns].to_csv("StackSample_Stemmed.csv", sep=";", index=False)

  0%|          | 0/71762 [00:00<?, ?it/s]

  0%|          | 0/71762 [00:00<?, ?it/s]

In [21]:
# lemmatization
def _lemmatize_serialized(token_repr):
    """Parse a stringified token list and return the verb lemma of every token."""
    # NOTE(review): every token is lemmatized as a verb (pos="v"); the preview
    # below still shows "applications" unreduced -- confirm verb-only is intended.
    return [lemma.lemmatize(token, pos="v") for token in ast.literal_eval(token_repr)]


# The "Tokenized ..." columns hold list literals as text, hence the parse step.
df["Lemmatized Body"] = df["Tokenized Body"].progress_apply(_lemmatize_serialized)
df["Lemmatized Title"] = df["Tokenized Title"].progress_apply(_lemmatize_serialized)

# Write the lemmatized variant of the dataset next to the other exports.
lemmatized_export_columns = ["ID", "Lemmatized Title", "Lemmatized Body", "Tags", "Tag Count"]
df[lemmatized_export_columns].to_csv("StackSample_Lemmatized.csv", sep=";", index=False)

  0%|          | 0/71762 [00:00<?, ?it/s]

  0%|          | 0/71762 [00:00<?, ?it/s]

In [22]:
# Preview the frame with the stemmed and lemmatized columns alongside the tokenized ones.
df.head()

Unnamed: 0,ID,Tokenized Title,Tokenized Body,Tags,Tag Count,Stemmed Body,Stemmed Title,Lemmatized Body,Lemmatized Title
0,0,"['asp.net', 'site', 'maps']","['anyone', 'got', 'experience', 'creating', 's...","['asp.net', 'sitemap', 'sql']",3,"[anyon, got, experi, creat, sql-base, asp.net,...","[asp.net, site, map]","[anyone, get, experience, create, sql-based, a...","[asp.net, site, map]"
1,1,"['function', 'creating', 'color', 'wheels']","['something', ""i've"", 'pseudo-solved', 'many',...","['algorithm', 'colors', 'language-agnostic', '...",4,"[someth, i'v, pseudo-solv, mani, time, never, ...","[function, creat, color, wheel]","[something, i've, pseudo-solved, many, time, n...","[function, create, color, wheel]"
2,2,"['adding', 'scripting', 'functionality', 'net'...","['little', 'game', 'written', 'c#', 'uses', 'd...","['c#', 'compiler-construction', '.net', 'scrip...",4,"[littl, game, written, c#, use, databas, back-...","[ad, script, function, net, applic]","[little, game, write, c#, use, database, back-...","[add, script, functionality, net, applications]"
3,3,"['use', 'nested', 'classes', 'case']","['working', 'collection', 'classes', 'used', '...","['nested-class', 'class', 'c++', 'oop']",4,"[work, collect, class, use, video, playback, r...","[use, nest, class, case]","[work, collection, class, use, video, playback...","[use, nest, class, case]"
4,4,"['homegrown', 'consumption', 'web', 'services']","[""i've"", 'writing', 'web', 'services', 'net', ...","['.net', 'web-services']",2,"[i'v, write, web, servic, net, app, i'm, readi...","[homegrown, consumpt, web, servic]","[i've, write, web, service, net, app, i'm, rea...","[homegrown, consumption, web, service]"
