### Task 1: Tokenize via NLTK

In [None]:
import nltk
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

#nltk stopwords
stop_words = set(stopwords.words('english'))

#read csv
df = pd.read_csv('news_sample.csv', encoding='utf-8')

In [11]:
#makes csv to a single string
text = " ".join(df.astype(str).agg(" ".join, axis=1))

#tokenize without punkt, only words
word_tokens = re.findall(r'\b\w+\b', text)

#original vocab length
OG_vocab_size = len(set(word_tokens))

#remove stopwords
filtered_tokens = [w for w in word_tokens if w.lower() not in stop_words]
filtered_vocab_size = len(set(filtered_tokens))

#stopword reduction rate
stopword_reduction_rate = ((OG_vocab_size - filtered_vocab_size) / OG_vocab_size * 100)

#stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(w) for w in filtered_tokens]
stemmed_vocab_size = len(set(stemmed_tokens))

#stem reduction rate
stemming_reduction_rate = ((filtered_vocab_size - stemmed_vocab_size) / filtered_vocab_size * 100)

#print stuff
print(f"Original Vocabulary Size: {OG_vocab_size}")
print(f"Vocabulary Size After Stopword Removal: {filtered_vocab_size}")
print(f"Reduction Rate After Stopword Removal: {stopword_reduction_rate:.2f}%")
print(f"Vocabulary Size After Stemming: {stemmed_vocab_size}")
print(f"Reduction Rate After Stemming: {stemming_reduction_rate:.2f}%")

Original Vocabulary Size: 21102
Vocabulary Size After Stopword Removal: 20756
Reduction Rate After Stopword Removal: 1.64%
Vocabulary Size After Stemming: 11827
Reduction Rate After Stemming: 43.02%


To do this we use the given nltk, we use pandas to load the csv file and we use regex for tokenize without punkt. It was very appropriate to use nltk to get a list of very commons stopwords in the text. We use pandas because it lead to a very simple way of loading the csv file. 



Vi er goated lets goo, tester også
test igen