In [26]:
import nltk
import time
import wikipediaapi 
import spacy
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.tokenize import word_tokenize

In [2]:
# Step 1: Fetch Wikipedia content
def get_wikipedia_text(page_title):
	wiki_wiki = wikipediaapi.Wikipedia(user_agent="MyNLPProject/1.0", language="en")
	page = wiki_wiki.page(page_title)
	return page.summary if page.exists() else ""
	
text = get_wikipedia_text("Python_(programming_language)")

In [3]:
tokens = word_tokenize(text)

print(tokens)

['Python', 'is', 'a', 'high-level', ',', 'general-purpose', 'programming', 'language', '.', 'Its', 'design', 'philosophy', 'emphasizes', 'code', 'readability', 'with', 'the', 'use', 'of', 'significant', 'indentation', '.', 'Python', 'is', 'dynamically', 'type-checked', 'and', 'garbage-collected', '.', 'It', 'supports', 'multiple', 'programming', 'paradigms', ',', 'including', 'structured', '(', 'particularly', 'procedural', ')', ',', 'object-oriented', 'and', 'functional', 'programming', '.', 'It', 'is', 'often', 'described', 'as', 'a', '``', 'batteries', 'included', "''", 'language', 'due', 'to', 'its', 'comprehensive', 'standard', 'library', '.', 'Guido', 'van', 'Rossum', 'began', 'working', 'on', 'Python', 'in', 'the', 'late', '1980s', 'as', 'a', 'successor', 'to', 'the', 'ABC', 'programming', 'language', 'and', 'first', 'released', 'it', 'in', '1991', 'as', 'Python', '0.9.0', '.', 'Python', '2.0', 'was', 'released', 'in', '2000', '.', 'Python', '3.0', ',', 'released', 'in', '2008',

In [None]:
stemmer = PorterStemmer()

start_stem = time.time()

stemmed_words = [stemmer.stem(word) for word in tokens]

end_stem = time.time()

In [None]:
nlp = spacy.load("en_core_web_sm")

start_lem = time.time()

doc = nlp(" ".join(tokens))

lemmatized_words = [token.lemma_ for token in doc]

end_lem = time.time()

  from .autonotebook import tqdm as notebook_tqdm


## Performance

In [None]:
print(f"Original Text Sample:\t{tokens[:10]}")
print(f"Stemmed Words:\t\t{stemmed_words[:10]}")
print(f"Lemmatized Words:\t{lemmatized_words[:10]}")

print("\nPerformance Analysis:")
print(f"Stemming Execution Time: {end_stem - start_stem:.5f} seconds")
print(f"Lemmatization Execution Time: {end_lem - start_lem:.5f} seconds")

Original Text Sample:	['Python', 'is', 'a', 'high-level', ',', 'general-purpose', 'programming', 'language', '.', 'Its']
Stemmed Words:		['python', 'is', 'a', 'high-level', ',', 'general-purpos', 'program', 'languag', '.', 'it']
Lemmatized Words:	['Python', 'be', 'a', 'high', '-', 'level', ',', 'general', '-', 'purpose']

Performance Analysis:
Stemming Execution Time: 0.00648 seconds
Lemmatization Execution Time: 0.05436 seconds


## Stemming vs Lemmatisation

In [18]:
# Stemming vs lemmatisation

word = "dancing"

print(f"Original:\t{word}")
print(f"Stemming:\t{stemmer.stem(word)}")
print(f"Lemmatisation:\t{nlp(word)[0].lemma_}")

Original:	dancing
Stemming:	danc
Lemmatisation:	dance


## Snowball vs Porter Stemmer

In [30]:
def test(tokens):

	porter = PorterStemmer()
	porter_words = [porter.stem(word) for word in tokens]

	snow = SnowballStemmer("english")
	snow_words = [snow.stem(word) for word in tokens]
	
	print(f"Porter:\t{porter_words}")
	print(f"Snow:\t{snow_words}")

	print()
	
	for i in range(len(tokens)):
		if(snow_words[i] != porter_words[i]):
			print(f"Original: {tokens[i]}")
			print(f"Port: {porter_words[i]}")
			print(f"Snow: {snow_words[i]}")
			print()


	

test(tokens)

Porter:	['python', 'is', 'a', 'high-level', ',', 'general-purpos', 'program', 'languag', '.', 'it', 'design', 'philosophi', 'emphas', 'code', 'readabl', 'with', 'the', 'use', 'of', 'signific', 'indent', '.', 'python', 'is', 'dynam', 'type-check', 'and', 'garbage-collect', '.', 'it', 'support', 'multipl', 'program', 'paradigm', ',', 'includ', 'structur', '(', 'particularli', 'procedur', ')', ',', 'object-ori', 'and', 'function', 'program', '.', 'it', 'is', 'often', 'describ', 'as', 'a', '``', 'batteri', 'includ', "''", 'languag', 'due', 'to', 'it', 'comprehens', 'standard', 'librari', '.', 'guido', 'van', 'rossum', 'began', 'work', 'on', 'python', 'in', 'the', 'late', '1980', 'as', 'a', 'successor', 'to', 'the', 'abc', 'program', 'languag', 'and', 'first', 'releas', 'it', 'in', '1991', 'as', 'python', '0.9.0', '.', 'python', '2.0', 'wa', 'releas', 'in', '2000', '.', 'python', '3.0', ',', 'releas', 'in', '2008', ',', 'wa', 'a', 'major', 'revis', 'not', 'complet', 'backward-compat', 'with