<a href="https://colab.research.google.com/github/DiaaEssam/Web-Scraping-and-Text-Preprocessing/blob/main/Web_Scraping_and_Text_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing libraries

In [None]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import nltk
from nltk.stem import SnowballStemmer
import re

# Downloading corpus of stopping words

In [None]:
nltk.download('stopwords')
stop_words = set(nltk.corpus.stopwords.words('english'))
"""
stopwords refers to a collection/corpus of commonly used words in a language (such as "and", "the", "is", etc.)
that are often filtered out before or after processing of natural language data.
"""

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


'\nstopwords refers to a collection/corpus of commonly used words in a language (such as "and", "the", "is", etc.)\nthat are often filtered out before or after processing of natural language data.\n'

# Function to get HTML from Page

In [None]:
def get_HTML(page):
    src = page.content
    soup = BeautifulSoup(src,'lxml')

    topics = soup.find_all('p',{'class':'topic-paragraph'})
    return topics

# Identifying the used URL

In [None]:
topics = get_HTML(requests.get(f"https://www.britannica.com/technology/Tesla-coil"))
print(topics)

[<p class="topic-paragraph"><strong><span id="ref1317582"></span>Tesla coil</strong>,  an electrical <a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/technology/transformer-electronics">transformer</a> that uses high-frequency <a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/science/alternating-current">alternating current</a> (AC) to increase voltage. Because of its extremely high voltage, the <a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/science/electricity">electricity</a> in a Tesla coil can travel through the air, powering—or damaging—nearby electronic devices, often with arcs of lightninglike electricity. Though the Tesla coil produces extremely high voltage, the high <a class="md-crosslink autoxref" data-show-preview="true" href="https://www.britannica.com/science/frequency-physics">frequency</a> of the <a class="md-crosslink" data-show-preview="true" href="https://www.britannic

# Cleaning

In [None]:
topics = [topic.text.strip() for topic in topics]
topics = [re.sub(r'[^a-zA-Z\s]', '', topic) for topic in topics]
print(topics)
"""
[^a-zA-Z0-9\s]: This is the regular expression pattern itself,
which matches any character that is not (^ inside square brackets negates the character class) a lowercase letter (a-z),
an uppercase letter (A-Z), a digit (0-9), or whitespace (\s).
"""

['Tesla coil,  an electrical transformer that uses high-frequency alternating current (AC) to increase voltage. Because of its extremely high voltage, the electricity in a Tesla coil can travel through the air, powering—or damaging—nearby electronic devices, often with arcs of lightninglike electricity. Though the Tesla coil produces extremely high voltage, the high frequency of the current generally makes it possible for most people to approach the device and even be struck by the arcs without suffering injury. The spectacular effects created by the Tesla coil make the device popular for scientific exhibitions, but the principles underlying the coil were also important to the development of radio technology.', 'The Tesla coil was invented by Serbian American inventor Nikola Tesla in 1891. Tesla was primarily interested in its potential to wirelessly transmit electricity, particularly for lighting. He hoped to build large coils scattered across Earth, each of which would provide power 

'\n[^a-zA-Z0-9\\s]: This is the regular expression pattern itself,\nwhich matches any character that is not (^ inside square brackets negates the character class) a lowercase letter (a-z),\nan uppercase letter (A-Z), a digit (0-9), or whitespace (\\s).\n'

# Normalization

In [None]:
topics = [topic.lower() for topic in topics]
print(topics)

['tesla coil  an electrical transformer that uses highfrequency alternating current ac to increase voltage because of its extremely high voltage the electricity in a tesla coil can travel through the air poweringor damagingnearby electronic devices often with arcs of lightninglike electricity though the tesla coil produces extremely high voltage the high frequency of the current generally makes it possible for most people to approach the device and even be struck by the arcs without suffering injury the spectacular effects created by the tesla coil make the device popular for scientific exhibitions but the principles underlying the coil were also important to the development of radio technology', 'the tesla coil was invented by serbian american inventor nikola tesla in  tesla was primarily interested in its potential to wirelessly transmit electricity particularly for lighting he hoped to build large coils scattered across earth each of which would provide power to any device with a re

# Tokenization

In [None]:
topics = [topic.split() for topic in topics]
print(topics)

[['tesla', 'coil', 'an', 'electrical', 'transformer', 'that', 'uses', 'highfrequency', 'alternating', 'current', 'ac', 'to', 'increase', 'voltage', 'because', 'of', 'its', 'extremely', 'high', 'voltage', 'the', 'electricity', 'in', 'a', 'tesla', 'coil', 'can', 'travel', 'through', 'the', 'air', 'poweringor', 'damagingnearby', 'electronic', 'devices', 'often', 'with', 'arcs', 'of', 'lightninglike', 'electricity', 'though', 'the', 'tesla', 'coil', 'produces', 'extremely', 'high', 'voltage', 'the', 'high', 'frequency', 'of', 'the', 'current', 'generally', 'makes', 'it', 'possible', 'for', 'most', 'people', 'to', 'approach', 'the', 'device', 'and', 'even', 'be', 'struck', 'by', 'the', 'arcs', 'without', 'suffering', 'injury', 'the', 'spectacular', 'effects', 'created', 'by', 'the', 'tesla', 'coil', 'make', 'the', 'device', 'popular', 'for', 'scientific', 'exhibitions', 'but', 'the', 'principles', 'underlying', 'the', 'coil', 'were', 'also', 'important', 'to', 'the', 'development', 'of', 'r

# Stemming

In [None]:
snowball_stemmer = SnowballStemmer("english")
topics = [snowball_stemmer.stem(word)  for topic in topics for word in topic]
print(topics)
"""
The Snowball Stemmer algorithm, also known as the Porter2 stemming algorithm,
is an algorithm for stemming words in natural language processing (NLP).
It's better than Porter1 stemming.
"""

['tesla', 'coil', 'an', 'electr', 'transform', 'that', 'use', 'highfrequ', 'altern', 'current', 'ac', 'to', 'increas', 'voltag', 'becaus', 'of', 'it', 'extrem', 'high', 'voltag', 'the', 'electr', 'in', 'a', 'tesla', 'coil', 'can', 'travel', 'through', 'the', 'air', 'poweringor', 'damagingnearbi', 'electron', 'devic', 'often', 'with', 'arc', 'of', 'lightninglik', 'electr', 'though', 'the', 'tesla', 'coil', 'produc', 'extrem', 'high', 'voltag', 'the', 'high', 'frequenc', 'of', 'the', 'current', 'general', 'make', 'it', 'possibl', 'for', 'most', 'peopl', 'to', 'approach', 'the', 'devic', 'and', 'even', 'be', 'struck', 'by', 'the', 'arc', 'without', 'suffer', 'injuri', 'the', 'spectacular', 'effect', 'creat', 'by', 'the', 'tesla', 'coil', 'make', 'the', 'devic', 'popular', 'for', 'scientif', 'exhibit', 'but', 'the', 'principl', 'under', 'the', 'coil', 'were', 'also', 'import', 'to', 'the', 'develop', 'of', 'radio', 'technolog', 'the', 'tesla', 'coil', 'was', 'invent', 'by', 'serbian', 'ame

"\nThe Snowball Stemmer algorithm, also known as the Porter2 stemming algorithm,\nis an algorithm for stemming words in natural language processing (NLP).\nIt's better than Porter1 stemming.\n"

# Removing Stopping words

In [None]:
stop_words = set(list(stop_words) + ['ac'])
topics = [word for word in topics if word.lower() not in (stop_words)]
print(topics)

['tesla', 'coil', 'electr', 'transform', 'use', 'highfrequ', 'altern', 'current', 'increas', 'voltag', 'becaus', 'extrem', 'high', 'voltag', 'electr', 'tesla', 'coil', 'travel', 'air', 'poweringor', 'damagingnearbi', 'electron', 'devic', 'often', 'arc', 'lightninglik', 'electr', 'though', 'tesla', 'coil', 'produc', 'extrem', 'high', 'voltag', 'high', 'frequenc', 'current', 'general', 'make', 'possibl', 'peopl', 'approach', 'devic', 'even', 'struck', 'arc', 'without', 'suffer', 'injuri', 'spectacular', 'effect', 'creat', 'tesla', 'coil', 'make', 'devic', 'popular', 'scientif', 'exhibit', 'principl', 'coil', 'also', 'import', 'develop', 'radio', 'technolog', 'tesla', 'coil', 'invent', 'serbian', 'american', 'inventor', 'nikola', 'tesla', 'tesla', 'primarili', 'interest', 'potenti', 'wireless', 'transmit', 'electr', 'particular', 'light', 'hope', 'build', 'larg', 'coil', 'scatter', 'across', 'earth', 'would', 'provid', 'power', 'ani', 'devic', 'receiv', 'coil', 'howev', 'littl', 'success'

# Getting unique words

In [None]:
indices = np.unique(topics, return_index=True)[1]
print(np.array(topics)[np.sort(indices)])

['tesla' 'coil' 'electr' 'transform' 'use' 'highfrequ' 'altern' 'current'
 'increas' 'voltag' 'becaus' 'extrem' 'high' 'travel' 'air' 'poweringor'
 'damagingnearbi' 'electron' 'devic' 'often' 'arc' 'lightninglik' 'though'
 'produc' 'frequenc' 'general' 'make' 'possibl' 'peopl' 'approach' 'even'
 'struck' 'without' 'suffer' 'injuri' 'spectacular' 'effect' 'creat'
 'popular' 'scientif' 'exhibit' 'principl' 'also' 'import' 'develop'
 'radio' 'technolog' 'invent' 'serbian' 'american' 'inventor' 'nikola'
 'primarili' 'interest' 'potenti' 'wireless' 'transmit' 'particular'
 'light' 'hope' 'build' 'larg' 'scatter' 'across' 'earth' 'would' 'provid'
 'power' 'ani' 'receiv' 'howev' 'littl' 'success' 'plan' 'gave' 'lectur'
 'demonstr' 'transmiss' 'propos' 'signal' 'conjunct' 'obtain' 'patent'
 'describ' 'consid' 'first' 'rais' 'lower' 'measur' 'generat' 'veri'
 'excess' 'one' 'million' 'volt' 'mean' 'circuit' 'chang' 'modern' 'usual'
 'consist' 'initi' 'boost' 'sourc' 'send' 'capacitor' 'attach' 