In [10]:
import requests
import re
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [9]:
import nltk
nltk.download()


NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


True

In [3]:
url = "https://en.wikipedia.org/wiki/French_Revolution"
response = requests.get(url)

#Fetch the html content
html_content = response.content

#Parse the html content
soup = BeautifulSoup(html_content,'html.parser')

In [4]:
#Extracting all paragraphs
paragraphs = soup.find_all('p')

# Extracting text from each paragraph
paragraph_text  = [paragrpah.get_text() for paragrpah in paragraphs]
print(paragraph_text)

['\n', '\n', 'The French Revolution[a] was a period of political and societal change in France that began with the Estates General of 1789, and ended with the coup of 18 Brumaire on November 1799 and the formation of the French Consulate. Many of its ideas are considered fundamental principles of liberal democracy,[1] while its values and institutions remain central to modern French political discourse.[2]\n', 'The causes are generally agreed to be a combination of social, political and economic factors, which the Ancien Régime proved unable to manage. A financial crisis and widespread social distress led, in May 1789, to the convocation of the Estates General which was converted into a National Assembly in June. The Storming of the Bastille on 14 July led to a series of radical measures by the Assembly, among them the abolition of feudalism, state control over the Catholic Church in France, and a declaration of rights.\n', 'The next three years were dominated by the struggle for polit

In [5]:
#converting everything into lowercase
lowercase  = [text.lower() for text in paragraph_text]
print(lowercase)

['\n', '\n', 'the french revolution[a] was a period of political and societal change in france that began with the estates general of 1789, and ended with the coup of 18 brumaire on november 1799 and the formation of the french consulate. many of its ideas are considered fundamental principles of liberal democracy,[1] while its values and institutions remain central to modern french political discourse.[2]\n', 'the causes are generally agreed to be a combination of social, political and economic factors, which the ancien régime proved unable to manage. a financial crisis and widespread social distress led, in may 1789, to the convocation of the estates general which was converted into a national assembly in june. the storming of the bastille on 14 july led to a series of radical measures by the assembly, among them the abolition of feudalism, state control over the catholic church in france, and a declaration of rights.\n', 'the next three years were dominated by the struggle for polit

In [6]:
#Remove the special characters
cleaned_text = [re.sub(r'[^a-zA-Z0-9\s]', '', text) for text in lowercase]
print(cleaned_text)

['\n', '\n', 'the french revolutiona was a period of political and societal change in france that began with the estates general of 1789 and ended with the coup of 18 brumaire on november 1799 and the formation of the french consulate many of its ideas are considered fundamental principles of liberal democracy1 while its values and institutions remain central to modern french political discourse2\n', 'the causes are generally agreed to be a combination of social political and economic factors which the ancien rgime proved unable to manage a financial crisis and widespread social distress led in may 1789 to the convocation of the estates general which was converted into a national assembly in june the storming of the bastille on 14 july led to a series of radical measures by the assembly among them the abolition of feudalism state control over the catholic church in france and a declaration of rights\n', 'the next three years were dominated by the struggle for political control exacerba

In [None]:
'''Tokenization, in the realm of Natural Language Processing (NLP) and machine learning, refers to the process of converting a sequence of text into
smaller parts, known as tokens. These tokens can be as small as characters or as long as words.'''

In [7]:
tokenized_text = [word_tokenize(text) for text in cleaned_text]
print(tokenized_text)

[[], [], ['the', 'french', 'revolutiona', 'was', 'a', 'period', 'of', 'political', 'and', 'societal', 'change', 'in', 'france', 'that', 'began', 'with', 'the', 'estates', 'general', 'of', '1789', 'and', 'ended', 'with', 'the', 'coup', 'of', '18', 'brumaire', 'on', 'november', '1799', 'and', 'the', 'formation', 'of', 'the', 'french', 'consulate', 'many', 'of', 'its', 'ideas', 'are', 'considered', 'fundamental', 'principles', 'of', 'liberal', 'democracy1', 'while', 'its', 'values', 'and', 'institutions', 'remain', 'central', 'to', 'modern', 'french', 'political', 'discourse2'], ['the', 'causes', 'are', 'generally', 'agreed', 'to', 'be', 'a', 'combination', 'of', 'social', 'political', 'and', 'economic', 'factors', 'which', 'the', 'ancien', 'rgime', 'proved', 'unable', 'to', 'manage', 'a', 'financial', 'crisis', 'and', 'widespread', 'social', 'distress', 'led', 'in', 'may', '1789', 'to', 'the', 'convocation', 'of', 'the', 'estates', 'general', 'which', 'was', 'converted', 'into', 'a', 'na

In [None]:
'''
Stop words are a set of commonly used words in a language. Examples of stop words in English are “a,” “the,” “is,” “are,” etc.
Stop words are commonly used in Text Mining and Natural Language Processing (NLP) to eliminate words that are so widely used that they carry
very little useful information.
'''

In [13]:
#Remove Stop_words - means removing a, an, the from the sentence
stop_words = set(stopwords.words('english'))
#print(stop_words)

filtered_text = [[word for word in tokens if word not in stop_words] for tokens in tokenized_text]
print(filtered_text)

{'ours', 'is', 'herself', 'from', 'should', 'about', 'under', 'itself', 'been', 'all', 'each', 'before', 'but', 'hasn', 'which', "hasn't", 'shan', 'were', 'other', 'few', 'll', 'me', "weren't", 'how', 'ourselves', 'isn', 'or', 'by', 'y', 'below', 'ain', 'between', 'ma', 'o', "didn't", 'of', 'mightn', 'needn', 'where', "you'll", 'there', 'couldn', 'over', "couldn't", 'them', 'again', 'with', 'has', 'no', 'as', 'didn', 'theirs', 'he', 'further', "hadn't", 'am', 'myself', 'his', 'into', 'such', 'wouldn', 'you', "that'll", 'for', 'yourselves', 'yourself', "should've", 's', "isn't", 'd', 'did', "mustn't", 'only', 'its', 'our', "you've", 'they', 'through', 'and', 'during', 'to', "doesn't", 'down', 'this', 'shouldn', 'hers', 'she', 'so', 'be', "aren't", 'then', 'when', "don't", 'don', 'until', "won't", 'whom', 'was', 'himself', 'a', 'aren', "shan't", 'more', 'being', 'having', 'will', 'very', 'nor', "mightn't", 'if', 'off', 'most', "you'd", 'it', 'than', 'above', 'at', 'some', 'both', 'now', 

In [14]:
# Stemming - Creating, Creative, Created = Create
stemmer = PorterStemmer()
stemmed_text = [[stemmer.stem(word) for word in tokens] for tokens in filtered_text]
print(stemmed_text)

[[], [], ['french', 'revolutiona', 'period', 'polit', 'societ', 'chang', 'franc', 'began', 'estat', 'gener', '1789', 'end', 'coup', '18', 'brumair', 'novemb', '1799', 'format', 'french', 'consul', 'mani', 'idea', 'consid', 'fundament', 'principl', 'liber', 'democracy1', 'valu', 'institut', 'remain', 'central', 'modern', 'french', 'polit', 'discourse2'], ['caus', 'gener', 'agre', 'combin', 'social', 'polit', 'econom', 'factor', 'ancien', 'rgime', 'prove', 'unabl', 'manag', 'financi', 'crisi', 'widespread', 'social', 'distress', 'led', 'may', '1789', 'convoc', 'estat', 'gener', 'convert', 'nation', 'assembl', 'june', 'storm', 'bastil', '14', 'juli', 'led', 'seri', 'radic', 'measur', 'assembl', 'among', 'abolit', 'feudal', 'state', 'control', 'cathol', 'church', 'franc', 'declar', 'right'], ['next', 'three', 'year', 'domin', 'struggl', 'polit', 'control', 'exacerb', 'econom', 'depress', 'seri', 'militari', 'defeat', 'follow', 'outbreak', 'french', 'revolutionari', 'war', 'april', '1792', 

In [15]:
# Remove empty tokens
final_text = [[word for word in tokens if word.strip()] for tokens in stemmed_text]
print(final_text)

[[], [], ['french', 'revolutiona', 'period', 'polit', 'societ', 'chang', 'franc', 'began', 'estat', 'gener', '1789', 'end', 'coup', '18', 'brumair', 'novemb', '1799', 'format', 'french', 'consul', 'mani', 'idea', 'consid', 'fundament', 'principl', 'liber', 'democracy1', 'valu', 'institut', 'remain', 'central', 'modern', 'french', 'polit', 'discourse2'], ['caus', 'gener', 'agre', 'combin', 'social', 'polit', 'econom', 'factor', 'ancien', 'rgime', 'prove', 'unabl', 'manag', 'financi', 'crisi', 'widespread', 'social', 'distress', 'led', 'may', '1789', 'convoc', 'estat', 'gener', 'convert', 'nation', 'assembl', 'june', 'storm', 'bastil', '14', 'juli', 'led', 'seri', 'radic', 'measur', 'assembl', 'among', 'abolit', 'feudal', 'state', 'control', 'cathol', 'church', 'franc', 'declar', 'right'], ['next', 'three', 'year', 'domin', 'struggl', 'polit', 'control', 'exacerb', 'econom', 'depress', 'seri', 'militari', 'defeat', 'follow', 'outbreak', 'french', 'revolutionari', 'war', 'april', '1792', 

In [16]:
# Convert tokens back to sentences
sentences = [' '.join(tokens) for tokens in final_text]
print(sentences)

['', '', 'french revolutiona period polit societ chang franc began estat gener 1789 end coup 18 brumair novemb 1799 format french consul mani idea consid fundament principl liber democracy1 valu institut remain central modern french polit discourse2', 'caus gener agre combin social polit econom factor ancien rgime prove unabl manag financi crisi widespread social distress led may 1789 convoc estat gener convert nation assembl june storm bastil 14 juli led seri radic measur assembl among abolit feudal state control cathol church franc declar right', 'next three year domin struggl polit control exacerb econom depress seri militari defeat follow outbreak french revolutionari war april 1792 result insurrect 10 august 1792 monarchi abolish replac french first republ septemb loui xvi execut januari 1793', 'anoth parisbas revolt june 1793 constitut suspend effect polit power pass nation convent committe public safeti estim 16000 execut subsequ reign terror end juli 1794 weaken extern threat i

In [18]:
# Convert sentences back to paragraphs
processed_paragraphs = '\n\n'.join(sentences)

In [19]:
#Save processed text
with open('processed_text.txt', 'w', encoding='utf-8') as file:
    file.write(processed_paragraphs)

print(processed_paragraphs)






french revolutiona period polit societ chang franc began estat gener 1789 end coup 18 brumair novemb 1799 format french consul mani idea consid fundament principl liber democracy1 valu institut remain central modern french polit discourse2

caus gener agre combin social polit econom factor ancien rgime prove unabl manag financi crisi widespread social distress led may 1789 convoc estat gener convert nation assembl june storm bastil 14 juli led seri radic measur assembl among abolit feudal state control cathol church franc declar right

next three year domin struggl polit control exacerb econom depress seri militari defeat follow outbreak french revolutionari war april 1792 result insurrect 10 august 1792 monarchi abolish replac french first republ septemb loui xvi execut januari 1793

anoth parisbas revolt june 1793 constitut suspend effect polit power pass nation convent committe public safeti estim 16000 execut subsequ reign terror end juli 1794 weaken extern threat intern opposi