In [6]:
Paragraph = """
The Bank of England is the central bank of the United Kingdom and the model on which most modern central banks have been based. 
Established in 1694 to act as the English Government's banker and debt manager, and still one of the bankers for the government 
of the United Kingdom, it is the world's second oldest central bank.[3]

The bank was privately owned by stockholders from its foundation in 1694 until it was nationalised in 1946 by the Attlee 
ministry.[4] In 1998 it became an independent public organisation, wholly owned by the Treasury Solicitor on behalf of the 
government,[5] with a mandate to support the economic policies of the government of the day,[6] but independence in maintaining 
price stability.[7] In the 21st century the bank took on increased responsibility for maintaining and monitoring financial 
stability in the UK, and it increasingly functions as a statutory regulator.[8]
"""

In [7]:
print(Paragraph)


The Bank of England is the central bank of the United Kingdom and the model on which most modern central banks have been based. Established in 1694 to act as the English Government's banker and debt manager, and still one of the bankers for the government of the United Kingdom, it is the world's second oldest central bank.[3]

The bank was privately owned by stockholders from its foundation in 1694 until it was nationalised in 1946 by the Attlee ministry.[4] In 1998 it became an independent public organisation, wholly owned by the Treasury Solicitor on behalf of the government,[5] with a mandate to support the economic policies of the government of the day,[6] but independence in maintaining price stability.[7] In the 21st century the bank took on increased responsibility for maintaining and monitoring financial stability in the UK, and it increasingly functions as a statutory regulator.[8]



In [8]:
import nltk
import re
import numpy as np
import warnings

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
warnings.filterwarnings("ignore")

In [11]:
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [20]:
# Tokenize Sentences
sentences = sent_tokenize(Paragraph)

# Stopwords fallback
try:
    stop_words = set(stopwords.words("english"))
except:
    nltk.download('stopwords')
    stop_words = set(stopwords.words("english"))

In [21]:
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [23]:
# Preprocessing text
corpus = []
for sentence in sentences:
    review = re.sub(r'[^a-zA-Z]', ' ', sentence)  # Remove non-alphabetic characters
    review = review.lower()  # Convert to lowercase
    review = review.split()  # Tokenize words
    
    # Stemming
    review = [ps.stem(word) for word in review if word not in stop_words]
    
    # Join words back into a single string
    review = ' '.join(review)
    
    corpus.append(review)

# Creating the Bag of Words (BoW) model
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()

# Print results
print("Bag of Words Model (First 5 Rows):")
print(X[:])

Bag of Words Model (First 5 Rows):
[[0 0 3 0 1 0 0 2 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
 [1 0 1 2 0 0 0 1 0 0 1 0 0 1 1 0 0 0 2 0 0 0 1 0 1 0 0 0 0 0 0 1 1 0 0 0
  0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1]
 [0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0
  0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 1 0 0 1 0 1 0 0 0 0 0 0 2 0 0 2 0 1 0 1 0 0 0 0 0 0 0 1 1 1
  1 0 1 0 0 0 1 0 1 0 0 0 1 0 1 0 0 1 0]
 [0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 1 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0
  0 0 0 1 1 0 0 1 1 1 0 0 0 1 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
