In [1]:
import nltk 
               
# Cleaning the texts
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer


In [2]:
# Raw sample text about COVID-19 that the following cells split into sentences
# and clean. It is kept verbatim: it still contains bracketed reference markers
# (e.g. [7]) and line breaks in the middle of sentences, both of which show up
# in the tokenized output.
paragraph =  """Coronavirus disease 2019 (COVID-19) is a contagious disease caused by the coronavirus SARS-CoV-2.
The first known case was identified in Wuhan, China, in December 2019.
[7] The disease quickly spread worldwide, resulting in the ongoing COVID-19 pandemic.
The symptoms of COVID‑19 are variable but often include fever,[8] fatigue, cough,
breathing difficulties, loss of smell, and loss of taste.[9][10][11] 
Symptoms may begin one to fourteen days after exposure to the virus. 
At least a third of people who are infected do not develop noticeable symptoms.
[12][13] Of those who develop symptoms noticeable enough to be classified as patients, 
most (81%) develop mild to moderate symptoms (up to mild pneumonia), 
while 14% develop severe symptoms (dyspnea, hypoxia, or more than 50% lung involvement on imaging),
 and 5% develop critical symptoms (respiratory failure, shock, or multiorgan dysfunction).
 [14] Older people are at a higher risk of developing severe symptoms. 
 Some complications result in death. Some people continue to experience a range of effects 
 (long COVID) for months or years after infection, and damage to organs has been observed.
 [15] Multi-year studies are underway to further investigate the long-term effects of the disease.
[16]
COVID‑19 transmission occurs when infectious particles are breathed in or come into contact 
with the eyes, nose, or mouth. The risk is highest when people are in close proximity, 
but small airborne particles containing the virus can remain suspended in the air and travel 
over longer distances, particularly indoors. Transmission can also occur when people touch 
their eyes, nose or mouth after touching surfaces or objects that have been contaminated by 
the virus. People remain contagious for up to 20 days and can spread the virus even if they 
do not develop symptoms.[17]
Testing methods for COVID-19 to detect the virus's nucleic acid include real-time reverse 
transcription polymerase chain reaction (RT‑PCR),[18][19] transcription-mediated amplification,
[18][19][20] and reverse transcription loop-mediated isothermal amplification (RT‑LAMP)[18][19] 
from a nasopharyngeal swab.[21]
Several COVID-19 vaccines have been approved and distributed in various countries, 
many of which have initiated mass vaccination campaigns. Other preventive measures 
include physical or social distancing, quarantining, ventilation of indoor spaces, 
use of face masks or coverings in public, covering coughs and sneezes, hand washing, 
and keeping unwashed hands away from the face. While drugs have been developed to inhibit 
the virus, the primary treatment is still symptomatic, managing the disease through supportive 
care, isolation, and experimental measures."""


# Shared state for the cleaning step.
# Lemmatization (WordNet) is used for word normalization; the PorterStemmer
# alternative is not instantiated in this cell.
sentences = nltk.sent_tokenize(paragraph)  # split the raw text into sentences
wordnet = WordNetLemmatizer()              # lemmatizer applied to each kept word
corpus = list()                            # receives one cleaned string per sentence

In [3]:
# Display the sentence list produced by nltk.sent_tokenize(paragraph).
sentences

['Coronavirus disease 2019 (COVID-19) is a contagious disease caused by the coronavirus SARS-CoV-2.',
 'The first known case was identified in Wuhan, China, in December 2019.',
 '[7] The disease quickly spread worldwide, resulting in the ongoing COVID-19 pandemic.',
 'The symptoms of COVID‑19 are variable but often include fever,[8] fatigue, cough,\nbreathing difficulties, loss of smell, and loss of taste.',
 '[9][10][11] \nSymptoms may begin one to fourteen days after exposure to the virus.',
 'At least a third of people who are infected do not develop noticeable symptoms.',
 '[12][13] Of those who develop symptoms noticeable enough to be classified as patients, \nmost (81%) develop mild to moderate symptoms (up to mild pneumonia), \nwhile 14% develop severe symptoms (dyspnea, hypoxia, or more than 50% lung involvement on imaging),\n and 5% develop critical symptoms (respiratory failure, shock, or multiorgan dysfunction).',
 '[14] Older people are at a higher risk of developing severe

In [4]:
# Clean every sentence into a lower-cased, stop-word-free, lemmatized string.
#
# The English stop-word set is built once up front; building it inside the
# comprehension re-read the stop-word list for every single word.
english_stopwords = set(stopwords.words('english'))

# Start from an empty list in this cell so that re-running it does not append
# a second copy of every sentence to an already-filled `corpus`.
corpus = []
for sentence in sentences:
    # Replace everything that is not an ASCII letter (digits, punctuation,
    # bracketed reference markers, non-ASCII hyphens) with a space.
    letters_only = re.sub('[^a-zA-Z]', ' ', sentence)
    tokens = letters_only.lower().split()
    lemmas = [wordnet.lemmatize(token) for token in tokens
              if token not in english_stopwords]
    corpus.append(' '.join(lemmas))


In [5]:
# Display the cleaned sentences (one space-joined string of lemmas per sentence).
corpus

['coronavirus disease covid contagious disease caused coronavirus sars cov',
 'first known case identified wuhan china december',
 'disease quickly spread worldwide resulting ongoing covid pandemic',
 'symptom covid variable often include fever fatigue cough breathing difficulty loss smell loss taste',
 'symptom may begin one fourteen day exposure virus',
 'least third people infected develop noticeable symptom',
 'develop symptom noticeable enough classified patient develop mild moderate symptom mild pneumonia develop severe symptom dyspnea hypoxia lung involvement imaging develop critical symptom respiratory failure shock multiorgan dysfunction',
 'older people higher risk developing severe symptom',
 'complication result death',
 'people continue experience range effect long covid month year infection damage organ observed',
 'multi year study underway investigate long term effect disease',
 'covid transmission occurs infectious particle breathed come contact eye nose mouth',
 'risk

In [6]:
# Bag of Words model: learn the vocabulary from the cleaned sentences and
# count how often each term occurs in each of them.
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
term_counts = cv.fit_transform(corpus)  # sparse document-term matrix
X = term_counts.toarray()               # dense array, one row per entry of `corpus`

In [9]:
# Display the dense array returned by CountVectorizer.fit_transform(corpus).toarray().
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])