# NLP Libraries

In [None]:
import os
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import spacy
import re, string, unicodedata
from nltk import word_tokenize, sent_tokenize
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from wordcloud import WordCloud

# Visual Libraries

In [None]:
import matplotlib.pyplot as plt
from textblob import TextBlob

# ML Libraries

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Fetch the NLTK resources used later in this notebook: tokenizer data,
# the stopword lists and WordNet (for the lemmatizer).
# 'punkt_tab' is the tokenizer data that newer NLTK releases look up for
# word_tokenize; 'punkt' is kept for older releases.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

In [None]:
# Load the spaCy pipeline that the NER cell below calls as `nlp`
nlp = spacy.load("en_core_web_sm") 

In [None]:
# Parse the XML file and keep a handle on its root element
tree = ET.parse("covid_data.xml")
root = tree.getroot()

In [None]:
# Bare expression: shows the parsed tree object as the notebook cell output
tree 

In [None]:
# Bare expression: shows the root element as the notebook cell output
root

In [None]:

# Serialize the XML tree to text.
# NOTE: this rebinds `root` from the parsed element to a str, so Element
# methods (findall, find, ...) are unavailable on `root` after this cell.
root=ET.tostring(root, encoding='utf8').decode('utf8')

In [None]:
# Print the serialized XML text
print(root)

In [None]:
def strip_html(text):
    """Return the text content of ``text`` with HTML markup removed.

    The input is parsed with BeautifulSoup's "html.parser" backend and the
    concatenated text of the resulting document is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

In [None]:
# Suppress warning output (repeats the earlier warnings cell)
import warnings
warnings.filterwarnings('ignore')

In [None]:
def remove_between_square_brackets(text):
    """Return ``text`` with every ``[...]`` span (brackets included) removed.

    A span runs from an opening ``[`` to the first ``]`` that follows it, so
    an unmatched ``[`` is left in place.
    """
    bracketed_span = re.compile(r'\[[^]]*\]')
    return bracketed_span.sub('', text)

In [None]:
def denoise_text(text):
    """Strip HTML markup and bracketed spans from ``text`` and tidy whitespace.

    NOTE: a later cell defines ``denoise_text`` again; once that cell has
    run, its definition is the one in effect.
    """
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    # Collapse every whitespace run to a single space. The previous
    # re.sub('  ', '', text) deleted double spaces outright, which glued the
    # neighbouring words together.
    text = re.sub(r'\s+', ' ', text)
    return text

In [None]:
def denoise_text(text):
    """Clean raw article text.

    Removes HTML markup, drops ``[...]`` spans, then collapses each run of
    whitespace into a single space. Returns the cleaned string.
    """
    without_markup = remove_between_square_brackets(strip_html(text))
    return re.sub(r'\s+', ' ', without_markup)

In [None]:
# Run the cleaner over `root` (the serialized XML string when the cells
# above are executed in order)
sample = denoise_text(root)

In [None]:
# Print the cleaned text
print(sample)

In [None]:
def preprocess_text(text):
    """Lowercase, tokenize, filter and lemmatize ``text``.

    Keeps only purely alphabetic tokens that are not English stopwords,
    lemmatizes each one with WordNetLemmatizer, and returns the result
    joined by single spaces.
    """
    # Load the stopword list once and use a set for O(1) membership tests;
    # previously the list was re-read and scanned for every single token.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text.lower())
    return " ".join(
        lemmatizer.lemmatize(tok)
        for tok in tokens
        if tok.isalpha() and tok not in stop_words
    )

In [None]:
# Re-parse the file so that `root` is an Element again (an earlier cell
# rebinds `root` to the serialized XML string).
tree = ET.parse("covid_data.xml")
root = tree.getroot()    # <-- keep this as XML, do NOT convert to string

# One cleaned, preprocessed string per <article> child of the root
documents = []
for article in root.findall('article'):
    body_node = article.find('body')
    # Element.text is None for an empty <body/>, so fall back to "" both
    # when the element is missing and when it has no text.
    body = (body_node.text or "") if body_node is not None else ""
    clean_text = denoise_text(body)
    preprocessed = preprocess_text(clean_text)
    documents.append(preprocessed)

## NER

In [None]:
print("=== NER ===")
# nlp.pipe processes the texts in batches and yields the Doc objects in
# input order, instead of invoking the pipeline once per document.
# NOTE(review): `documents` holds lowercased, stopword-filtered, lemmatized
# text at this point; entity recognition usually benefits from the original
# casing and context -- consider running it on the unprocessed bodies.
for i, doc in enumerate(nlp.pipe(documents)):
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    print(f"\nArticle {i+1}: {entities}")

# TF-IDF

In [None]:
# Fit a TF-IDF vectorizer (capped at 50 features) on the preprocessed
# documents and print the resulting document-term matrix
tfidf = TfidfVectorizer(max_features=50)
tfidf_matrix = tfidf.fit_transform(documents)
print(tfidf_matrix)

In [None]:
# List the terms the TF-IDF vectorizer kept as features
print("\n=== TF-IDF Features ===")
print(tfidf.get_feature_names_out())

# Bag of Words (BoW)

In [None]:
# Fit a count vectorizer (capped at 50 features) on the preprocessed
# documents and print the resulting document-term matrix
bow = CountVectorizer(max_features=50)
bow_matrix = bow.fit_transform(documents)
print(bow_matrix)

In [None]:
# List the terms the count vectorizer kept as features
print("\n=== BoW Features ===")
print(bow.get_feature_names_out())

In [None]:
# Concatenate every preprocessed document into one space-separated string
# (input for the word cloud below)
all_text = " ".join(documents)

In [None]:
# Build a 1000x600 word cloud on a white background from the combined text
wordcloud = WordCloud(width=1000, height=600, background_color="white").generate(all_text)

In [None]:
# Render the word cloud as an image, without axes, under a "WordCloud" title
plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("WordCloud")
plt.show()

In [None]:
print("\n=== SENTIMENT ANALYSIS ===")
# Score each preprocessed document with TextBlob's polarity and report it
# under a 1-based article number.
polarities = (TextBlob(article_text).sentiment.polarity for article_text in documents)
for i, sentiment in enumerate(polarities):
    print(f"Article {i+1} Sentiment Score: {sentiment}")

In [None]:
# Placeholder targets: 1 for even-indexed documents, 0 for odd-indexed ones
labels = [int(i % 2 == 0) for i in range(len(documents))]  # dummy binary labels

In [None]:
# Hold out 30% of the TF-IDF rows (and their labels) for testing;
# random_state pins the shuffle so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, labels, test_size=0.3, random_state=42)

In [None]:
# Fit an SVC with default settings on the training split, then predict
# labels for the held-out split
svm_model = SVC()
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)

In [None]:
# Report accuracy and the per-class classification report for the SVM
# predictions on the held-out split
print("\n=== SVM Accuracy ===")
print(accuracy_score(y_test, svm_pred))
print(classification_report(y_test, svm_pred))

In [None]:
# Fit a logistic regression (iteration cap raised to 200) on the training
# split, then predict labels for the held-out split
log_model = LogisticRegression(max_iter=200)
log_model.fit(X_train, y_train)
log_pred = log_model.predict(X_test)

In [None]:
# Report accuracy and the per-class classification report for the
# logistic regression predictions on the held-out split
print("\n=== Logistic Regression Accuracy ===")
print(accuracy_score(y_test, log_pred))
print(classification_report(y_test, log_pred))