## Importing Libraries

In [1]:
import requests
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen
import logging
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anshk\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\anshk\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anshk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Link to the website used

In [2]:
url = 'https://www.healthline.com/directory/topics'

### Webpage Data Retrieval and Parsing

In [3]:
# Open a connection to the URL and read the page data
urlclient = urlopen(url)
page_data = urlclient.read()

# Parse the HTML content of the page using BeautifulSoup
page_data_html = bs(page_data, 'html.parser')

# Find all anchor elements with a specific CSS class
all_data_names = page_data_html.find_all("a", class_="css-1hacg05")

In [4]:
# Print the number of elements found
len(all_data_names)

150

### Webpage Data Scraping and Processing

In [5]:
data_list = []
    
# Iterate through the anchor elements and scrape data from each linked webpage
for i in all_data_names:
    page_link = i['href']  # Get the URL from the 'href' attribute
    page_data = requests.get(page_link)  # Send an HTTP request to the URL
    page_data_html = bs(page_data.text, "html.parser")  # Parse the HTML content of the linked webpage
    paragraphs = page_data_html.find_all('p')  # Find all 'p' elements (paragraphs)
    all_text = [para.get_text() for para in paragraphs]  # Extract text from paragraphs
    data_string = ''.join(all_text)  # Join the extracted text to create a single string
    data_list.append(data_string)  # Append the string to the data_list

### Text Analysis Functions

In [6]:
# Function to count sentences in a given text
def count_sentences(text):
    sentences = sent_tokenize(text)
    return len(sentences)

# Function to count words in a given text
def count_words(text):
    words = word_tokenize(text)
    return len(words)

# Function to count nouns in a given text using part-of-speech tagging
def count_nouns(text):
    words = word_tokenize(text)
    tagged_words = pos_tag(words)
    nouns = [word for word, pos in tagged_words if pos.startswith('NN')]
    return len(nouns)

# Function to count verbs in a given text using part-of-speech tagging
def count_verbs(text):
    words = word_tokenize(text)
    tagged_words = pos_tag(words)
    verbs = [word for word, pos in tagged_words if pos.startswith('VB')]
    return len(verbs)

# Function to count stopwords in a given text
def count_stopwords(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    stopwords_count = len([w for w in word_tokens if w.lower() in stop_words])
    return stopwords_count

### Store Webpage Statistics

In [7]:
# Create an empty list to store statistics for each webpage
statistics = []

# Iterating through the scraped data and calculate statistics
for page_text, page_link in zip(data_list, [i['href'] for i in all_data_names]):
    
    # Creating a dictionary to store statistics for the current webpage
    page_stats = {
        'page_link': page_link,                 # Store the webpage link in the dictionary
        'num_sentences': count_sentences(page_text),  # Calculate the number of sentences
        'num_words': count_words(page_text),        # Calculate the number of words
        'num_nouns': count_nouns(page_text),        # Calculate the number of nouns
        'num_verbs': count_verbs(page_text),        # Calculate the number of verbs
        'num_stopwords': count_stopwords(page_text)  # Calculate the number of stopwords
    }
    statistics.append(page_stats) 


In [8]:
import csv

filename = 'nlp_statistics.csv'

fields = ['page_link','num_sentences', 'num_words', 'num_nouns', 'num_verbs', 'num_stopwords']

# Writing to csv file
with open(filename, 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fields)

    writer.writeheader()
    writer.writerows(statistics)

In [9]:
import csv

# Calculating the average of each statistic
average_statistics = {
    'avg_num_sentences': sum(stat['num_sentences'] for stat in statistics) / len(statistics),
    'avg_num_words': sum(stat['num_words'] for stat in statistics) / len(statistics),
    'avg_num_nouns': sum(stat['num_nouns'] for stat in statistics) / len(statistics),
    'avg_num_verbs': sum(stat['num_verbs'] for stat in statistics) / len(statistics),
    'avg_num_stopwords': sum(stat['num_stopwords'] for stat in statistics) / len(statistics),
}


aggregated_results_file = 'aggregated_results.csv'

with open(aggregated_results_file, 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=average_statistics.keys())
    writer.writeheader()
    writer.writerow(average_statistics)

print(aggregated_results_file)


aggregated_results.csv
