Import Required Libraries

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import re
import os

Download NLTK resources

In [2]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Load input data

In [3]:
input_df = pd.read_excel("input.xlsx")

Load positive and negative word lists

In [4]:
with open("positive-words.txt", "r") as f:
    positive_words = f.read().splitlines()

with open("negative-words.txt", "r") as f:
    negative_words = f.read().splitlines()

Load stop words

In [5]:
stop_words = set(stopwords.words('english'))
stopwords_dir = "StopWords"  # here StopWords files are in a directory named "StopWords"

for filename in os.listdir(stopwords_dir):
    if filename.startswith("StopWords_") and filename.endswith(".txt"):
        with open(os.path.join(stopwords_dir, filename), "r") as f:
            stop_words.update(f.read().splitlines())

Function to extract article text

In [6]:
def extract_article_text(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check for HTTP errors
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find article title and text (this may need to be adjusted based on website structure)
        title = soup.find('h1').get_text().strip()
        article_content = soup.find('div', class_='td-post-content').get_text().strip()

        return title + '\n\n' + article_content 
    except requests.exceptions.RequestException as e:
        print(f"{url_id}: {e}")
        return ""

Function to calculate syllables

In [7]:
def count_syllables(word):
    word = word.lower()
    count = 0
    vowels = 'aeiouy'
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith('e'):# silent 'e' at the end of a word does not contribute to syllable count. (common rule in english)
        count -= 1
    if word.endswith('le') and len(word) > 2 and word[-3] not in vowels: # ex: word = able has 2 syllables.
        count += 1
    return count

Function to perform text analysis

In [9]:
def analyze_text(text):
    if not text:  # Handle empty text case
        return [0] * 13  # Return zeros for all variables

    # Tokenize text
    sentences = sent_tokenize(text)
    words = word_tokenize(text)
    cleaned_words = [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words]

    # Sentiment analysis
    positive_score = sum([1 for word in cleaned_words if word in positive_words])
    negative_score = sum([1 for word in cleaned_words if word in negative_words])
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (len(cleaned_words) + 0.000001)

    # Readability
    avg_sentence_length = len(words) / len(sentences)
    complex_word_count = sum([1 for word in cleaned_words if count_syllables(word) > 2])
    percentage_complex_words = complex_word_count / len(cleaned_words)
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    # Other variables
    word_count = len(cleaned_words)
    syllable_per_word = ([count_syllables(word) for word in cleaned_words])
    personal_pronouns = len(re.findall(r'\b(I|we|my|ours|us)\b', text, re.IGNORECASE))  # Count personal pronouns
    avg_word_length = sum(len(word) for word in cleaned_words) / word_count

    return [
        positive_score, negative_score, polarity_score, subjectivity_score,
        avg_sentence_length, percentage_complex_words, fog_index,
        avg_sentence_length,  # AVG NUMBER OF WORDS PER SENTENCE is the same as AVG SENTENCE LENGTH
        complex_word_count, word_count, syllable_per_word,
        personal_pronouns, avg_word_length
    ]

Process each URL

In [10]:
output_data = []
for index, row in input_df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    
    # Extract text and save to file
    article_text = extract_article_text(url)
    with open(f"{url_id}.txt", "w", encoding="utf-8") as f:
        f.write(article_text)

    # Analyze text
    analysis_results = analyze_text(article_text)

    # Combine input data and analysis results
    output_data.append([url_id, url] + analysis_results)

blackassign0036: 404 Client Error: Not Found for url: https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/
blackassign0049: 404 Client Error: Not Found for url: https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/


Create output DataFrame

In [11]:
output_df = pd.DataFrame(output_data, columns=[
    'URL_ID', 'URL', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE',
    'SUBJECTIVITY SCORE', 'AVG SENTENCE'   'LENGTH', 'PERCENTAGE OF COMPLEX WORDS',
    'FOG INDEX', 'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT',
    'WORD COUNT', 'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH'
])

Save output to Excel file

In [12]:
output_df.to_excel("Output Data Structure.xlsx", index=False)

print("Text extraction and analysis complete. Output saved to 'Output Data Structure.xlsx'")

Text extraction and analysis complete. Output saved to 'Output Data Structure.xlsx'
