In [1]:
#!pip install syllables

import requests
from bs4 import BeautifulSoup
import pandas as pd
from textblob import TextBlob
import nltk
import re
import syllables  # Add this line for syllable counting

# Download the NLTK data packages this notebook relies on.
# nltk.download() reports packages that are already installed as up to date,
# so re-running this cell is cheap.
# 'punkt'     : tokenizer models (presumably what TextBlob's word/sentence
#               splitting uses - TODO confirm).
# 'stopwords' : stop-word corpus. NOTE(review): not referenced by any cell
#               shown in this notebook - confirm it is still needed.
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\1pava\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\1pava\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Explanation:
beautifulsoup4: For parsing HTML content from websites.

requests: To make HTTP requests and download webpage content.

openpyxl: To read and write data to Excel files; pandas uses it as the engine behind `read_excel` and `to_excel` for `.xlsx` workbooks.

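nltk: For natural language processing tasks like sentiment analysis and text preprocessing; in this notebook it supplies the `punkt` tokenizer data and the `stopwords` corpus downloaded in the first cell.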

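textstat: For calculating readability statistics like FOG Index and average sentence length. Note that the import cell above does not import textstat; the readability metrics below are computed directly from TextBlob tokens and `syllables` estimates.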

In [2]:
# Function to clean and preprocess text
def clean_text(text):
    """Normalise raw article text for word- and sentence-level analysis.

    Every character that is not a word character, whitespace, or one of the
    sentence terminators ``.``, ``!`` and ``?`` is replaced by a space, runs
    of whitespace are collapsed to a single space, and the result is
    lower-cased.

    The sentence terminators are kept on purpose: the cleaned text is later
    split into sentences, and removing them would collapse every document
    into a single sentence, making the average sentence length equal to the
    total word count.

    Args:
        text: Raw text (str).

    Returns:
        The cleaned, lower-cased text (str).
    """
    # Drop punctuation and symbols, but keep . ! ? so sentence boundaries survive.
    text = re.sub(r'[^\w\s.!?]', ' ', text)
    # Collapse any run of whitespace (including newlines and tabs) into one space.
    text = re.sub(r'\s+', ' ', text)
    return text.lower()

# Function to calculate derived variables
def calculate_derived_variables(text):
    """Compute sentiment and readability metrics for one document.

    Relies on the module-level names ``positive_words``, ``negative_words``
    and ``personal_pronouns_list`` being defined before it is called.

    Args:
        text: Document text (str). Words are compared to the lexicons in
            lower case.

    Returns:
        A 13-tuple, in this order:
            positive_score           - number of words found in positive_words
            negative_score           - number of words found in negative_words
            polarity_score           - (pos - neg) / (pos + neg + 1e-6)
            subjectivity_score       - (pos + neg) / (word_count + 1e-6)
            avg_sentence_length      - words / sentences
            percentage_complex_words - complex words / words (a 0..1 fraction)
            fog_index                - 0.4 * (avg_sentence_length + percentage_complex_words)
            avg_words_per_sentence   - words / sentences
            complex_word_count       - words with more than two estimated syllables
            word_count               - number of words
            syllable_per_word        - total estimated syllables / words
            personal_pronouns        - number of words found in personal_pronouns_list
            avg_word_length          - total characters / words

        If the text is empty or contains no words or sentences, every value
        is zero (the ratios would otherwise divide by zero).
    """
    # Result for documents with nothing to analyse.
    empty_result = (0, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0, 0.0, 0, 0.0)
    if not text or not text.strip():
        return empty_result

    # Perform text analysis using TextBlob
    blob = TextBlob(text)
    words = blob.words
    word_count = len(words)
    sentence_count = len(blob.sentences)
    if word_count == 0 or sentence_count == 0:
        # e.g. text made only of punctuation: no denominators to divide by.
        return empty_result

    lowered_words = [word.lower() for word in words]
    # Estimate syllables once per word; several metrics below reuse the counts.
    syllable_counts = [syllables.estimate(word) for word in words]

    # Sentiment Analysis
    positive_score = sum(1 for word in lowered_words if word in positive_words)
    negative_score = sum(1 for word in lowered_words if word in negative_words)
    # The small constant keeps the ratios defined when no sentiment words occur.
    polarity_score = (positive_score - negative_score) / (positive_score + negative_score + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (word_count + 0.000001)

    # Readability Analysis
    complex_word_count = sum(1 for count in syllable_counts if count > 2)
    avg_sentence_length = word_count / sentence_count
    percentage_complex_words = complex_word_count / word_count
    # NOTE(review): percentage_complex_words is a 0..1 fraction here; the
    # classic Gunning fog formula uses a 0..100 percentage - confirm which
    # definition is intended before changing it.
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    avg_words_per_sentence = avg_sentence_length

    # Additional Variables
    syllable_per_word = sum(syllable_counts) / word_count
    personal_pronouns = sum(1 for word in lowered_words if word in personal_pronouns_list)
    avg_word_length = sum(len(word) for word in words) / word_count

    return positive_score, negative_score, polarity_score, subjectivity_score, \
           avg_sentence_length, percentage_complex_words, fog_index, avg_words_per_sentence, \
           complex_word_count, word_count, syllable_per_word, personal_pronouns, avg_word_length


In [3]:
# Read the list of articles to process from the input workbook.
# Later cells read the 'URL_ID' and 'URL' columns of `df` and its column index.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run on another machine without editing it.
input_file_path = r"C:\Users\1pava\Documents\ALL PROJECTS\Web Scrapping\Output Data Structure.xlsx"
df = pd.read_excel(input_file_path)

In [4]:
# Load Positive and Negative words
def load_word_set(path):
    """Read a one-entry-per-line word list into a set of normalised words.

    Each line is stripped of surrounding whitespace and lower-cased, and blank
    lines are skipped. The scoring code compares lower-cased tokens against
    these sets, so an entry with a trailing space or a capital letter would
    otherwise never match.

    Args:
        path: Path of the text file to read.

    Returns:
        A set of str.
    """
    with open(path, 'r') as file:
        return {line.strip().lower() for line in file if line.strip()}


negative_words = load_word_set(r"C:\Users\1pava\Documents\ALL PROJECTS\Web Scrapping\negative-words.txt")
positive_words = load_word_set(r"C:\Users\1pava\Documents\ALL PROJECTS\Web Scrapping\positive-words.txt")

# Load Personal Pronouns list
personal_pronouns_list = ['i', 'we', 'my', 'ours', 'us']

# Create an empty DataFrame to store the results
# (placeholder with the input workbook's columns; the scraping cell rebuilds
# output_df from the collected records)
output_df = pd.DataFrame(columns=df.columns)


In [6]:
# Seconds to wait for a server before giving up on a URL, so that a single
# unresponsive site cannot hang the whole run.
REQUEST_TIMEOUT_SECONDS = 30

# Output column names, in the same order as the tuple returned by
# calculate_derived_variables().
METRIC_COLUMNS = [
    'POSITIVE SCORE',
    'NEGATIVE SCORE',
    'POLARITY SCORE',
    'SUBJECTIVITY SCORE',
    'AVG SENTENCE LENGTH',
    'PERCENTAGE OF COMPLEX WORDS',
    'FOG INDEX',
    'AVG NUMBER OF WORDS PER SENTENCE',
    'COMPLEX WORD COUNT',
    'WORD COUNT',
    'SYLLABLE PER WORD',
    'PERSONAL PRONOUNS',
    'AVG WORD LENGTH',
]

# Create an empty list to store the dictionaries
output_data = []

# Iterate through each row in the input DataFrame.
# A row that cannot be processed is reported and skipped, so one bad URL does
# not abort the run and lose the results collected so far.
for index, row in df.iterrows():
    url = row['URL']

    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as error:
        print("Skipping {}: request failed ({})".format(url, error))
        continue

    if response.status_code != 200:
        print("Skipping {}: HTTP status {}".format(url, response.status_code))
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the title (currently not written to the output workbook)
    title_element = soup.find('title')
    article_title = title_element.text if title_element else "No title found"

    # Extract the main content of the article (adjust the class or tag accordingly)
    main_content_element = soup.find('div', class_='td-post-content')
    if main_content_element is None:
        print("Skipping {}: no 'td-post-content' block found".format(url))
        continue

    # Remove unwanted elements (e.g., headers, footers); add more tags if needed
    for unwanted_element in main_content_element.find_all(['header', 'footer']):
        unwanted_element.decompose()

    # Concatenate the text from all <p> tags to form the complete content
    paragraphs = main_content_element.find_all('p')
    article_text = ' '.join(paragraph.get_text() for paragraph in paragraphs)

    # Clean and preprocess the text
    cleaned_text = clean_text(article_text)

    # Without any word character the metric ratios have no denominator.
    if re.search(r'\w', cleaned_text) is None:
        print("Skipping {}: no article text found".format(url))
        continue

    # Calculate derived variables and pair them with their column names
    metrics = calculate_derived_variables(cleaned_text)
    record = {'URL_ID': row['URL_ID'], 'URL': url}
    record.update(zip(METRIC_COLUMNS, metrics))
    output_data.append(record)

print("Processed {} of {} URLs".format(len(output_data), len(df)))

# Create the output DataFrame from the list of dictionaries
output_df = pd.DataFrame(output_data)

# Save the output DataFrame to a new Excel file
output_file_path = r"C:\Users\1pava\Documents\ALL PROJECTS\Web Scrapping\Output_Result.xlsx"
output_df.to_excel(output_file_path, index=False)
