<a href="https://colab.research.google.com/github/BHAWESHBHASKAR/Data-Analysis-Projects/blob/main/Web%20Scraping%20and%20Readability%20Analysis%20of%20Articles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import nltk
# Download NLTK's 'punkt' tokenizer data package so the tokenizers used below have their models available.
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
!pip install textstat


Collecting textstat
  Downloading textstat-0.7.3-py3-none-any.whl (105 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 105.1/105.1 kB 3.3 MB/s eta 0:00:00
Collecting pyphen (from textstat)
  Downloading pyphen-0.15.0-py3-none-any.whl (2.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.1/2.1 MB 35.2 MB/s eta 0:00:00
Installing collected packages: pyphen, textstat
Successfully installed pyphen-0.15.0 textstat-0.7.3


In [None]:
from textstat import syllable_count

In [None]:
# Workbook listing the pages to analyse, one row per article.
input_file = 'Input.xlsx'
df_input = pd.read_excel(input_file)

# The scraping loop reads the 'URL_ID' and 'URL' columns from every row; check
# for them here so a wrong or renamed workbook fails immediately with a clear
# message instead of a bare KeyError in the middle of the run.
required_columns = {'URL_ID', 'URL'}
missing_columns = required_columns - set(df_input.columns)
if missing_columns:
    raise ValueError(
        f"{input_file} is missing required column(s): {sorted(missing_columns)}"
    )


In [None]:
def extract_article_text(url, timeout=30):
    """Download a page and return the title and text of its ``<article>`` element.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would block the run forever.

    Returns:
        A ``(title, text)`` tuple. ``text`` is the article's text with the
        text of each HTML node separated by newlines. ``title`` is the text of
        the first ``<h1>`` inside the article, or ``None`` when the article has
        no ``<h1>``. ``(None, None)`` is returned when the request fails, the
        server answers with an HTTP error status, or the page contains no
        ``<article>`` element.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures so that error pages are not
        # analysed as if they were the article.
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Could not fetch {url}: {exc}")
        return None, None

    soup = BeautifulSoup(response.text, "html.parser")
    article = soup.find("article")
    if article is None:
        return None, None

    # find() returns None when there is no <h1>; guard before calling get_text().
    heading = article.find("h1")
    title = heading.get_text().strip() if heading is not None else None
    text = article.get_text(separator='\n').strip()
    return title, text


In [None]:
def load_stop_words(path):
    """Read a stop-word file (one entry per line) into a set of lower-case words.

    Entries are stripped and lower-cased because the analysis compares them
    against lower-cased tokens; blank lines are skipped so the empty string
    never becomes a stop word.

    Args:
        path: Path of the text file to read.

    Returns:
        A set of normalised stop words.
    """
    with open(path, 'r') as file:
        return {line.strip().lower() for line in file if line.strip()}


stop_words_generic = load_stop_words('StopWords_Generic.txt')
stop_words_auditor = load_stop_words('StopWords_Auditor.txt')


In [None]:
# Accumulates one result row (a list of column values) per input URL.
output_data = list()


In [None]:
# Start from an empty result list so that re-running this cell does not append
# a second copy of every row.
output_data = []

# Tokens are lower-cased before the stop-word test below, so the stop words are
# normalised the same way; merging both lists once means one lookup per token.
stop_words = {word.strip().lower() for word in stop_words_generic | stop_words_auditor}

# Personal pronouns are matched on the raw text rather than on the cleaned
# tokens: the cleaned tokens have had stop words removed (which may include the
# pronouns themselves) and are lower-cased, which would make the country
# abbreviation "US" indistinguishable from the pronoun "us".
PERSONAL_PRONOUN_RE = re.compile(r'\b(?:I|we|my|ours|us)\b', re.IGNORECASE)

# Number of metric values stored per URL (everything after URL_ID and URL).
NUM_METRICS = 11


def compute_text_metrics(text):
    """Compute the readability metrics for one article.

    Args:
        text: Plain text of the article.

    Returns:
        A list of ``NUM_METRICS`` values, in output-column order: word count,
        sentence count, average sentence length, fraction of complex words,
        fog index, average words per sentence, complex word count, word count
        (repeated), syllables per word, personal pronoun count and average
        word length. Ratios are 0 when their denominator is 0.
    """
    # Keep tokens that contain at least one letter or digit (this drops the
    # pure punctuation tokens word_tokenize emits) and that are not stop words.
    cleaned_tokens = [
        token.lower()
        for token in word_tokenize(text)
        if any(ch.isalnum() for ch in token) and token.lower() not in stop_words
    ]
    num_words = len(cleaned_tokens)

    # Use the sentence tokenizer instead of counting '.', '!' and '?', which
    # would also count decimal points, abbreviations and ellipses.
    num_sentences = len(nltk.sent_tokenize(text))
    average_sentence_length = num_words / num_sentences if num_sentences > 0 else 0

    # A complex word is one with more than two syllables. The counts are
    # computed once and reused for the syllables-per-word average.
    syllables = [syllable_count(word) for word in cleaned_tokens]
    complex_word_count = sum(1 for count in syllables if count > 2)
    percentage_complex_words = complex_word_count / num_words if num_words > 0 else 0

    # NOTE(review): percentage_complex_words is a 0-1 fraction here, while the
    # textbook Gunning fog formula uses a 0-100 percentage - confirm which one
    # the consumers of this report expect.
    fog_index = 0.4 * (average_sentence_length + percentage_complex_words)

    syllable_per_word = sum(syllables) / num_words if num_words > 0 else 0
    personal_pronouns = sum(
        1 for match in PERSONAL_PRONOUN_RE.finditer(text) if match.group() != 'US'
    )
    average_word_length = (
        sum(len(word) for word in cleaned_tokens) / num_words if num_words > 0 else 0
    )

    return [
        num_words,
        num_sentences,
        average_sentence_length,
        percentage_complex_words,
        fog_index,
        average_sentence_length,  # reported again as "average words per sentence"
        complex_word_count,
        num_words,
        syllable_per_word,
        personal_pronouns,
        average_word_length,
    ]


for index, row in df_input.iterrows():
    url_id = row['URL_ID']
    url = row['URL']

    try:
        title, text = extract_article_text(url)
    except (requests.RequestException, AttributeError) as exc:
        # A network failure, or an article lacking the element the extractor
        # expects, must not abort the whole run: report it and record an empty
        # row for this URL instead.
        print(f"Skipping {url_id} ({url}): {exc}")
        title, text = None, None

    if text:
        output_data.append([url_id, url] + compute_text_metrics(text))
    else:
        output_data.append([url_id, url] + [float('nan')] * NUM_METRICS)


In [None]:
# Column labels, in the same order as the values appended to each row of
# `output_data`.
output_columns = [
    'URL_ID',
    'URL',
    'WORD COUNT',
    'SENTENCE COUNT',
    'AVG SENTENCE LENGTH',
    'PERCENTAGE OF COMPLEX WORDS',
    'FOG INDEX',
    'AVG NUMBER OF WORDS PER SENTENCE',
    'COMPLEX WORD COUNT',
    'WORD COUNT',
    'SYLLABLE PER WORD',
    'PERSONAL PRONOUNS',
    'AVG WORD LENGTH'
]
df_output = pd.DataFrame(output_data, columns=output_columns)

# Each row carries the word count twice, so 'WORD COUNT' appears twice in the
# labels above. Duplicate labels make df_output['WORD COUNT'] ambiguous and get
# renamed when the sheet is read back, so only the first occurrence is kept.
df_output = df_output.loc[:, ~df_output.columns.duplicated()]

output_filename = 'Output.xlsx'
df_output.to_excel(output_filename, index=False)
