In [1]:
import pandas as pd

# Read the input workbook, then keep only the identifier and address columns
# that the scraping and analysis cells iterate over.
input_df = pd.read_excel('Input.xlsx')
url_data = input_df.loc[:, ['URL_ID', 'URL']]

In [2]:
import requests
from bs4 import BeautifulSoup
import os

def extract_text(url, timeout=30):
    """Download a web page and return its title and paragraph text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on an
            unresponsive host.

    Returns:
        A ``(title, text)`` tuple. ``title`` is the stripped content of the
        page's ``<title>`` tag, or an empty string when the page has none.
        ``text`` is the stripped content of every ``<p>`` tag joined with
        single spaces.

    Raises:
        Whatever ``requests.get`` raises (connection errors, timeouts, ...);
        these are not caught here.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    # find() returns None when the tag is absent, so guard before reading .text.
    title_tag = soup.find('title')
    title = title_tag.text.strip() if title_tag is not None else ''

    paragraphs = soup.find_all('p')
    text = ' '.join(p.text.strip() for p in paragraphs)

    return title, text

# Folder that receives one text file per scraped article.
output_dir = 'extracted_articles'
os.makedirs(output_dir, exist_ok=True)

# Walk the identifier/address pairs, scrape each page, and store the title on
# the first line followed by the article body.
for url_id, url in zip(url_data['URL_ID'], url_data['URL']):
    title, text = extract_text(url)
    filename = f'{output_dir}/{url_id}.txt'

    with open(filename, 'w', encoding='utf-8') as file:
        file.write(title + '\n' + text)

In [4]:
!pip install textblob

Collecting textblob
  Downloading textblob-0.18.0.post0-py3-none-any.whl (626 kB)
Collecting nltk>=3.8
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
Installing collected packages: nltk, textblob
  Attempting uninstall: nltk
    Found existing installation: nltk 3.7
    Uninstalling nltk-3.7:
      Successfully uninstalled nltk-3.7
Successfully installed nltk-3.8.1 textblob-0.18.0.post0


In [7]:
import re
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

# Tokenizer models used by nltk.word_tokenize / nltk.sent_tokenize below.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# Lexicon loaded by SentimentIntensityAnalyzer; fetching it here, next to the
# function that needs it, lets the notebook run top-to-bottom on a fresh kernel.
nltk.download('vader_lexicon')

def compute_sentiment_scores(text):
    """Score ``text`` with NLTK's SentimentIntensityAnalyzer.

    Returns:
        A ``(positive_score, negative_score, polarity_score)`` tuple taken
        from the ``'pos'``, ``'neg'`` and ``'compound'`` entries of
        ``polarity_scores``.
    """
    scores = SentimentIntensityAnalyzer().polarity_scores(text)
    return tuple(scores[key] for key in ('pos', 'neg', 'compound'))

def compute_subjectivity_score(text):
    """Return the subjectivity value TextBlob's sentiment reports for ``text``."""
    return TextBlob(text).sentiment.subjectivity

def compute_readability_metrics(text):
    """Compute average sentence length and vowel-letters-per-token for ``text``.

    Returns:
        A ``(avg_sentence_length, syllables_per_word)`` tuple.
        ``avg_sentence_length`` is the number of ``nltk.word_tokenize`` tokens
        divided by the number of ``nltk.sent_tokenize`` sentences.
        ``syllables_per_word`` approximates syllables by counting the letters
        ``a e i o u y`` in each lower-cased token and dividing by the token
        count. Either value is ``0`` when its denominator is zero.
    """
    tokens = nltk.word_tokenize(text)
    n_tokens = len(tokens)
    n_sentences = len(nltk.sent_tokenize(text))

    if n_sentences:
        avg_sentence_length = n_tokens / n_sentences
    else:
        avg_sentence_length = 0

    # No tokens means nothing to average over.
    if not n_tokens:
        return avg_sentence_length, 0

    vowel_total = 0
    for token in tokens:
        vowel_total += len(re.findall(r'[aeiouy]', token.lower()))

    return avg_sentence_length, vowel_total / n_tokens

def compute_other_metrics(text):
    """Count personal pronouns and compute the average token length of ``text``.

    Returns:
        A ``(personal_pronouns, avg_word_length)`` tuple.
        ``personal_pronouns`` is the number of whole-word, case-insensitive
        matches of ``I``, ``we``, ``my``, ``ours`` or ``us``.
        ``avg_word_length`` is the mean character length of the
        ``nltk.word_tokenize`` tokens, or ``0`` when the text yields no tokens.
    """
    # NOTE(review): because of re.I the pattern also counts the upper-case
    # token "US"; confirm whether that is intended.
    personal_pronouns = len(re.findall(r'\b(I|we|my|ours|us)\b', text, re.I))

    # Tokenize once and guard the division so empty text does not raise
    # ZeroDivisionError.
    words = nltk.word_tokenize(text)
    avg_word_length = sum(len(word) for word in words) / len(words) if words else 0
    return personal_pronouns, avg_word_length

def text_analysis(filename):
    """Read a UTF-8 text file and return its text metrics as a dict.

    Args:
        filename: Path of the text file to analyse.

    Returns:
        A dict with the keys ``positive_score``, ``negative_score``,
        ``polarity_score``, ``subjectivity_score``, ``avg_sentence_length``,
        ``syllables_per_word``, ``personal_pronouns`` and ``avg_word_length``,
        filled from the four ``compute_*`` helpers.
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        content = handle.read()

    pos, neg, polarity = compute_sentiment_scores(content)
    subjectivity = compute_subjectivity_score(content)
    sentence_length, syllables = compute_readability_metrics(content)
    pronouns, word_length = compute_other_metrics(content)

    return dict(
        positive_score=pos,
        negative_score=neg,
        polarity_score=polarity,
        subjectivity_score=subjectivity,
        avg_sentence_length=sentence_length,
        syllables_per_word=syllables,
        personal_pronouns=pronouns,
        avg_word_length=word_length,
    )



[nltk_data] Downloading package punkt to C:\Users\Aditya
[nltk_data]     Kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Aditya Kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [12]:
# One record per URL_ID: the identifier first, then every metric computed from
# the article file saved under output_dir.
output_data = [
    {'URL_ID': url_id, **text_analysis(f'{output_dir}/{url_id}.txt')}
    for url_id in url_data['URL_ID']
]

# Build the frame in one go and export it without the index column.
output_df = pd.DataFrame(output_data)
output_df.to_excel('Output Data Structure.xlsx', index=False)


In [9]:
# Fetch the VADER lexicon that nltk's SentimentIntensityAnalyzer (used by
# compute_sentiment_scores) loads when it is instantiated.
# NOTE(review): this cell sits after the analysis cell that needs the lexicon;
# on a fresh kernel it has to run before the analysis loop.
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to C:\Users\Aditya
[nltk_data]     Kumar\AppData\Roaming\nltk_data...


True

In [None]:
# Approach to the Solution
# 1) We used the pandas library to read the URLs from Input.xlsx.
# 2) Utilized the requests library to fetch the HTML content from the URLs.
# 3) Used the BeautifulSoup library to parse the HTML and extract the article title and text.
# 4) Saved the extracted text into text files named with the URL_ID.
# 5) Calculated the text metrics positive score, negative score, polarity score, subjectivity score, average sentence length, syllables per word, personal pronouns, and average word length using nltk, TextBlob, and regular expressions. (Percentage of complex words, fog index, average number of words per sentence, complex word count, and word count are not computed by this notebook.)
# 6) Stored the analysis results in an Excel file (Output Data Structure.xlsx) using pandas.

# Dependencies Required
# 1) pandas: For reading and writing Excel files.
# 2) requests: For sending HTTP requests to fetch web pages.
# 3) beautifulsoup4: For parsing HTML and extracting article content.
# 4) nltk: For natural language processing tasks like tokenization and sentiment analysis.
# 5) textblob: For computing subjectivity scores.
# 6) openpyxl: Engine pandas uses to read and write .xlsx files (an optional pandas dependency; it must be installed separately).