In [3]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os

def extract_article(url, timeout=30):
    """Download a web page and return its headline and paragraph text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would block the run forever.

    Returns:
        A ``(title, article_text)`` tuple. ``title`` is the text of the first
        ``<h1>`` element, or ``''`` when the page has none. ``article_text``
        is the text of every ``<p>`` element joined with single spaces.

    Raises:
        requests.RequestException: On connection failure, timeout, or an HTTP
            error status (4xx/5xx), so an error page is never saved as if it
            were the article.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 404/500 instead of scraping the error page's markup.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Assumes the headline is the first <h1>; a page without one still yields
    # its body text rather than failing on a None lookup.
    heading = soup.find('h1')
    title = heading.get_text(' ', strip=True) if heading is not None else ''

    # A ' ' separator keeps words apart across inline tags; with strip=True and
    # no separator, "Hello <b>world</b>" would collapse to "Helloworld".
    paragraphs = soup.find_all('p')
    article_text = ' '.join(para.get_text(' ', strip=True) for para in paragraphs)

    return title, article_text

def save_article_text(url_id, title, text):
    """Write an article to ``<url_id>.txt`` in the current working directory.

    The file holds the title, one blank line, then the body text, encoded as
    UTF-8. An existing file with the same name is overwritten.
    """
    destination = f'{url_id}.txt'
    contents = f'{title}\n\n{text}'
    with open(destination, 'w', encoding='utf-8') as out:
        out.write(contents)

# Read the list of article URLs (columns URL_ID and URL) from the input workbook
input_file = r'C:\Users\Sparx\Downloads\DataEngg\Input.xlsx'
df = pd.read_excel(input_file)

# Scrape each listed page and store it as <URL_ID>.txt. A failure on one URL
# is reported and the loop carries on with the next row.
for index, row in df.iterrows():
    url_id, url = row['URL_ID'], row['URL']
    try:
        title, text = extract_article(url)
        save_article_text(url_id, title, text)
    except Exception as err:
        print(f'Failed to extract article {url_id} from {url}: {err}')
    else:
        print(f'Successfully extracted and saved article {url_id}')


Successfully extracted and saved article blackassign0001
Successfully extracted and saved article blackassign0002
Successfully extracted and saved article blackassign0003
Successfully extracted and saved article blackassign0004
Successfully extracted and saved article blackassign0005
Successfully extracted and saved article blackassign0006
Successfully extracted and saved article blackassign0007
Successfully extracted and saved article blackassign0008
Successfully extracted and saved article blackassign0009
Successfully extracted and saved article blackassign0010
Successfully extracted and saved article blackassign0011
Successfully extracted and saved article blackassign0012
Successfully extracted and saved article blackassign0013
Successfully extracted and saved article blackassign0014
Successfully extracted and saved article blackassign0015
Successfully extracted and saved article blackassign0016
Successfully extracted and saved article blackassign0017
Successfully extracted and save

In [11]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter

# Fetch the NLTK resources this cell relies on: the 'punkt' tokenizer models
# and the 'stopwords' corpus. Already-present packages are reported as up to date.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

def load_words(filename):
    """Return the set of whitespace-separated tokens found in a text file.

    The file is decoded as Latin-1. Tokens are kept exactly as written (no
    case folding), and duplicates collapse because the result is a set.
    """
    with open(filename, 'r', encoding='latin-1') as handle:
        contents = handle.read()
    return set(contents.split())

# Build the sentiment lexicons from the local MasterDictionary word lists.
# The negative list is read first, then the positive list.
negative_words, positive_words = map(
    load_words,
    (
        'C:/Users/Sparx/Downloads/DataEngg/MasterDictionary-20240701T111816Z-001/MasterDictionary/negative-words.txt',
        'C:/Users/Sparx/Downloads/DataEngg/MasterDictionary-20240701T111816Z-001/MasterDictionary/positive-words.txt',
    ),
)

def text_analysis(text):
    """Compute sentiment and readability metrics for one article.

    Relies on the module-level ``positive_words`` and ``negative_words`` sets
    for sentiment lookups, and on NLTK's ``word_tokenize`` / ``sent_tokenize``.

    Args:
        text: Raw article text. Empty or whitespace-only text is accepted and
            yields zero for every metric instead of raising ZeroDivisionError.

    Returns:
        A dict with the keys ``positive_score``, ``negative_score``,
        ``polarity_score``, ``subjectivity_score``, ``avg_sentence_length``,
        ``percentage_complex_words``, ``fog_index``,
        ``avg_words_per_sentence``, ``complex_word_count``, ``word_count``,
        ``syllable_per_word``, ``personal_pronouns`` and ``avg_word_length``.
    """
    # Tokenise the lower-cased text so tokens match the lower-case lookups below.
    tokens = word_tokenize(text.lower())
    # word_tokenize emits punctuation ('.', ',', quotes) as separate tokens.
    # Keep only tokens with at least one letter or digit so punctuation does
    # not inflate the word count or skew the per-word averages.
    words = [tok for tok in tokens if any(ch.isalnum() for ch in tok)]
    num_words = len(words)

    # Sentiment: count tokens that appear in each lexicon.
    positive_score = sum(1 for word in words if word in positive_words)
    negative_score = sum(1 for word in words if word in negative_words)

    # The small epsilon keeps both ratios defined when a denominator is zero.
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (num_words + 0.000001)

    # Sentence statistics; guarded because empty text has no sentences.
    num_sentences = len(sent_tokenize(text))
    avg_sentence_length = num_words / num_sentences if num_sentences else 0.0

    # "Complex" here means longer than two characters with more than two of the
    # letters a/e/i/o/u.
    # NOTE(review): this counts vowel letters, not syllables, and differs from
    # the syllable estimate below (vowel runs, including 'y') - confirm intent.
    complex_words = [
        word for word in words
        if len(word) > 2 and sum(1 for char in word if char in 'aeiou') > 2
    ]
    num_complex_words = len(complex_words)
    percentage_complex_words = num_complex_words / num_words if num_words else 0.0
    # NOTE(review): percentage_complex_words is a 0-1 fraction here; the usual
    # Gunning Fog formula uses a 0-100 percentage - confirm which is wanted.
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    # Same quantity as avg_sentence_length, reported under a second key.
    avg_words_per_sentence = avg_sentence_length

    # Syllables are estimated as the number of runs of consecutive vowels.
    syllable_total = sum(len(re.findall(r'[aeiouy]+', word)) for word in words)
    syllable_per_word = syllable_total / num_words if num_words else 0.0

    # Pronouns are matched case-insensitively on the original text, but the
    # all-caps token "US" is skipped: it is almost always the country, not "us".
    pronoun_hits = re.findall(r'\b(?:I|we|my|ours|us)\b', text, re.IGNORECASE)
    personal_pronouns = sum(1 for hit in pronoun_hits if hit != 'US')

    avg_word_length = sum(len(word) for word in words) / num_words if num_words else 0.0

    return {
        "positive_score": positive_score,
        "negative_score": negative_score,
        "polarity_score": polarity_score,
        "subjectivity_score": subjectivity_score,
        "avg_sentence_length": avg_sentence_length,
        "percentage_complex_words": percentage_complex_words,
        "fog_index": fog_index,
        "avg_words_per_sentence": avg_words_per_sentence,
        "complex_word_count": num_complex_words,
        "word_count": num_words,
        "syllable_per_word": syllable_per_word,
        "personal_pronouns": personal_pronouns,
        "avg_word_length": avg_word_length,
    }

def save_analysis_results(url_id, analysis):
    """Write each metric to its own ``<url_id>_<metric>.txt`` file.

    Every value in ``analysis`` is stored as its ``str()`` form, UTF-8
    encoded, in the current working directory. Existing files are overwritten.
    """
    for metric in analysis:
        target = f'{url_id}_{metric}.txt'
        with open(target, 'w', encoding='utf-8') as out:
            out.write(str(analysis[metric]))

# Re-read the URL list; one result record is collected per analysed article
input_file = r'C:\Users\Sparx\Downloads\DataEngg\Input.xlsx'
df = pd.read_excel(input_file)
results = []

# Analyse every saved article. A failure on one article is reported and that
# row is left out of the output; the loop continues with the next row.
for index, row in df.iterrows():
    url_id = row['URL_ID']
    try:
        with open(f'{url_id}.txt', 'r', encoding='utf-8') as article_file:
            text = article_file.read()
        analysis = text_analysis(text)
        save_analysis_results(url_id, analysis)
    except Exception as err:
        print(f'Failed to analyze article {url_id}: {err}')
        continue

    # Input columns first, then the metrics; a metric wins on a name clash.
    record = dict(row)
    record.update(analysis)
    results.append(record)
    print(f'Analysis completed for article {url_id}')

# Collect all records into one table and write it next to the input workbook
output_file = r'C:\Users\Sparx\Downloads\DataEngg\Output_Data_Structure.xlsx'
output_df = pd.DataFrame(results)
output_df.to_excel(output_file, index=False)
print('Analysis results saved to Output_Data_Structure.xlsx')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sparx\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sparx\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Analysis completed for article blackassign0001
Analysis completed for article blackassign0002
Analysis completed for article blackassign0003
Analysis completed for article blackassign0004
Analysis completed for article blackassign0005
Analysis completed for article blackassign0006
Analysis completed for article blackassign0007
Analysis completed for article blackassign0008
Analysis completed for article blackassign0009
Analysis completed for article blackassign0010
Analysis completed for article blackassign0011
Analysis completed for article blackassign0012
Analysis completed for article blackassign0013
Analysis completed for article blackassign0014
Analysis completed for article blackassign0015
Analysis completed for article blackassign0016
Analysis completed for article blackassign0017
Analysis completed for article blackassign0018
Analysis completed for article blackassign0019
Analysis completed for article blackassign0020
Analysis completed for article blackassign0021
Analysis comp