In [9]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import re

# Download the NLTK data packages used by the tokenizers and the stop-word
# corpus imported above.
for _nltk_package in ('punkt', 'stopwords'):
    nltk.download(_nltk_package)

# Sentiment lexicons: every whitespace-separated token in each file becomes
# one entry of the corresponding set. Undecodable bytes are skipped.
with open('positive-words.txt', 'r', encoding='utf-8', errors='ignore') as file:
    positive_words = set(file.read().split())

with open('negative-words.txt', 'r', encoding='utf-8', errors='ignore') as file:
    negative_words = set(file.read().split())

# NLTK's built-in English stop words, as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

def load_custom_stop_words(file_path):
    """Load a stop-word list from a text file with one entry per line.

    Each line is lower-cased and stripped of surrounding whitespace. Only the
    text before the first ``|`` on a line is kept, and blank entries are
    dropped.

    NOTE(review): this assumes ``|`` introduces a trailing annotation
    (``WORD | description``) rather than being part of the stop word itself;
    confirm against the actual word-list files.

    Args:
        file_path: Path to the stop-word file (decoded as UTF-8, undecodable
            bytes ignored).

    Returns:
        A set of lower-cased stop-word entries.
    """
    entries = set()
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        for line in file:
            # A whole "word | note" line could never equal a single token,
            # so keep just the part in front of the separator.
            entry = line.split('|', 1)[0].strip().lower()
            if entry:
                entries.add(entry)
    return entries

# Merge every custom stop-word list into one set. Each file is listed exactly
# once (the original loaded 'StopWords_Auditor.txt' twice, which only cost an
# extra file read because set.update is idempotent).
custom_stop_words = set()
for _stop_words_file in (
    'StopWords_DatesandNumbers.txt',
    'StopWords_Generic.txt',
    'StopWords_GenericLong.txt',
    'StopWords_Geographic.txt',
    'StopWords_Names.txt',
    'StopWords_Auditor.txt',
    'StopWords_Currencies.txt',
):
    custom_stop_words.update(load_custom_stop_words(_stop_words_file))

# Final filter used by clean_text: NLTK English stop words plus custom lists.
all_stop_words = stop_words.union(custom_stop_words)

def extract_article_text(url):
    """Download a web page and return its headline and paragraph text.

    Args:
        url: Address of the article to fetch.

    Returns:
        A ``(title, text)`` tuple. ``title`` is the stripped text of the first
        ``<h1>`` element, or an empty string when the page has none. ``text``
        is the text of every ``<p>`` element joined with single spaces.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout a single unresponsive server stalls the whole run.
    response = requests.get(url, timeout=30)
    # Do not analyse the body of a 404/500 page as if it were the article.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    # find() returns None when there is no <h1>; fall back to an empty title
    # instead of failing with AttributeError on None.text.
    heading = soup.find('h1')
    title = heading.text.strip() if heading is not None else ''

    text = ' '.join(p.text for p in soup.find_all('p'))
    return title, text

def clean_text(text):
    """Return ``text`` reduced to lower-case, non-stop-word tokens.

    Every character that is not an ASCII letter or whitespace is removed, the
    remainder is lower-cased and tokenized, and tokens found in
    ``all_stop_words`` are discarded. The surviving tokens are joined with
    single spaces.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text).lower()

    kept_tokens = []
    for token in word_tokenize(letters_only):
        if token in all_stop_words:
            continue
        kept_tokens.append(token)

    return ' '.join(kept_tokens)

def analyze_sentiment(text):
    """Score ``text`` against the positive and negative word lexicons.

    Returns:
        A tuple ``(positive_score, negative_score, polarity_score,
        subjectivity_score)`` where the first two are token counts found in
        ``positive_words`` / ``negative_words``, polarity is
        ``(pos - neg) / (pos + neg)`` and subjectivity is
        ``(pos + neg) / token_count``. A tiny constant is added to both
        denominators so that neither division can hit zero.
    """
    tokens = word_tokenize(text)

    positive_score = 0
    negative_score = 0
    for token in tokens:
        if token in positive_words:
            positive_score += 1
        if token in negative_words:
            negative_score += 1

    matched = positive_score + negative_score
    polarity_score = (positive_score - negative_score) / (matched + 0.000001)
    subjectivity_score = matched / (len(tokens) + 0.000001)
    return positive_score, negative_score, polarity_score, subjectivity_score

def analyze_readability(text):
    """Compute simple readability figures for ``text``.

    Args:
        text: Raw text to analyse.

    Returns:
        A tuple ``(avg_sentence_length, percentage_complex_words, fog_index)``
        where the average is tokens per sentence, the percentage is the
        fraction (0-1) of tokens treated as complex, and the fog index is
        ``0.4 * (avg_sentence_length + percentage_complex_words)``.

    Raises:
        ValueError: If ``text`` yields no sentences or no tokens (previously
            this surfaced as an unexplained ZeroDivisionError).
    """
    sentences = sent_tokenize(text)
    words = word_tokenize(text)
    if not sentences or not words:
        raise ValueError('cannot compute readability: text has no words or sentences')

    avg_sentence_length = len(words) / len(sentences)

    # NOTE(review): "complex" here means longer than two characters with more
    # than two distinct characters; this is not a syllable-based definition.
    # Kept unchanged to preserve existing results.
    complex_words = [word for word in words if len(word) > 2 and len(set(word)) > 2]
    percentage_complex_words = len(complex_words) / len(words)

    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    return avg_sentence_length, percentage_complex_words, fog_index

def count_syllables(word):
    """Estimate the number of syllables in ``word``.

    The estimate counts each run of consecutive vowels (a, e, i, o, u) once,
    subtracts one for a trailing ``e``, adds one back for a trailing ``le``,
    and never returns less than one for a non-empty word. Matching is
    case-insensitive.

    Args:
        word: The word to inspect.

    Returns:
        The estimated syllable count; ``0`` for an empty string.
    """
    if not word:
        # Nothing to count; the original indexed word[0] and raised IndexError.
        return 0

    # Upper-case vowels were previously ignored ('APPLE' counted as 1).
    word = word.lower()
    vowels = 'aeiou'

    count = 0
    previous_was_vowel = False
    for char in word:
        is_vowel = char in vowels
        # Only the first vowel of a run starts a new syllable.
        if is_vowel and not previous_was_vowel:
            count += 1
        previous_was_vowel = is_vowel

    if word.endswith('e'):
        count -= 1
    if word.endswith('le'):
        count += 1
    if count == 0:
        count += 1
    return count

def analyze_text(text):
    """Compute sentiment and readability metrics for one article.

    Word-level metrics are computed on the output of ``clean_text`` (stop
    words and non-letters removed); sentences and personal pronouns are taken
    from the raw ``text``.

    Args:
        text: Raw article text.

    Returns:
        A dict mapping each output column name (for example ``'POSITIVE
        SCORE'``, ``'FOG INDEX'``, ``'WORD COUNT'``) to its value.

    Raises:
        ValueError: If no words remain after cleaning or no sentence is found
            (previously this surfaced as an unexplained ZeroDivisionError).
    """
    clean_text_content = clean_text(text)
    words = word_tokenize(clean_text_content)
    sentences = sent_tokenize(text)
    word_count = len(words)
    if word_count == 0 or not sentences:
        raise ValueError('cannot analyze text: no words left after cleaning')

    avg_sentence_length = word_count / len(sentences)

    # Count syllables once per word and reuse the result for both metrics.
    syllables_per_token = [count_syllables(word) for word in words]
    complex_word_count = sum(1 for syllables in syllables_per_token if syllables > 2)
    percentage_complex_words = complex_word_count / word_count
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    syllable_per_word = sum(syllables_per_token) / word_count

    # The match is case-insensitive, so the all-caps abbreviation "US" would
    # otherwise be counted as the pronoun "us"; skip exactly that spelling.
    personal_pronouns = sum(
        1
        for match in re.finditer(r'\b(I|we|my|ours|us)\b', text, re.IGNORECASE)
        if match.group(0) != 'US'
    )

    avg_word_length = sum(len(word) for word in words) / word_count
    positive_score, negative_score, polarity_score, subjectivity_score = (
        analyze_sentiment(clean_text_content)
    )
    return {
        'POSITIVE SCORE': positive_score,
        'NEGATIVE SCORE': negative_score,
        'POLARITY SCORE': polarity_score,
        'SUBJECTIVITY SCORE': subjectivity_score,
        'AVG SENTENCE LENGTH': avg_sentence_length,
        'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
        'FOG INDEX': fog_index,
        # Same value as 'AVG SENTENCE LENGTH'; both columns are emitted.
        'AVG NUMBER OF WORDS PER SENTENCE': avg_sentence_length,
        'COMPLEX WORD COUNT': complex_word_count,
        'WORD COUNT': word_count,
        'SYLLABLE PER WORD': syllable_per_word,
        'PERSONAL PRONOUNS': personal_pronouns,
        'AVG WORD LENGTH': avg_word_length,
    }
# Driver: read the list of articles, scrape and analyse each one, and write
# the metrics next to the input columns.
input_df = pd.read_excel('Input.xlsx')
output_df = input_df.copy()

for index, row in input_df.iterrows():
    url_id, url = row['URL_ID'], row['URL']

    # A failure on one article is reported and skipped so that the remaining
    # rows are still processed; its metric cells are simply left unset.
    try:
        title, text = extract_article_text(url)

        # Keep a plain-text copy of the scraped article, named after its id.
        with open(f'{url_id}.txt', 'w', encoding='utf-8') as file:
            file.write(f'{title}\n\n{text}')

        analysis_results = analyze_text(text)

        # Each metric becomes a column in the output frame.
        for key, value in analysis_results.items():
            output_df.at[index, key] = value

    except Exception as e:
        print(f"Error processing {url_id}: {e}")

output_df.to_excel('Output Data Structure.xlsx', index=False)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Analysis complete. Results saved to 'Output Data Structure.xlsx'
