In [11]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import re


In [None]:


# Read the input workbook into a DataFrame; its URL_ID and URL columns are
# consumed by the extraction loop further down.
# Update `input_file_path` to point at your local copy of the file.
input_file_path = 'Input.xlsx'
input_df = pd.read_excel(io=input_file_path)

# Function to extract article text from specific classes
def extract_article_text(url, timeout=30):
    """Download a page and return its title and body text.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        A ``(title, content)`` tuple of strings. Each item is ``''`` when the
        corresponding element is not present in the page.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error responses so that an error page is not parsed and
    # saved as if it were an (empty) article.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Title: first tag whose class list is exactly ['entry-title'].
    # Tags carrying additional classes are intentionally not matched.
    title_element = soup.find(lambda tag: tag.get('class') == ['entry-title'])
    title = title_element.get_text() if title_element else ''

    # Body: first tag whose class list is exactly
    # ['td-post-content', 'tagdiv-type'], in that order.
    content_element = soup.find(
        lambda tag: tag.get('class') == ['td-post-content', 'tagdiv-type']
    )
    content = content_element.get_text() if content_element else ''

    return title, content

# Folder that receives one text file per article
output_text_dir = 'path_to_save_extracted_articles'  # Update this path to your desired directory
os.makedirs(output_text_dir, exist_ok=True)

# Visit every row of the input sheet, download its article and store it.
# A failure on one row is reported and does not stop the remaining rows.
for row_label, record in input_df.iterrows():
    url_id, url = record['URL_ID'], record['URL']
    try:
        title, article_text = extract_article_text(url)
        # The output file is named after the URL_ID; the title occupies the
        # first line and the article body follows it.
        target_path = os.path.join(output_text_dir, f'{url_id}.txt')
        with open(target_path, 'w', encoding='utf-8') as out_file:
            out_file.write('\n'.join((title, article_text)))
        print(f'Successfully extracted and saved article for URL_ID: {url_id}')
    except Exception as e:
        print(f'Failed to extract article for URL_ID: {url_id}. Error: {e}')


In [1]:
# NOTE(review): this module-level list is not read anywhere in the visible
# notebook (the loop variable inside load_stop_words is local); kept in case a
# cell outside this view relies on it.
encoding = []
# Load stop words
def load_stop_words(file_path, encodings=('utf-8', 'latin-1')):
    """Read a word-list file and return its lines as a list of strings.

    The file is decoded with the first encoding in ``encodings`` that does not
    raise ``UnicodeDecodeError``. The default tries UTF-8 and then falls back
    to Latin-1, which accepts every byte sequence. (``iso-8859-1`` is an alias
    of ``latin-1``, so listing both adds nothing.)

    Args:
        file_path: Path of the text file to read.
        encodings: Encoding names to try, in order.

    Returns:
        The file's lines without line terminators, in file order. Blank lines
        are kept as empty strings.

    Raises:
        ValueError: If none of ``encodings`` can decode the file.
        OSError: If the file cannot be opened.
    """
    for candidate in encodings:
        try:
            with open(file_path, 'r', encoding=candidate) as file:
                return file.read().splitlines()
        except UnicodeDecodeError:
            # Wrong guess; move on to the next candidate encoding.
            continue
    raise ValueError(f"Failed to decode {file_path} with available encodings.")

# Stop-word lists to merge, one file per category
stop_words_files = [
    'StopWords/StopWords_Currencies.txt',
    'StopWords/StopWords_DatesandNumbers.txt',
    'StopWords/StopWords_Auditor.txt',
    'StopWords/StopWords_Generic.txt',
    'StopWords/StopWords_GenericLong.txt',
    'StopWords/StopWords_Geographic.txt',
    'StopWords/StopWords_Names.txt'
]

# Merge every list into a single set so membership tests are fast and
# duplicates across files collapse.
# NOTE(review): compute_text_analysis lower-cases tokens before testing them
# against these sets, so entries only match if the files store them in lower
# case with no extra text on the line — TODO confirm against the word files.
stop_words = set()
for file_path in stop_words_files:
    stop_words.update(load_stop_words(file_path))

# Sentiment dictionaries, read with the same line-per-entry loader
positive_words_file = 'MasterDictionary/positive-words.txt'
negative_words_file = 'MasterDictionary/negative-words.txt'

positive_words = set(load_stop_words(positive_words_file))
negative_words = set(load_stop_words(negative_words_file))

# Cell output: number of distinct entries in each collection
(len(stop_words), len(positive_words), len(negative_words))


(12919, 2006, 4783)

In [15]:
# Fetch the Punkt tokenizer data needed by sent_tokenize / word_tokenize.
# Recent NLTK releases (3.9 and later) load the 'punkt_tab' resource instead of
# the pickled 'punkt' models, so request both to keep the notebook runnable
# across NLTK versions. Already-installed packages are not downloaded again.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\MUZAMIL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [18]:
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# Function to compute text analysis
def compute_text_analysis(article_text):
    """Compute sentiment and readability metrics for one article.

    Relies on the module-level ``stop_words``, ``positive_words`` and
    ``negative_words`` collections and on NLTK's ``sent_tokenize`` /
    ``word_tokenize``.

    Args:
        article_text: Raw article text. May be empty.

    Returns:
        A dict with the keys "POSITIVE SCORE", "NEGATIVE SCORE",
        "POLARITY SCORE", "SUBJECTIVITY SCORE", "AVG SENTENCE LENGTH",
        "PERCENTAGE OF COMPLEX WORDS", "FOG INDEX",
        "AVG NUMBER OF WORDS PER SENTENCE", "COMPLEX WORD COUNT",
        "WORD COUNT", "SYLLABLE PER WORD", "PERSONAL PRONOUNS" and
        "AVG WORD LENGTH". Ratio metrics are 0.0 when the text has no
        sentences or no words left after cleaning, instead of raising
        ``ZeroDivisionError``.
    """
    # Tokenize sentences and words (words are lower-cased first)
    sentences = sent_tokenize(article_text)
    tokens = word_tokenize(article_text.lower())

    # Keep alphanumeric tokens that are not stop words
    words = [word for word in tokens if word.isalnum() and word not in stop_words]
    word_count = len(words)
    sentence_count = len(sentences)

    # Sentiment counts from the provided word lists. The 1e-6 terms keep the
    # polarity and subjectivity ratios defined when a denominator is zero.
    positive_score = sum(1 for word in words if word in positive_words)
    negative_score = sum(1 for word in words if word in negative_words)
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 1e-6)
    subjectivity_score = (positive_score + negative_score) / (word_count + 1e-6)

    # Average sentence length, measured in whitespace-separated tokens of the
    # raw sentences (stop words and punctuation-attached tokens included)
    avg_sentence_length = _safe_ratio(
        sum(len(sentence.split()) for sentence in sentences), sentence_count
    )

    # Syllables are approximated by the number of vowel letters (a, e, i, o,
    # u, y) in each word; computed once and reused below.
    vowel_counts = [len(re.findall(r'[aeiouyAEIOUY]', word)) for word in words]

    # Complex words: more than two (approximate) syllables
    complex_word_count = sum(1 for count in vowel_counts if count > 2)
    percentage_of_complex_words = _safe_ratio(complex_word_count, word_count) * 100

    # FOG index
    fog_index = 0.4 * (avg_sentence_length + percentage_of_complex_words)

    # Average number of cleaned words per sentence
    avg_number_of_words_per_sentence = _safe_ratio(word_count, sentence_count)

    # Syllables per word
    syllable_per_word = _safe_ratio(sum(vowel_counts), word_count)

    # Personal pronouns, matched case-insensitively on the raw text. The
    # all-caps token "US" is skipped because it is the country abbreviation,
    # not the pronoun "us".
    pronoun_hits = re.findall(r'\b(I|we|my|ours|us)\b', article_text, re.I)
    personal_pronouns = sum(1 for hit in pronoun_hits if hit != 'US')

    # Average word length in characters
    avg_word_length = _safe_ratio(sum(len(word) for word in words), word_count)

    return {
        "POSITIVE SCORE": positive_score,
        "NEGATIVE SCORE": negative_score,
        "POLARITY SCORE": polarity_score,
        "SUBJECTIVITY SCORE": subjectivity_score,
        "AVG SENTENCE LENGTH": avg_sentence_length,
        "PERCENTAGE OF COMPLEX WORDS": percentage_of_complex_words,
        "FOG INDEX": fog_index,
        "AVG NUMBER OF WORDS PER SENTENCE": avg_number_of_words_per_sentence,
        "COMPLEX WORD COUNT": complex_word_count,
        "WORD COUNT": word_count,
        "SYLLABLE PER WORD": syllable_per_word,
        "PERSONAL PRONOUNS": personal_pronouns,
        "AVG WORD LENGTH": avg_word_length,
    }

# Process each extracted article and compute the text analysis
extraction_dir = 'path_to_save_extracted_articles'
# Only regular '.txt' files are articles; anything else in the folder (for
# example a sub-directory) would make open() fail. os.listdir returns entries
# in arbitrary order, so sort them to get the same row order on every run.
extracted_files = sorted(
    entry for entry in os.listdir(extraction_dir)
    if entry.endswith('.txt') and os.path.isfile(os.path.join(extraction_dir, entry))
)

results = []

for file_name in extracted_files:
    file_path = os.path.join(extraction_dir, file_name)
    with open(file_path, 'r', encoding='utf-8') as file:
        article_text = file.read()

    analysis_results = compute_text_analysis(article_text)
    # The file stem is the URL_ID the article was saved under
    url_id = os.path.splitext(file_name)[0]
    analysis_results["URL_ID"] = url_id
    results.append(analysis_results)

# Convert results to DataFrame (one row per article)
results_df = pd.DataFrame(results)

# Save the results to a CSV file
output_file_path = 'Output.csv'
results_df.to_csv(output_file_path, index=False)


