In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

def extract_article(url):
    # Send a GET request to the URL
    response = requests.get(url)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract the article text
        article_text = ''
        for paragraph in soup.find_all('p'):
            article_text += paragraph.text.strip() + ' '

        return article_text
    else:
        print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
        return None

def process_articles(input_excel):
    # Read the input Excel file
    df = pd.read_excel(input_excel)

    # Create a directory to store the text files
    output_directory = 'extracted_articles'
    os.makedirs(output_directory, exist_ok=True)

    # Process each row in the DataFrame
    for index, row in df.iterrows():
        url_id = row['URL_ID']
        url = row['URL']

        # Extract the article text
        article_text = extract_article(url)

        if article_text:
            # Save the article text to a text file
            output_file_path = os.path.join(output_directory, f'{url_id}.txt')
            with open(output_file_path, 'w', encoding='utf-8') as file:
                file.write(article_text)
            print(f"Article {url_id} saved to {output_file_path}")

# Example usage
input_excel = 'input.xlsx'
process_articles(input_excel)


Article blackassign0001 saved to extracted_articles\blackassign0001.txt
Article blackassign0002 saved to extracted_articles\blackassign0002.txt
Article blackassign0003 saved to extracted_articles\blackassign0003.txt
Article blackassign0004 saved to extracted_articles\blackassign0004.txt
Article blackassign0005 saved to extracted_articles\blackassign0005.txt
Article blackassign0006 saved to extracted_articles\blackassign0006.txt
Article blackassign0007 saved to extracted_articles\blackassign0007.txt
Article blackassign0008 saved to extracted_articles\blackassign0008.txt
Article blackassign0009 saved to extracted_articles\blackassign0009.txt
Article blackassign0010 saved to extracted_articles\blackassign0010.txt
Article blackassign0011 saved to extracted_articles\blackassign0011.txt
Article blackassign0012 saved to extracted_articles\blackassign0012.txt
Article blackassign0013 saved to extracted_articles\blackassign0013.txt
Article blackassign0014 saved to extracted_articles\blackassign0

In [31]:
import nltk
import re
import pandas as pd
import chardet
from nltk import FreqDist
from nltk.tokenize import word_tokenize, sent_tokenize


nltk.download('punkt')

def read_word_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        words = file.read().splitlines()
    return set(words)


def count_syllables(word):
    # Define a regular expression to match vowels
    vowels = re.compile(r'[aeiouy]')

    # Handle exceptions for words ending with "es" or "ed"
    if word.endswith(('es', 'ed')):
        return 1

    # Count the number of vowels in the word
    syllables = len(vowels.findall(word.lower()))

    return max(1, syllables)  # Ensure at least one syllable

# Load Positive and Negative Dictionaries
positive_words = read_word_file('positive-words.txt')
negative_words = read_word_file('utf-8-negative-words.txt')

# Load Stopwords
stopwords_auditor = read_word_file('StopWords_Auditor.txt')
stopwords_currencies = read_word_file('utf-8-StopWords_Currencies.txt')
stopwords_date_and_numbers = read_word_file('StopWords_DatesandNumbers.txt')
stopwords_generic = read_word_file('StopWords_Generic.txt')
stopwords_generic_long = read_word_file('StopWords_GenericLong.txt')
stopwords_geographic = read_word_file('StopWords_Geographic.txt')
stopwords_names = read_word_file('StopWords_Names.txt')



def combine_stopwords(*stopword_lists):
    return set(word for stopword_list in stopword_lists for word in stopword_list)

all_stopwords = combine_stopwords(
    stopwords_auditor,
    stopwords_currencies,
    stopwords_date_and_numbers,
    stopwords_generic,
    stopwords_generic_long,
    stopwords_geographic,
    stopwords_names
)



def perform_textual_analysis(text):

    # Tokenize the text into words
     words = word_tokenize(text)

    # Remove custom stopwords
     filtered_words = [word.lower() for word in words if word.isalnum() and word.lower() not in all_stopwords]

    # Calculate word frequency
     word_freq = FreqDist(filtered_words)
    # Tokenize the text into words
     words = word_tokenize(text)

    # Remove custom stopwords
     filtered_words = [word.lower() for word in words if word.isalnum() and word.lower() not in all_stopwords]

    # Calculate word frequency
     word_freq = FreqDist(filtered_words)

    # Calculate the number of sentences
     sentences = sent_tokenize(text)
     num_sentences = len(sentences)

    # Additional variables for textual analysis
     avg_sentence_length = sum(len(sent.split()) for sent in sentences) / num_sentences
     percentage_of_complex_words = len([word for word in filtered_words if len(word) > 2]) / len(filtered_words) * 100
     fog_index = 0.4 * (avg_sentence_length + percentage_of_complex_words)

     complex_word_count = len([word for word in filtered_words if len(word) > 2])
     word_count = len(words)

    # Syllables per word and avg word length are not computed in this example
     syllables_per_word = sum(count_syllables(word) for word in filtered_words) / len(filtered_words) if filtered_words else 0
     avg_word_length = sum(len(word) for word in filtered_words) / len(filtered_words) if filtered_words else 0

     avg_words_per_sentence = word_count / num_sentences if num_sentences != 0 else 0
    # Personal pronouns count
     personal_pronouns = sum(word.lower() in ['i', 'me', 'my', 'mine', 'myself', 'you', 'your', 'yours', 'yourself',
                                            'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself',
                                            'it', 'its', 'itself', 'we', 'us', 'our', 'ours', 'ourselves',
                                            'they', 'them', 'their', 'theirs', 'themselves'] for word in words)

    # Positive Score, Negative Score, Polarity Score, Subjectivity Score calculation
     positive_score = sum(1 for word in filtered_words if word in positive_words)
     negative_score = sum(1 for word in filtered_words if word in negative_words)

    # Avoid division by zero
     total_words_after_cleaning = len(filtered_words) or 1

     polarity_score = (positive_score - negative_score) / (positive_score + negative_score + 1e-6)
     subjectivity_score = (positive_score + negative_score) / total_words_after_cleaning

     return {
        'Positive Score': positive_score,
        'Negative Score': negative_score,
        'Polarity Score': polarity_score,
        'Subjectivity Score': subjectivity_score,
        'Avg Sentence Length': avg_sentence_length,
        'Percentage of Complex Words': percentage_of_complex_words,
        'Fog Index': fog_index,
        'Avg Words per Sentence': avg_words_per_sentence,
        'Complex Word Count': complex_word_count,
        'Word Count': word_count,
        'Syllables per Word': syllables_per_word,
        'Personal Pronouns': personal_pronouns,
        'Avg Word Length': avg_word_length,
     }



def process_articles_and_save_results(input_excel, output_excel):
    # Read the input Excel file
    df = pd.read_excel(input_excel)

    # Create an empty DataFrame to store results
    result_columns = [
        'URL_ID',
        'URL',
        'Positive Score',
        'Negative Score',
        'Polarity Score',
        'Subjectivity Score',
        'Avg Sentence Length',
        'Percentage of Complex Words',
        'Fog Index',
        'Avg Words per Sentence',
        'Complex Word Count',
        'Word Count',
        'Syllables per Word',
        'Personal Pronouns',
        'Avg Word Length',
    ]
    result_df = pd.DataFrame(columns=result_columns)

    # Process each row in the DataFrame
    for index, row in df.iterrows():
        url_id = row['URL_ID']
        file_path = f'extracted_articles/{url_id}.txt'  # Update the path accordingly

        try:
            # Read the article text from the file
            with open(file_path, 'r', encoding='utf-8') as file:
                article_text = file.read()

            # Perform textual analysis
            analysis_result = perform_textual_analysis(article_text)

            analysis_result['URL_ID'] = url_id
            analysis_result['URL'] = row['URL']  # Assuming 'URL' is the column name in your input DataFrame

            # Append the results to the DataFrame
            result_df = pd.concat([result_df, pd.DataFrame([analysis_result], columns=result_columns)], ignore_index=True)

        except FileNotFoundError:
            print(f"File not found for URL_ID {url_id}. Skipping.")

    # Save the DataFrame to an Excel file
    result_df.to_excel(output_excel, index=False)

input_excel = 'input.xlsx'
output_excel = 'Output Data Structure.xlsx'
process_articles_and_save_results(input_excel, output_excel)


[nltk_data] Downloading package punkt to C:\Users\Abhinandan
[nltk_data]     Patra\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  result_df = pd.concat([result_df, pd.DataFrame([analysis_result], columns=result_columns)], ignore_index=True)


File not found for URL_ID blackassign0036. Skipping.
File not found for URL_ID blackassign0049. Skipping.
