In [1]:
# Importing necessary libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
from textblob import TextBlob
import syllables

In [2]:
# Excel workbook that lists the articles to process (read with pd.read_excel below)
input_file = 'input.xlsx'
# Folder that receives one extracted-text file per article
output_folder = 'extracted_texts'

In [3]:
# Make sure the output folder exists. exist_ok=True makes this safe to re-run
# and avoids the race between checking for the folder and creating it; it also
# fails immediately if the path is already taken by a regular file.
os.makedirs(output_folder, exist_ok=True)

# Load the spreadsheet of articles; the cells below read its URL_ID and URL columns
df = pd.read_excel(input_file)

# Function to extract article text from URL
def extract_article_text(url, timeout=30):
    """Download a page and return its headline and article body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests.get can block indefinitely on a stalled
            connection, which would hang the whole extraction loop.

    Returns:
        A (title, content) tuple of strings. title is the stripped text of
        the first <h1> element, or "No Title Found" when there is none.
        content is the stripped text of the first <div class="article-body">
        element, or "No Content Found" when there is none. If the download or
        parsing fails for any reason, the error is printed and
        ("Error", "Error") is returned instead of raising.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # An HTTP error page (404, 500, ...) is not an article: treat it as a
        # failure rather than saving the error page's text as content.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Look each element up once and reuse the result
        heading = soup.find('h1')
        article = soup.find('div', class_='article-body')

        # Extracting title and content
        title = heading.text.strip() if heading is not None else "No Title Found"
        content = article.text.strip() if article is not None else "No Content Found"

        return title, content
    except Exception as e:
        # Deliberately broad: one bad URL must not abort the whole batch, so
        # the failure is reported and a sentinel pair is returned.
        print(f"Error processing URL: {url}. Error: {e}")
        return "Error", "Error"

# Fetch every article listed in the spreadsheet and store it as <URL_ID>.txt
for _, record in df.iterrows():
    article_id = record['URL_ID']
    article_title, article_body = extract_article_text(record['URL'])

    # Each article gets its own UTF-8 text file inside the output folder:
    # a "Title:" header line, a blank line, then the body text
    destination = os.path.join(output_folder, f"{article_id}.txt")
    with open(destination, 'w', encoding='utf-8') as out:
        out.writelines([f"Title: {article_title}\n\n", article_body])

print("Extraction completed. Text files saved in 'extracted_texts' folder.")

Extraction completed. Text files saved in 'extracted_texts' folder.


In [5]:
def perform_text_analysis(text):
    """Compute sentiment and readability metrics for a piece of text.

    Args:
        text: The text to analyse. Blank (None, empty or whitespace-only)
            text is accepted and produces all-zero metrics.

    Returns:
        A dict with these keys:
            'POSITIVE SCORE': the polarity when it is positive, otherwise 0.
            'NEGATIVE SCORE': the magnitude of the polarity when it is
                negative, otherwise 0.
            'POLARITY SCORE' / 'SUBJECTIVITY SCORE': TextBlob sentiment values.
            'AVG SENTENCE LENGTH' / 'AVG NUMBER OF WORDS PER SENTENCE':
                words divided by sentences (the same value under both keys).
            'PERCENTAGE OF COMPLEX WORDS': share of words with more than two
                estimated syllables, as a percentage.
            'FOG INDEX': 0.4 * (avg sentence length + percentage complex words).
            'COMPLEX WORD COUNT', 'WORD COUNT': raw counts.
            'SYLLABLE PER WORD': estimated syllables divided by words.
            'PERSONAL PRONOUNS': number of tokens tagged 'PRP'.
            'AVG WORD LENGTH': characters divided by words.
    """
    def ratio(numerator, denominator):
        # Text without words or sentences would otherwise raise
        # ZeroDivisionError; report 0.0 for such ratios instead.
        return numerator / denominator if denominator else 0.0

    if text and text.strip():
        blob = TextBlob(text)
        words = list(blob.words)
        sentence_count = len(blob.sentences)
        personal_pronouns = sum(1 for _, pos in blob.tags if pos == 'PRP')

        # Computing polarity and subjectivity
        sentiment = blob.sentiment
        polarity = sentiment.polarity
        subjectivity = sentiment.subjectivity
    else:
        # Nothing to analyse: every count and score is zero
        words = []
        sentence_count = 0
        personal_pronouns = 0
        polarity = 0.0
        subjectivity = 0.0

    word_count = len(words)

    # Estimate syllables once per word and reuse the numbers for both the
    # complex-word count and the syllable total
    syllables_per_word = [syllables.estimate(word) for word in words]
    syllable_count = sum(syllables_per_word)
    complex_word_count = sum(1 for count in syllables_per_word if count > 2)

    avg_sentence_length = ratio(word_count, sentence_count)
    percentage_complex_words = ratio(complex_word_count, word_count) * 100
    avg_syllables_per_word = ratio(syllable_count, word_count)

    # Calculating FOG INDEX
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    # Calculating AVG NUMBER OF WORDS PER SENTENCE
    avg_words_per_sentence = avg_sentence_length

    # Calculating AVG WORD LENGTH: characters per word. This was previously
    # syllables per word, which merely duplicated 'SYLLABLE PER WORD'.
    avg_word_length = ratio(sum(len(word) for word in words), word_count)

    return {
        'POSITIVE SCORE': max(polarity, 0),
        'NEGATIVE SCORE': abs(min(polarity, 0)),
        'POLARITY SCORE': polarity,
        'SUBJECTIVITY SCORE': subjectivity,
        'AVG SENTENCE LENGTH': avg_sentence_length,
        'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
        'FOG INDEX': fog_index,
        'AVG NUMBER OF WORDS PER SENTENCE': avg_words_per_sentence,
        'COMPLEX WORD COUNT': complex_word_count,
        'WORD COUNT': word_count,
        'SYLLABLE PER WORD': avg_syllables_per_word,
        'PERSONAL PRONOUNS': personal_pronouns,
        'AVG WORD LENGTH': avg_word_length,
    }


# Analyse each saved article, keeping the metrics in spreadsheet row order
results = []
for _, record in df.iterrows():
    saved_text_path = os.path.join(output_folder, f"{record['URL_ID']}.txt")

    with open(saved_text_path, 'r', encoding='utf-8') as source:
        article_text = source.read()
        results.append(perform_text_analysis(article_text))

# One row of metrics per article
analysis_df = pd.DataFrame(results)

# Put the metric columns to the right of the original input columns
output_data = pd.concat([df, analysis_df], axis='columns')

# Write the combined table as an Excel workbook
output_data.to_excel('Output.xlsx', index=False)
# output_data.to_csv('Output.csv', index=False)  # CSV alternative

print("Output file created as 'Output.xlsx'")

Output file created as 'Output.xlsx'
