In [1]:
import requests
import pandas as pd
import nltk
import re
import os
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import cmudict
from nltk import pos_tag
from bs4 import BeautifulSoup 
import time
import os
from urllib.parse import urljoin, urlparse

In [2]:
df=pd.read_excel('Input.xlsx')

In [3]:
output_dir = 'extracted_articles'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    print(f"Created directory: {output_dir}")


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1'
}


In [4]:
def is_valid_paragraph(tag):
    text = tag.get_text(strip=True).lower()
    return (
        tag.name not in ['a', 'img'] and
        'http' not in text and
        'www' not in text and
        'contact' not in text and
        'email' not in text and
        'phone' not in text
    )

In [5]:
for index, row in df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    
    print(f"\nProcessing {index+1}/{len(df)}: {url_id}")
    print(f"URL: {url}")
    
    title = ""
    article_text = ""
    try:
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.content.decode('utf-8', 'ignore'), 'html.parser')
        
        title_container = soup.find('div', class_='td-full-screen-header-image-wrap')
        if title_container:
            title_tag = title_container.find('h1', class_='entry-title')
            if title_tag:
                title = title_tag.get_text().strip()
                
            else:
                print("Title container found but no h1.entry-title inside")
        else:
            print("No td-full-screen-header-image-wrap found, trying fallback...")
            title_tag = soup.find('h1', class_='entry-title')
            if title_tag:
                title = title_tag.get_text().strip()
                
            else:
                print("No entry-title found.")
    
    
        
        content_div = soup.find('div', class_='td-post-content tagdiv-type')
        if content_div:
            text_chunks = []
            for tag in content_div.find_all(['p', 'div', 'span', 'li', 'h1']):
                if is_valid_paragraph(tag):
                    clean_text = tag.get_text(strip=True)
                    if clean_text:
                        text_chunks.append(clean_text)
            article_text = "\n".join(text_chunks)
            
        else:
            print("Content div not found.")
            article_text = "NO_CONTENT"
            
            
        # Combine title and content
        full_text = f"Title: {title}\n\n{article_text}"

        # Save to file
        filename = f"{url_id}.txt"
        filepath = os.path.join(output_dir, filename)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(full_text)

        print(f" Successfully extracted and saved to {filename}")
        print(f"  Title: {title[:100]}...")
        print(f"  Content length: {len(article_text)} characters")
        
    except requests.exceptions.RequestException as e:
        print(f"✗ Request failed: {e}")
        error_filename = f"{url_id}_ERROR.txt"
        error_filepath = os.path.join(output_dir, error_filename)
        with open(error_filepath, 'w', encoding='utf-8') as f:
            f.write(f"Error extracting content from {url}\nError: {str(e)}")
            
    except Exception as e:
        print(f"✗ Unexpected error: {e}")
        error_filename = f"{url_id}_ERROR.txt"
        error_filepath = os.path.join(output_dir, error_filename)
        with open(error_filepath, 'w', encoding='utf-8') as f:
            f.write(f"Error extracting content from {url}\nError: {str(e)}")

    
    time.sleep(2)

print(f"\n Processing complete! Check the '{output_dir}' directory for extracted articles.")
print(f"Total files processed: {len(df)}")



Processing 1/147: Netclan20241017
URL: https://insights.blackcoffer.com/ai-and-ml-based-youtube-analytics-and-content-creation-tool-for-optimizing-subscriber-engagement-and-content-strategy/
 Successfully extracted and saved to Netclan20241017.txt
  Title: AI and ML-Based YouTube Analytics and Content Creation Tool for Optimizing Subscriber Engagement and...
  Content length: 1335 characters

Processing 2/147: Netclan20241018
URL: https://insights.blackcoffer.com/enhancing-front-end-features-and-functionality-for-improved-user-experience-and-dashboard-accuracy-in-partner-hospital-application/
 Successfully extracted and saved to Netclan20241018.txt
  Title: Enhancing Front-End Features and Functionality for Improved User Experience and Dashboard Accuracy i...
  Content length: 4505 characters

Processing 3/147: Netclan20241019
URL: https://insights.blackcoffer.com/roas-dashboard-for-campaign-wise-google-ads-budget-tracking-using-google-ads-ap/
 Successfully extracted and saved to Netc

In [6]:
stopwords_dir = r"C:\Users\Dell\Documents\News Summarization and Text-to-Speech Application\Blackcoffer\StopWords-20250801T045656Z-1-001\StopWords"
textdir = r"C:\Users\Dell\Documents\News Summarization and Text-to-Speech Application\Blackcoffer\extracted_articles"

In [7]:
stop_words = set()

print(f"\n Loading stopwords from directory: {stopwords_dir}\n")

for file in os.listdir(stopwords_dir):
    if file.endswith('.txt'):
        file_path = os.path.join(stopwords_dir, file)
        try:
            print(f"Loading from: {file}")
            with open(file_path, 'r', encoding='latin-1') as f:
                for line in f:
                    # Split on spaces and pipes
                    words = re.split(r'\s+|\|', line.strip())
                    stop_words.update([w.lower() for w in words if w.strip()])
        except Exception as e:
            print(f" Error loading stopwords from {file}: {e}")

print(f"\n Total unique stopwords loaded: {len(stop_words)}")



 Loading stopwords from directory: C:\Users\Dell\Documents\News Summarization and Text-to-Speech Application\Blackcoffer\StopWords-20250801T045656Z-1-001\StopWords

Loading from: StopWords_Auditor.txt
Loading from: StopWords_Currencies.txt
Loading from: StopWords_DatesandNumbers.txt
Loading from: StopWords_Generic.txt
Loading from: StopWords_GenericLong.txt
Loading from: StopWords_Geographic.txt
Loading from: StopWords_Names.txt

 Total unique stopwords loaded: 12839


In [10]:
positive_words_path = r"C:\Users\Dell\Documents\News Summarization and Text-to-Speech Application\Blackcoffer\MasterDictionary-20250801T045656Z-1-001/MasterDictionary\positive-words.txt"
negative_words_path = r"C:\Users\Dell\Documents\News Summarization and Text-to-Speech Application\Blackcoffer\MasterDictionary-20250801T045656Z-1-001/MasterDictionary\negative-words.txt"

positive_words = set()
negative_words = set()

# Load positive words (excluding stopwords)
try:
    with open(positive_words_path, 'r', encoding='latin-1') as f:
        for line in f:
            word = line.strip().lower()
            if word and word not in stop_words:
                positive_words.add(word)
    print(f"\n Loaded {len(positive_words)} positive words")
except Exception as e:
    print(f" Error loading positive words: {e}")

# Load negative words (excluding stopwords)
try:
    with open(negative_words_path, 'r', encoding='latin-1') as f:
        for line in f:
            word = line.strip().lower()
            if word and word not in stop_words:
                negative_words.add(word)
    print(f" Loaded {len(negative_words)} negative words")
except Exception as e:
    print(f"Error loading negative words: {e}")



 Loaded 1906 positive words
 Loaded 4693 negative words


In [11]:
pron_dict = cmudict.dict()
def count_syllables(word):
    word = word.lower()
    if word in pron_dict:
        return len([ph for ph in pron_dict[word][0] if ph[-1].isdigit()])
    else:
        return 1 

In [12]:
docs = []
results = []
sent = []

print(f"\nProcessing text files in: {textdir}\n")
for text_file in os.listdir(textdir):
    if text_file.endswith('.txt') and not text_file.endswith('_ERROR.txt'):
        file_path = os.path.join(textdir, text_file)
        cleaned_file_name = re.sub(r'[^\w\s.-]', '', text_file)
        try:
            with open(file_path, 'r', encoding='latin-1') as f:
                text = f.read()
                # Tokenize
                sentences = sent_tokenize(text)
                words = word_tokenize(text)
                # Lowercase and filter
                filtered_text = [word.lower() for word in words if word.lower() not in stop_words and word.isalnum()]
                docs.append(filtered_text)
                sent.append(sentences)
                
                total_words = len(filtered_text)
                total_sentences = len(sent)
                
                # Positive/Negative scoring
                pos_score = sum(1 for word in filtered_text if word in positive_words)
                neg_score = sum(1 for word in filtered_text if word in negative_words)
                polarity_score = (pos_score - neg_score) / ((pos_score + neg_score) + 0.000001)
                subjectivity_score = (pos_score + neg_score) / (total_words + 0.000001)
                
                Average_Sentence_Length = total_words/total_sentences
                
                # Complex Words (more than 2 syllables)
                complex_word_count = sum(1 for word in filtered_text if count_syllables(word) > 2)
                Percentage_of_Complex_words = complex_word_count / total_words
                
                # Fog Index
                fog_index = 0.4*(Average_Sentence_Length + Percentage_of_Complex_words)
                
                # Syllable Count Per Word (average)
                total_syllables = sum(count_syllables(word) for word in filtered_text)
                syllable_per_word = total_syllables / total_words
                
                # Personal pronouns via POS tagging
                tagged_words = pos_tag(word_tokenize(text))
                personal_pronouns = sum(
                    1 for word, tag in tagged_words 
                    if tag in ['PRP', 'PRP$'] and word.lower() in ['i', 'we', 'my', 'ours', 'us']
                )
                
                # Avg word length
                total_chars = sum(len(word) for word in filtered_text)
                avg_word_length = total_chars / total_words
                
                results.append({
                    "FILENAME": text_file.replace(".txt", ""),
                    "POSITIVE SCORE": pos_score,
                    "NEGATIVE SCORE": neg_score,
                    "POLARITY SCORE": round(polarity_score, 4),
                    "SUBJECTIVITY SCORE": round(subjectivity_score, 4),
                    "AVG SENTENCE LENGTH": round(Average_Sentence_Length, 4),
                    "PERCENTAGE OF COMPLEX WORDS": round(Percentage_of_Complex_words, 4),
                    "FOG INDEX" : round(fog_index, 4),
                    "AVG NUMBER OF WORDS PER SENTENCE" : round(Average_Sentence_Length, 4),
                    "COMPLEX WORD COUNT": complex_word_count,
                    "WORD COUNT" : total_words,
                    "SYLLABLE PER WORD" : round(syllable_per_word,4),
                    "PERSONAL PRONOUNS" : personal_pronouns,
                    "AVG WORD LENGTH" : round(avg_word_length,4)
                    
                })

                print(f"{text_file}: Pos={pos_score}, Neg={neg_score}, Polarity={polarity_score:.4f}, Subjectivity={subjectivity_score:.4f}")
        except Exception as e:
            print(f" Error processing file {cleaned_file_name}: {e}")

df_scores = pd.DataFrame(results)


Processing text files in: C:\Users\Dell\Documents\News Summarization and Text-to-Speech Application\Blackcoffer\extracted_articles

Netclan20241017.txt: Pos=5, Neg=1, Polarity=0.6667, Subjectivity=0.0469
Netclan20241018.txt: Pos=9, Neg=4, Polarity=0.3846, Subjectivity=0.0394
Netclan20241019.txt: Pos=11, Neg=4, Polarity=0.4667, Subjectivity=0.0593
Netclan20241020.txt: Pos=23, Neg=12, Polarity=0.3143, Subjectivity=0.0926
Netclan20241021.txt: Pos=15, Neg=3, Polarity=0.6667, Subjectivity=0.0296
Netclan20241022.txt: Pos=5, Neg=1, Polarity=0.6667, Subjectivity=0.0469
Netclan20241023.txt: Pos=9, Neg=4, Polarity=0.3846, Subjectivity=0.0394
Netclan20241024.txt: Pos=11, Neg=4, Polarity=0.4667, Subjectivity=0.0593
Netclan20241025.txt: Pos=23, Neg=12, Polarity=0.3143, Subjectivity=0.0926
Netclan20241026.txt: Pos=56, Neg=22, Polarity=0.4359, Subjectivity=0.0709
Netclan20241027.txt: Pos=18, Neg=7, Polarity=0.4400, Subjectivity=0.0828
Netclan20241028.txt: Pos=27, Neg=3, Polarity=0.8000, Subjectivity

In [13]:
output_df = pd.read_excel("Output Data Structure.xlsx")

df_scores["FILENAME"] = df_scores["FILENAME"].astype(str).str.strip()
output_df["URL_ID"] = output_df["URL_ID"].astype(str).str.strip()

df_scores.set_index("FILENAME", inplace=True)

for col in [
    "POSITIVE SCORE", "NEGATIVE SCORE", "POLARITY SCORE", "SUBJECTIVITY SCORE",
    "AVG SENTENCE LENGTH", "PERCENTAGE OF COMPLEX WORDS", "FOG INDEX",
    "AVG NUMBER OF WORDS PER SENTENCE", "COMPLEX WORD COUNT", "WORD COUNT",
    "SYLLABLE PER WORD", "PERSONAL PRONOUNS", "AVG WORD LENGTH"
]:
    output_df[col] = output_df["URL_ID"].map(df_scores[col])
    
output_df.to_excel("Output Data Structure.xlsx", index=False)
print("\n Scores added to original Output Data Structure.xlsx")


 Scores added to original Output Data Structure.xlsx
