In [1]:
import os
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests

nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/drax/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Extraction

In [2]:
def extract_article(url):
    response = requests.get(url)
    if response.status_code != 200:
        return None, None

    soup = BeautifulSoup(response.content, 'html.parser')
    
    title = soup.find('h1', class_='entry-title')
    if title is None:
        title = soup.find('h1', class_='tdb-title-text')
    
    title_text = title.get_text(strip=True) if title else None

    article_content = soup.find('div', class_='td-post-content')
    if article_content is None:
        article_content = soup.find('div', class_='tdb-block-inner')
    
    article_text = article_content.get_text(strip=True) if article_content else None

    return title_text, article_text

input_file = 'Input.xlsx'
df = pd.read_excel(input_file)

output_dir = 'extracted_articles'
os.makedirs(output_dir, exist_ok=True)

for index, row in df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    print(f'Extracting article from URL ID: {url_id}')
    
    title, article_text = extract_article(url)
    if title and article_text:
        file_path = os.path.join(output_dir, f'{url_id}.txt')
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(title + '\n' + article_text)
    else:
        print(f'Failed to extract article from URL ID: {url_id}')

print('Extraction complete.')


Extracting article from URL ID: blackassign0001
Extracting article from URL ID: blackassign0002
Extracting article from URL ID: blackassign0003
Extraction complete.


Words HashSet

In [3]:
def load_stop_words(path, encoding='latin1'):
    stop_words = set()
    for file in os.listdir(path):
        if file.endswith(".txt"):
            with open(os.path.join(path, file), 'r', encoding=encoding) as f:
                words = f.read().split()
                stop_words.update(words)
    return stop_words


In [4]:
def load_dictionary(file_path, encoding='latin1'):
    dictionary = set()
    with open(file_path, 'r', encoding=encoding) as f:
        words = f.read().split()
        dictionary.update(words)
    return dictionary

Remove special characters, all in lower case and tokenize

In [5]:
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

Score Calculation

In [6]:
positive_words = {'sample', 'longer'}
negative_words = {'complex'}

sample_text = "This is a sample sentence. This is another sample sentence, which is longer and more complex."
sample_tokens = word_tokenize(sample_text)

In [7]:
def calculate_positive_score(tokens):
    return sum(1 for word in tokens if word in positive_words)

In [8]:
positive_score = calculate_positive_score(sample_tokens)
print(positive_score)

3


In [9]:
def calculate_negative_score(tokens):
    return sum(1 for word in tokens if word in negative_words)

In [10]:
negative_score = calculate_negative_score(sample_tokens)
print(negative_score)

1


In [11]:
def calculate_polarity_score(positive_score, negative_score):
    return (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)

In [12]:
polarity_score = calculate_polarity_score(positive_score, negative_score)
print(polarity_score)

0.49999987500003124


In [13]:
def calculate_subjectivity_score(positive_score, negative_score, total_words):
    return (positive_score + negative_score) / (total_words + 0.000001)

In [14]:
total_words = len(sample_tokens)
subjectivity_score = calculate_subjectivity_score(positive_score, negative_score, total_words)
print(subjectivity_score)

0.21052630470914185


In [15]:
def calculate_avg_sentence_length(text):
    sentences = sent_tokenize(text)
    words = word_tokenize(text)
    return len(words) / len(sentences) if sentences else 0

In [19]:
avg_sentence_length = calculate_avg_sentence_length(sample_text)
print(avg_sentence_length)

9.5


In [17]:
def calculate_percentage_complex_words(tokens):
    complex_words = [word for word in tokens if sum(1 for char in word if char in 'aeiou') > 2]
    return len(complex_words) / len(tokens) if tokens else 0

In [20]:
percentage_complex_words = calculate_percentage_complex_words(sample_tokens)
print(percentage_complex_words)

0.15789473684210525


In [21]:
def calculate_fog_index(avg_sentence_length, percentage_complex_words):
    return 0.4 * (avg_sentence_length + percentage_complex_words)

In [22]:
fog_index = calculate_fog_index(avg_sentence_length, percentage_complex_words)
print(fog_index)

3.863157894736842


In [23]:
def calculate_avg_words_per_sentence(text):
    sentences = sent_tokenize(text)
    words = word_tokenize(text)
    return len(words) / len(sentences) if sentences else 0

In [24]:
avg_words_per_sentence = calculate_avg_words_per_sentence(sample_text)
print(avg_words_per_sentence)

9.5


In [25]:
def count_complex_words(tokens):
    return sum(1 for word in tokens if sum(1 for char in word if char in 'aeiou') > 2)

In [26]:
complex_word_count = count_complex_words(sample_tokens)
print(complex_word_count)

3


In [27]:
def count_syllables(word):
    word = word.lower()
    syllables = 0
    vowels = "aeiouy"
    if word[0] in vowels:
        syllables += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            syllables += 1
    if word.endswith("e"):
        syllables -= 1
    if word.endswith("le") and len(word) > 2 and word[-3] not in vowels:
        syllables += 1
    if syllables == 0:
        syllables += 1
    return syllables

In [28]:
sample_word = "complex"
syllable_count = count_syllables(sample_word)
print(syllable_count)

2


In [29]:
def calculate_syllables_per_word(tokens):
    return np.mean([count_syllables(word) for word in tokens])

In [30]:
syllables_per_word = calculate_syllables_per_word(sample_tokens)
print(syllables_per_word)

1.4210526315789473


In [31]:
def count_personal_pronouns(text):
    pronouns = re.findall(r'\b(I|we|my|ours|us)\b', text, re.I)
    return len(pronouns)

In [32]:
personal_pronouns_count = count_personal_pronouns(sample_text)
print(personal_pronouns_count)

0


In [33]:
def calculate_avg_word_length(tokens):
    return np.mean([len(word) for word in tokens])

In [34]:
avg_word_length = calculate_avg_word_length(sample_tokens)
print(avg_word_length)

4.105263157894737


In [35]:
stop_words = load_stop_words('StopWords')
positive_words = load_dictionary('MasterDictionary/positive-words.txt')
negative_words = load_dictionary('MasterDictionary/negative-words.txt')

In [36]:
def analyze_text(title, text):
    cleaned_tokens = clean_text(text)
    total_words = len(cleaned_tokens)
    positive_score = calculate_positive_score(cleaned_tokens)
    negative_score = calculate_negative_score(cleaned_tokens)
    polarity_score = calculate_polarity_score(positive_score, negative_score)
    subjectivity_score = calculate_subjectivity_score(positive_score, negative_score, total_words)
    avg_sentence_length = calculate_avg_sentence_length(text)
    percentage_complex_words = calculate_percentage_complex_words(cleaned_tokens)
    fog_index = calculate_fog_index(avg_sentence_length, percentage_complex_words)
    avg_words_per_sentence = calculate_avg_words_per_sentence(text)
    complex_word_count = count_complex_words(cleaned_tokens)
    syllables_per_word = calculate_syllables_per_word(cleaned_tokens)
    personal_pronouns = count_personal_pronouns(text)
    avg_word_length = calculate_avg_word_length(cleaned_tokens)

    return {
        "POSITIVE SCORE": positive_score,
        "NEGATIVE SCORE": negative_score,
        "POLARITY SCORE": polarity_score,
        "SUBJECTIVITY SCORE": subjectivity_score,
        "AVG SENTENCE LENGTH": avg_sentence_length,
        "PERCENTAGE OF COMPLEX WORDS": percentage_complex_words,
        "FOG INDEX": fog_index,
        "AVG NUMBER OF WORDS PER SENTENCE": avg_words_per_sentence,
        "COMPLEX WORD COUNT": complex_word_count,
        "WORD COUNT": total_words,
        "SYLLABLE PER WORD": syllables_per_word,
        "PERSONAL PRONOUNS": personal_pronouns,
        "AVG WORD LENGTH": avg_word_length
    }

Get extracted text

In [37]:
def extract_articles_from_dir(dir_path):
    articles = {}
    for filename in os.listdir(dir_path):
        if filename.endswith(".txt"):
            with open(os.path.join(dir_path, filename), 'r', encoding='utf-8') as file:
                lines = file.readlines()
                if len(lines) >= 2:
                    title = lines[0].strip()
                    text = ' '.join(lines[1:]).strip()
                    articles[filename.replace('.txt', '')] = (title, text)
    return articles

Start analysis

In [38]:
def process_and_save_results(input_excel, output_excel, articles_dir):
    input_df = pd.read_excel(input_excel)

    articles = extract_articles_from_dir(articles_dir)
    
    results = []

    for index, row in input_df.iterrows():
        url_id = row['URL_ID']
        if url_id in articles:
            title, text = articles[url_id]
            analysis_result = analyze_text(title, text)
            analysis_result['URL_ID'] = url_id
            results.append(analysis_result)
    
    results_df = pd.DataFrame(results)
    
    output_df = input_df.merge(results_df, on='URL_ID', how='left')

    for col in results_df.columns:
        if col in output_df.columns:
            output_df[col] = results_df[col]

    columns_order = ['URL_ID', 'URL', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE',
                    'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS',
                    'FOG INDEX', 'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT',
                    'WORD COUNT', 'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH']
    
    output_df = output_df[columns_order]
    output_df.to_excel(output_excel, index=False)

In [39]:
input_excel = 'Input.xlsx'
output_excel = 'Output Data Structure.xlsx'
articles_dir = 'extracted_articles'

process_and_save_results(input_excel, output_excel, articles_dir)

Sample Result

In [40]:
df = pd.read_excel(output_excel)
df.head()



Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,33,5,0.736842,0.071028,22.186441,0.478505,9.065978,22.186441,256,535,2.24486,12,6.968224
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,57,27,0.357143,0.107417,26.387097,0.609974,10.798828,26.387097,477,782,2.566496,6,7.636829
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,38,24,0.225806,0.100977,25.847826,0.661238,10.603626,25.847826,406,614,2.872964,13,8.433225
