In [24]:
import os
import openpyxl
import requests
from bs4 import BeautifulSoup
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize

# Step 1: Read input data from "Input.xlsx"
input_file = "Input.xlsx"
output_file = "Output Data Structure.xlsx"

# Load input data from Excel file
workbook = openpyxl.load_workbook(input_file)
sheet = workbook.active

# Get the URLs from the input file
urls = []
for row in sheet.iter_rows(min_row=2, values_only=True):
    url_id, url = row
    urls.append((url_id, url))

# Step 2: Extract article text from URLs
def extract_article_text(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the article text by removing unwanted elements
    article_elements = soup.find_all('article')  # Adjust this based on the HTML structure
    article_text = ' '.join([element.get_text() for element in article_elements])
    return article_text.strip()

# Step 3: Save extracted article text in separate files
output_folder = "ExtractedTexts"
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

for url_id, url in urls:
    article_text = extract_article_text(url)
    output_filename = os.path.join(output_folder, f"{url_id}.txt")

    with open(output_filename, 'w', encoding='utf-8') as file:
        file.write(article_text)

# Step 4: Perform text analysis
def calculate_positive_score(text):
    sid = SentimentIntensityAnalyzer()
    sentiment_scores = sid.polarity_scores(text)
    return sentiment_scores['pos']

def calculate_negative_score(text):
    sid = SentimentIntensityAnalyzer()
    sentiment_scores = sid.polarity_scores(text)
    return sentiment_scores['neg']

def calculate_polarity_score(text):
    sid = SentimentIntensityAnalyzer()
    sentiment_scores = sid.polarity_scores(text)
    return sentiment_scores['compound']

def calculate_subjectivity_score(text):
    sid = SentimentIntensityAnalyzer()
    sentiment_scores = sid.polarity_scores(text)
    return sentiment_scores['compound']

def calculate_avg_sentence_length(text):
    sentences = sent_tokenize(text)
    if len(sentences) == 0:
        return 0
    total_words = sum(len(word_tokenize(sentence)) for sentence in sentences)
    if total_words == 0:
        return 0
    return total_words / len(sentences)


def calculate_percentage_complex_words(text):
    tokens = word_tokenize(text)
    if len(tokens) == 0:
        return 0
    complex_words = [word for word in tokens if len(word) > 2 and word.isalpha()]
    return (len(complex_words) / len(tokens)) * 100


def calculate_fog_index(text):
    sentences = sent_tokenize(text)
    total_sentences = len(sentences)
    if total_sentences == 0:
        return 0
    total_words = sum(len(word_tokenize(sentence)) for sentence in sentences)
    if total_words == 0:
        return 0
    complex_words = [word for word in word_tokenize(text) if len(word) > 2 and word.isalpha()]
    num_complex_words = len(complex_words)
    return 0.4 * ((total_words / total_sentences) + 100 * (num_complex_words / total_words))


def calculate_avg_words_per_sentence(text):
    sentences = sent_tokenize(text)
    total_sentences = len(sentences)
    if total_sentences == 0:
        return 0
    total_words = sum(len(word_tokenize(sentence)) for sentence in sentences)
    return total_words / total_sentences


def calculate_complex_word_count(text):
    tokens = word_tokenize(text)
    complex_words = [word for word in tokens if len(word) > 2 and word.isalpha()]
    return len(complex_words)



In [25]:
import nltk
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Akhil\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Akhil\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Akhil\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [40]:
def calculate_complex_word_count(text):
    tokens = word_tokenize(text)
    complex_words = [word for word in tokens if len(word) > 2 and word.isalpha()]
    return len(complex_words)

def calculate_word_count(text):
    tokens = word_tokenize(text)
    return len(tokens)

def calculate_syllables_per_word(text):
    tokens = word_tokenize(text)
    if len(tokens) == 0:
        return 0
    syllable_count = 0
    for token in tokens:
        syllable_count += count_syllables(token)
    return syllable_count / len(tokens)


def calculate_personal_pronouns(text):
    tokens = word_tokenize(text)
    pronouns = [word for word, tag in pos_tag(tokens) if tag == 'PRP' or tag == 'PRP$']
    return len(pronouns)

def calculate_avg_word_length(text):
    tokens = word_tokenize(text)
    if len(tokens) == 0:
        return 0
    total_length = sum(len(word) for word in tokens)
    return total_length / len(tokens)


# Helper function to count syllables in a word
def count_syllables(word):
    vowels = 'aeiouy'
    count = 0
    word = word.lower().strip(".:;?!")

    if not word:
        return count

    if word[0] in vowels:
        count += 1

    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1

    if word.endswith('e'):
        count -= 1

    if word.endswith('le') and len(word) > 2 and word[-3] not in vowels:
        count += 1

    if count == 0:
        count += 1

    return count


# Step 5: Compute variables and save in output file
output_workbook = openpyxl.Workbook()
output_sheet = output_workbook.active

# Write headers in the output file
output_headers = [
    "URL_ID",
    "POSITIVE SCORE",
    "NEGATIVE SCORE",
    "POLARITY SCORE",
    "SUBJECTIVITY SCORE",
    "AVG SENTENCE LENGTH",
    "PERCENTAGE OF COMPLEX WORDS",
    "FOG INDEX",
    "AVG NUMBER OF WORDS PER SENTENCE",
    "COMPLEX WORD COUNT",
    "WORD COUNT",
    "SYLLABLE PER WORD",
    "PERSONAL PRONOUNS",
    "AVG WORD LENGTH"
]
output_sheet.append(output_headers)

import os

# Directory paths for stop words files and master dictionary files
stop_words_folder = "StopWords"
master_dict_folder = "MasterDictionary"

# Load stop words from files in the folder
stop_words = set()
for filename in os.listdir(stop_words_folder):
    file_path = os.path.join(stop_words_folder, filename)
    with open(file_path, 'r', encoding='latin-1') as file:
        for line in file:
            stop_words.add(line.strip())

def remove_stop_words(text, stop_words):
    """
    Remove stop words from the given text.

    Args:
        text (str): Input text.
        stop_words (set): Set of stop words to remove.

    Returns:
        str: Text with stop words removed.
    """
    words = text.split()
    cleaned_words = [word for word in words if word.lower() not in stop_words]
    cleaned_text = " ".join(cleaned_words)
    return cleaned_text

# Load master dictionary from files in the folder
master_dict = set()
for filename in os.listdir(master_dict_folder):
    file_path = os.path.join(master_dict_folder, filename)
    with open(file_path, 'r', encoding='latin-1') as file:
        for line in file:
            master_dict.add(line.strip())

for url_id, url in urls:
    # Read the extracted article text from file
    input_filename = os.path.join(output_folder, f"{url_id}.txt")
    with open(input_filename, 'r', encoding='utf-8') as file:
        article_text = file.read()

    # Clean the text by removing stop words
    cleaned_text = remove_stop_words(article_text, stop_words)

    # Calculate the variables
    positive_score = calculate_positive_score(cleaned_text)
    negative_score = calculate_negative_score(cleaned_text)
    polarity_score = calculate_polarity_score(cleaned_text)
    subjectivity_score = calculate_subjectivity_score(cleaned_text)
    avg_sentence_length = calculate_avg_sentence_length(cleaned_text)
    percentage_complex_words = calculate_percentage_complex_words(cleaned_text)
    fog_index = calculate_fog_index(cleaned_text)
    avg_words_per_sentence = calculate_avg_words_per_sentence(cleaned_text)
    complex_word_count = calculate_complex_word_count(cleaned_text)
    word_count = calculate_word_count(cleaned_text)
    syllable_per_word = calculate_syllables_per_word(cleaned_text)
    personal_pronouns = calculate_personal_pronouns(cleaned_text)
    avg_word_length = calculate_avg_word_length(cleaned_text)

    # Prepare the row data for the output file
    output_row = [
        url_id,
        positive_score,
        negative_score,
        polarity_score,
        subjectivity_score,
        avg_sentence_length,
        percentage_complex_words,
        fog_index,
        avg_words_per_sentence,
        complex_word_count,
        word_count,
        syllable_per_word,
        personal_pronouns,
        avg_word_length
    ]

    # Write the row data to the output file
    output_sheet.append(output_row)


In [41]:
import pandas as pd

df = pd.read_excel("Output Data Structure.xlsx")
df

Unnamed: 0,URL_ID,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37,0.146,0.040,0.9997,0.9997,28.987013,71.146953,40.053587,28.987013,1588,2232,1.771057,41,5.176523
1,38,0.153,0.076,0.9991,0.9991,22.192771,66.829533,35.608922,22.192771,1231,1842,1.518458,74,4.384365
2,39,0.077,0.035,0.9939,0.9939,23.263736,69.957487,37.288489,23.263736,1481,2117,1.717997,45,4.930562
3,40,0.140,0.042,0.9995,0.9995,20.938144,68.685377,35.849408,20.938144,1395,2031,1.539636,65,4.507632
4,41,0.145,0.047,0.9997,0.9997,26.059524,68.067611,37.650854,26.059524,1490,2189,1.590224,74,4.656464
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,146,0.133,0.044,0.9985,0.9985,24.102041,68.924640,37.210672,24.102041,814,1181,1.687553,34,5.041490
110,147,0.111,0.018,0.9995,0.9995,30.515625,65.642601,38.463290,30.515625,1282,1953,1.590886,40,4.682028
111,148,0.100,0.080,0.9533,0.9533,21.602941,67.665078,35.707208,21.602941,994,1469,1.640572,22,4.724302
112,149,0.159,0.011,0.9990,0.9990,32.300000,71.310630,41.444252,32.300000,691,969,1.786378,26,5.250774
