In [None]:
import openpyxl
from bs4 import BeautifulSoup

# Load the input Excel file
input_file = 'input.xlsx'
workbook = openpyxl.load_workbook(input_file)
sheet = workbook.active

# Create a list to store the extracted articles
articles = []

# Iterate over the rows in the Excel file
for row in sheet.iter_rows(values_only=True):
    url_id = row[0]
    html_content = row[1]
    
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Extract the article text (assuming the article is contained in a <div> with class "article")
    article_div = soup.find('div', class_='article')
    article_text = article_div.get_text(strip=True) if article_div else ''
    
    # Append the URL_ID and article text to the list
    articles.append((url_id, article_text))

# Close the input Excel file
workbook.close()


In [None]:
from textblob import TextBlob
import re
import math

# Function to calculate the FOG index
def calculate_fog_index(avg_sentence_length, percentage_complex_words):
    return 0.4 * (avg_sentence_length + percentage_complex_words)

# Function to count the number of syllables in a word
def count_syllables(word):
    # Source: https://eayd.in/?p=232
    vowels = 'aeiouy'
    word = word.lower().strip(".:;?!")
    if word[0] in vowels:
        word = f" {word}"
    return len(re.findall('[^aeiouy][aeiouy]', word))

# Create a list to store the computed variables for each article
computed_variables = []

# Iterate over the extracted articles
for url_id, article_text in articles:
    # Create a TextBlob object for the article text
    blob = TextBlob(article_text)
    
    # Compute the required variables
    positive_score = blob.sentiment.polarity
    negative_score = -positive_score
    polarity_score = blob.sentiment.polarity
    subjectivity_score = blob.sentiment.subjectivity
    
    sentences = blob.sentences
    num_sentences = len(sentences)
    num_words = len(blob.words)
    num_syllables = sum(count_syllables(word) for word in blob.words)
    num_complex_words = sum(1 for word in blob.words if count_syllables(word) > 2)
    
    avg_sentence_length = num_words / num_sentences
    percentage_complex_words = (num_complex_words / num_words) * 100
    fog_index = calculate_fog_index(avg_sentence_length, percentage_complex_words)
    avg_words_per_sentence = num_words / num_sentences
    avg_word_length = sum(len(word) for word in blob.words) / num_words
    
    personal_pronouns = len([word for word, pos in blob.tags if pos == 'PRP' or pos == 'PRP$'])
    
    # Append the computed variables to the list
    computed_variables.append((
        url_id, positive_score, negative_score, polarity_score, subjectivity_score,
        avg_sentence_length, percentage_complex_words, fog_index,
        avg_words_per_sentence, num_complex_words, num_words, num_syllables,
        personal_pronouns, avg_word_length
    ))


In [None]:
# Load the output structure Excel file
output_file = 'Output Data Structure.xlsx'
output_workbook = openpyxl.load_workbook(output_file)
output_sheet = output_workbook.active

# Write the computed variables to the output Excel file
for i, data in enumerate(computed_variables, start=2):
    output_sheet.cell(row=i, column=1, value=data[0])  # URL_ID
    
    # Write the computed variables in the specified order
    for j, value in enumerate(data[1:], start=2):
        output_sheet.cell(row=i, column=j, value=value)

# Save and close the output Excel file
output_workbook.save('output.xlsx')
output_workbook.close()


In [None]:
import openpyxl
from bs4 import BeautifulSoup
import re
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import cmudict

# Load the CMU pronunciation dictionary for syllable count
nltk.download('cmudict')
pronunciation_dict = cmudict.dict()

# Load the input Excel file
input_file = 'input.xlsx'
workbook = openpyxl.load_workbook(input_file)
sheet = workbook.active

# Create a list to store the extracted articles
articles = []

# Iterate over the rows in the Excel file
for row in sheet.iter_rows(values_only=True):
    url_id = row[0]
    html_content = row[1]
    
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Extract the article text (assuming the article is contained in a <div> with class "article")
    article_div = soup.find('div', class_='article')
    article_text = article_div.get_text(strip=True) if article_div else ''
    
    # Append the URL_ID and article text to the list
    articles.append((url_id, article_text))

# Close the input Excel file
workbook.close()

# Create a SentimentIntensityAnalyzer object
sia = SentimentIntensityAnalyzer()

# Create a list to store the computed variables for each article
computed_variables = []

# Iterate over the extracted articles
for url_id, article_text in articles:
    # Compute the sentiment scores
    sentiment_scores = sia.polarity_scores(article_text)
    
    # Tokenize the article into sentences and words
    sentences = sent_tokenize(article_text)
    words = word_tokenize(article_text)
    
    num_sentences = len(sentences)
    num_words = len(words)
    
    # Compute the number of syllables for each word
    num_syllables = sum([len(pronunciation_dict.get(word.lower(), [[]]))[0] for word in words])
    
    # Compute the number of complex words (words with more than two syllables)
    num_complex_words = sum([1 for word in words if len(pronunciation_dict.get(word.lower(), [[]]))[0] > 2])
    
    # Compute the average sentence length (number of words per sentence)
    avg_sentence_length = num_words / num_sentences
    
    # Compute the percentage of complex words
    percentage_complex_words = (num_complex_words / num_words) * 100
    
    # Compute the FOG index
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    
    # Compute the average number of words per sentence
    avg_words_per_sentence = num_words / num_sentences
    
    # Compute the average word length
    avg_word_length = sum(len(word) for word in words) / num_words
    
    # Compute the number of personal pronouns (using pattern matching)
    personal_pronouns = len(re.findall(r'\b(I|me|my|mine|we|us|our|ours|you|your|yours)\b', article_text, re.IGNORECASE))
    
    # Append the computed variables to the list
    computed_variables.append((
        url_id, sentiment_scores['pos'], sentiment_scores['neg'], sentiment_scores['compound'],
        sentiment_scores['compound'], avg_sentence_length, percentage_complex_words,
        fog_index, avg_words_per_sentence, num_complex_words, num_words, num_syllables,
        personal_pronouns, avg_word_length
    ))

# Load the output structure Excel file
output_file = 'Output Data Structure.xlsx'
output_workbook = openpyxl.load_workbook(output_file)
output_sheet = output_workbook.active

# Write the computed variables to the output Excel file
for i, data in enumerate(computed_variables, start=2):
    output_sheet.cell(row=i, column=1, value=data[0])  # URL_ID
    
    # Write the computed variables in the specified order
    for j, value in enumerate(data[1:], start=2):
        output_sheet.cell(row=i, column=j, value=value)

# Save and close the output Excel file
output_workbook.save('output.xlsx')
output_workbook.close()


In [None]:
import nltk

nltk.download('vader_lexicon')
nltk.download('punkt')


In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from textblob import TextBlob

# Step 1: Read the input Excel file using pandas
input_file = "input.xlsx"
output_file = "output.xlsx"
input_data = pd.read_excel(input_file)

# Create an empty DataFrame to store the computed variables
output_data = pd.DataFrame(columns=[
    "URL_ID", "POSITIVE SCORE", "NEGATIVE SCORE", "POLARITY SCORE", "SUBJECTIVITY SCORE",
    "AVG SENTENCE LENGTH", "PERCENTAGE OF COMPLEX WORDS", "FOG INDEX",
    "AVG NUMBER OF WORDS PER SENTENCE", "COMPLEX WORD COUNT", "WORD COUNT",
    "SYLLABLE PER WORD", "PERSONAL PRONOUNS", "AVG WORD LENGTH"
])

# Loop through each row of the input file
for index, row in input_data.iterrows():
    url_id = row["URL_ID"]
    article_url = row["Article_URL"]

    # Step 2: Extract the article text using requests and BeautifulSoup libraries
    response = requests.get(article_url)
    soup = BeautifulSoup(response.content, "html.parser")
    
    # Extract the article title and text
    article_title = soup.find("title").text
    article_text = soup.find("body").text

    # Step 3: Clean the extracted text by removing unwanted characters, headers, footers, etc.
    # Add your cleaning logic here

    # Step 4: Perform textual analysis on the cleaned text
    blob = TextBlob(article_text)
    positive_score = blob.sentiment.polarity
    negative_score = -positive_score
    polarity_score = blob.sentiment.polarity
    subjectivity_score = blob.sentiment.subjectivity
    avg_sentence_length = blob.sentences.mean(lambda sentence: len(sentence.split()))
    words = blob.words
    total_words = len(words)
    complex_words = [word for word in words if len(word) > 2 and len(word.syllables) > 2]
    complex_word_count = len(complex_words)
    percentage_complex_words = (complex_word_count / total_words) * 100
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    avg_words_per_sentence = total_words / len(blob.sentences)
    syllable_count = sum(len(word.syllables) for word in words) / total_words
    personal_pronouns = sum(1 for word in words if word.lower() in ["i", "me", "my", "mine", "we", "us", "our", "ours"])
    avg_word_length = sum(len(word) for word in words) / total_words

    # Step 5: Store the computed variables in the output DataFrame
    output_data = output_data.append({
        "URL_ID": url_id,
        "POSITIVE SCORE": positive_score,
        "NEGATIVE SCORE": negative_score,
        "POLARITY SCORE": polarity_score,
        "SUBJECTIVITY SCORE": subjectivity_score,
        "AVG SENTENCE LENGTH": avg_sentence_length,
        "PERCENTAGE OF COMPLEX WORDS": percentage_complex_words,
        "FOG INDEX": fog_index,
        "AVG NUMBER OF WORDS PER SENTENCE": avg_words_per_sentence,
        "COMPLEX WORD COUNT": complex_word_count,
        "WORD COUNT": total_words,
        "SYLLABLE PER WORD": syllable_count,
        "PERSONAL PRONOUNS": personal_pronouns,
        "AVG WORD LENGTH": avg_word_length
    }, ignore_index=True)

# Step 6: Save the DataFrame to an output Excel file
output_data.to_excel(output_file, index=False)


In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from textblob import TextBlob
from nltk.corpus import cmudict

# Step 1: Data Extraction
input_file = "input.xlsx"
output_file = "Output Data Structure.xlsx"

# Read input file
input_data = pd.read_excel(input_file)

# Iterate over URLs and extract article text
for index, row in input_data.iterrows():
    url_id = row['URL_ID']
    url = row['URL']

    # Send a GET request to the URL
    response = requests.get(url)

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract article title and text
    article_title = soup.find('h1').text
    article_text = soup.find('article').text

    # Save the extracted article in a text file
    with open(f"{url_id}.txt", "w", encoding="utf-8") as file:
        file.write(article_title + "\n" + article_text)

# Step 2: Data Analysis
# Load CMU Pronouncing Dictionary for syllable count
cmud = cmudict.dict()

# Read the extracted articles and perform analysis
output_data = []
for index, row in input_data.iterrows():
    url_id = row['URL_ID']
    file_name = f"{url_id}.txt"

    # Read the article text from the text file
    with open(file_name, "r", encoding="utf-8") as file:
        article_text = file.read()

    # Perform text analysis using TextBlob
    blob = TextBlob(article_text)

    # Calculate variables
    positive_score = blob.sentiment.polarity
    negative_score = -blob.sentiment.polarity
    polarity_score = blob.sentiment.polarity
    subjectivity_score = blob.sentiment.subjectivity

    sentences = blob.sentences
    word_count = len(blob.words)
    sentence_count = len(sentences)
    syllable_count = sum([len(cmud[word.lower()][0]) for word in blob.words if word.lower() in cmud])

    avg_sentence_length = word_count / sentence_count
    complex_words_count = len([word for word in blob.words if len(cmud[word.lower()]) > 1])
    percentage_complex_words = (complex_words_count / word_count) * 100
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    avg_words_per_sentence = word_count / sentence_count
    syllables_per_word = syllable_count / word_count
    personal_pronouns = blob.word_counts['i'] + blob.word_counts['me'] + blob.word_counts['my'] + blob.word_counts['mine']

    avg_word_length = sum(len(word) for word in blob.words) / word_count

    # Add the variables to the output data list
    output_data.append([url_id, positive_score, negative_score, polarity_score, subjectivity_score, avg_sentence_length,
                        percentage_complex_words, fog_index, avg_words_per_sentence, complex_words_count, word_count,
                        syllables_per_word, personal_pronouns, avg_word_length])

# Step 3: Save the output data to an Excel file
output_df = pd.DataFrame(output_data, columns=['URL_ID', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE',
                                               'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS',
                                               'FOG INDEX', 'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT',
                                               'WORD COUNT', 'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH'])
output_df.to_excel(output_file, index=False)


In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from textblob import TextBlob
from nltk.corpus import cmudict

# Step 1: Data Extraction
input_file = "input.xlsx"
output_file = "Output Data Structure.xlsx"

# Read input file
input_data = pd.read_excel(input_file)

# Iterate over URLs and extract article text
for index, row in input_data.iterrows():
    url_id = row['URL_ID']
    url = row['URL']

    # Send a GET request to the URL
    response = requests.get(url)

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the article content based on specific selectors or techniques
    # Modify the selectors or techniques based on the HTML structure of the articles
    article_content = soup.find('div', class_='article-content')  # Example selector

    # Extract article title and text
    if article_content:
        article_title = article_content.find('h1').text.strip()
        article_text = article_content.find('div', class_='article-text').text.strip()

        # Save the extracted article in a text file
        with open(f"{url_id}.txt", "w", encoding="utf-8") as file:
            file.write(article_title + "\n" + article_text)
    else:
        print(f"Could not extract content from URL: {url}")

# Step 2: Data Analysis
# Load CMU Pronouncing Dictionary for syllable count
cmud = cmudict.dict()

# Read the extracted articles and perform analysis
output_data = []
for index, row in input_data.iterrows():
    url_id = row['URL_ID']
    file_name = f"{url_id}.txt"

    # Read the article text from the text file
    with open(file_name, "r", encoding="utf-8") as file:
        article_text = file.read()

    # Perform text analysis using TextBlob
    blob = TextBlob(article_text)

    # Calculate variables
    positive_score = blob.sentiment.polarity
    negative_score = -blob.sentiment.polarity
    polarity_score = blob.sentiment.polarity
    subjectivity_score = blob.sentiment.subjectivity
    sentences = blob.sentences
    word_count = len(blob.words)
    
    sentence_count = len(sentences)
    syllable_count = sum([len(cmud[word.lower()][0]) for word in blob.words if word.lower() in cmud])
    cmud['word'] = syllable_count
    
    avg_sentence_length = word_count / sentence_count
    complex_words_count = len([word for word in blob.words if len(cmud[word.lower()]) > 1])
    percentage_complex_words = (complex_words_count / word_count) * 100
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    avg_words_per_sentence = word_count / sentence_count
    syllables_per_word = syllable_count / word_count
    personal_pronouns = blob.word_counts['i'] + blob.word_counts['me'] + blob.word_counts['my'] + blob.word_counts['mine']

    avg_word_length = sum(len(word) for word in blob.words) / word_count

    # Add the variables to the output data list
    output_data.append([url_id, positive_score, negative_score, polarity_score, subjectivity_score, avg_sentence_length,
                        percentage_complex_words, fog_index, avg_words_per_sentence, complex_words_count, word_count,
                        syllables_per_word, personal_pronouns, avg_word_length])

# Step 3: Save the output data to an Excel file
output_df = pd.DataFrame(output_data, columns=['URL_ID', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE',
                                               'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS',
                                               'FOG INDEX', 'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT',
                                               'WORD COUNT', 'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH'])
output_df.to_excel(output_file, index=False)


Could not extract content from URL: https://insights.blackcoffer.com/ai-in-healthcare-to-improve-patient-outcomes/
Could not extract content from URL: https://insights.blackcoffer.com/what-if-the-creation-is-taking-over-the-creator/
Could not extract content from URL: https://insights.blackcoffer.com/what-jobs-will-robots-take-from-humans-in-the-future/
Could not extract content from URL: https://insights.blackcoffer.com/will-machine-replace-the-human-in-the-future-of-work/
Could not extract content from URL: https://insights.blackcoffer.com/will-ai-replace-us-or-work-with-us/
Could not extract content from URL: https://insights.blackcoffer.com/man-and-machines-together-machines-are-more-diligent-than-humans-blackcoffe/
Could not extract content from URL: https://insights.blackcoffer.com/in-future-or-in-upcoming-years-humans-and-machines-are-going-to-work-together-in-every-field-of-work/
Could not extract content from URL: https://insights.blackcoffer.com/how-neural-networks-can-be-app

Could not extract content from URL: https://insights.blackcoffer.com/what-is-the-repercussion-of-the-environment-due-to-the-covid-19-pandemic-situation/
Could not extract content from URL: https://insights.blackcoffer.com/what-is-the-repercussion-of-the-environment-due-to-the-covid-19-pandemic-situation-2/
Could not extract content from URL: https://insights.blackcoffer.com/impact-of-covid-19-pandemic-on-office-space-and-co-working-industries/
Could not extract content from URL: https://insights.blackcoffer.com/contribution-of-handicrafts-visual-arts-literature-in-the-indian-economy/
Could not extract content from URL: https://insights.blackcoffer.com/how-covid-19-is-impacting-payment-preferences/
Could not extract content from URL: https://insights.blackcoffer.com/how-will-covid-19-affect-the-world-of-work-2/
Could not extract content from URL: https://insights.blackcoffer.com/lessons-from-the-past-some-key-learnings-relevant-to-the-coronavirus-crisis/
Could not extract content from U

NameError: name 'syllable_count' is not defined