In [25]:
import requests
from bs4 import BeautifulSoup
import os
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
import re


In [26]:
def clean_text(text):
    """Collapse every run of whitespace in ``text`` to one space and trim the ends.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string; newlines and tabs count as whitespace, so the
        result is always a single line.
    """
    single_spaced = re.sub(r'\s+', ' ', text)
    return single_spaced.strip()

In [27]:
#Extracting the Title and the main text
def extract_article_text(url, timeout=30):
    """Download a web page and return its cleaned title and paragraph text.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up, so one
            unresponsive host cannot stall the whole scraping loop.

    Returns:
        A ``(title, article_text)`` tuple of whitespace-normalised strings.
        ``title`` is ``''`` when the page has no ``<title>`` element.
        ``(None, None)`` is returned when any step fails; the error is
        printed rather than raised so callers can continue with other URLs.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx responses as failures instead of scraping the
        # server's error page as if it were the article.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Drop non-content elements so their text does not end up in the output.
        for element in soup.find_all(['script', 'style', 'header', 'footer']):
            element.extract()

        # get_text() is used instead of .string because .string is None when
        # <title> contains nested markup, which would make .strip() raise.
        title = soup.title.get_text() if soup.title else ''

        # Article body: the text of every <p> element, one per line.
        article_text = '\n'.join(
            paragraph.get_text() for paragraph in soup.find_all('p')
        )

        # Clean the extracted text
        title = clean_text(title)
        article_text = clean_text(article_text)

        return title, article_text
    except Exception as e:
        # Deliberate best-effort: report and signal failure with (None, None).
        print(f"Error extracting text from {url}: {str(e)}")
        return None, None






In [28]:
# Dictionary of URL_IDs and URLs
# Maps each article's URL_ID (key) to the address of the page to scrape (value).
# The scraping loop interpolates the key into the output file name as
# f"{url_id}.txt", so an int key such as 123 produces "123.txt" and a float
# key such as 2893.8 produces "2893.8.txt".
# NOTE(review): keys mix int and float literals; presumably they mirror IDs
# from an external input sheet — confirm before normalising their type.
url_data = {
    123: "https://insights.blackcoffer.com/rise-of-telemedicine-and-its-impact-on-livelihood-by-2040-3-2/",
    321: "https://insights.blackcoffer.com/rise-of-e-health-and-its-impact-on-humans-by-the-year-2030/",
    2345: "https://insights.blackcoffer.com/rise-of-e-health-and-its-imapct-on-humans-by-the-year-2030-2/",
    4321: "https://insights.blackcoffer.com/rise-of-telemedicine-and-its-impact-on-livelihood-by-2040-2/",
    432: "https://insights.blackcoffer.com/rise-of-telemedicine-and-its-impact-on-livelihood-by-2040-2-2/",
    2893.8: "https://insights.blackcoffer.com/rise-of-chatbots-and-its-impact-on-customer-support-by-the-year-2040/",
    3355.6: "https://insights.blackcoffer.com/rise-of-e-health-and-its-imapct-on-humans-by-the-year-2030/",
    3817.4: "https://insights.blackcoffer.com/how-does-marketing-influence-businesses-and-consumers/",
    4279.2: "https://insights.blackcoffer.com/how-advertisement-increase-your-market-value/",
    4741: "https://insights.blackcoffer.com/negative-effects-of-marketing-on-society/",
    5202.8: "https://insights.blackcoffer.com/how-advertisement-marketing-affects-business/",
    5664.6: "https://insights.blackcoffer.com/rising-it-cities-will-impact-the-economy-environment-infrastructure-and-city-life-by-the-year-2035/",
    6126.4: "https://insights.blackcoffer.com/rise-of-ott-platform-and-its-impact-on-entertainment-industry-by-the-year-2030/",
    6588.2: "https://insights.blackcoffer.com/rise-of-electric-vehicles-and-its-impact-on-livelihood-by-2040/",
    7050: "https://insights.blackcoffer.com/rise-of-electric-vehicle-and-its-impact-on-livelihood-by-the-year-2040/",
    7511.8: "https://insights.blackcoffer.com/oil-prices-by-the-year-2040-and-how-it-will-impact-the-world-economy/",
    7973.6:"https://insights.blackcoffer.com/an-outlook-of-healthcare-by-the-year-2040-and-how-it-will-impact-human-lives/",
8435.4:"https://insights.blackcoffer.com/ai-in-healthcare-to-improve-patient-outcomes/",
8897.2:"https://insights.blackcoffer.com/what-if-the-creation-is-taking-over-the-creator/",
9359:"https://insights.blackcoffer.com/what-jobs-will-robots-take-from-humans-in-the-future/",
9820.8:"https://insights.blackcoffer.com/will-machine-replace-the-human-in-the-future-of-work/",
10282.6:"https://insights.blackcoffer.com/will-ai-replace-us-or-work-with-us/",
10744.4:"https://insights.blackcoffer.com/man-and-machines-together-machines-are-more-diligent-than-humans-blackcoffe/",
11206.2:"https://insights.blackcoffer.com/in-future-or-in-upcoming-years-humans-and-machines-are-going-to-work-together-in-every-field-of-work/",
11668:"https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/",
12129.8:"https://insights.blackcoffer.com/how-machine-learning-will-affect-your-business/",
12591.6:"https://insights.blackcoffer.com/deep-learning-impact-on-areas-of-e-learning/",
13053.4:"https://insights.blackcoffer.com/how-to-protect-future-data-and-its-privacy-blackcoffer/",
13515.2:"https://insights.blackcoffer.com/how-machines-ai-automations-and-robo-human-are-effective-in-finance-and-banking/",
13977:"https://insights.blackcoffer.com/ai-human-robotics-machine-future-planet-blackcoffer-thinking-jobs-workplace/",
14438.8:"https://insights.blackcoffer.com/how-ai-will-change-the-world-blackcoffer/",
14900.6:"https://insights.blackcoffer.com/future-of-work-how-ai-has-entered-the-workplace/",
15362.4:"https://insights.blackcoffer.com/ai-tool-alexa-google-assistant-finance-banking-tool-future/",
15824.2:"https://insights.blackcoffer.com/ai-healthcare-revolution-ml-technology-algorithm-google-analytics-industrialrevolution/",
16286:"https://insights.blackcoffer.com/all-you-need-to-know-about-online-marketing/",
16747.8:"https://insights.blackcoffer.com/evolution-of-advertising-industry/",
17209.6:"https://insights.blackcoffer.com/how-data-analytics-can-help-your-business-respond-to-the-impact-of-covid-19/",
17671.4:"https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/",
18133.2:"https://insights.blackcoffer.com/environmental-impact-of-the-covid-19-pandemic-lesson-for-the-future/",
18595:"https://insights.blackcoffer.com/how-data-analytics-and-ai-are-used-to-halt-the-covid-19-pandemic/",
19056.8:"https://insights.blackcoffer.com/difference-between-artificial-intelligence-machine-learning-statistics-and-data-mining/",
19518.6:"https://insights.blackcoffer.com/how-python-became-the-first-choice-for-data-science/",
19980.4:"https://insights.blackcoffer.com/how-google-fit-measure-heart-and-respiratory-rates-using-a-phone/",
20442.2:"https://insights.blackcoffer.com/what-is-the-future-of-mobile-apps/",
20904:"https://insights.blackcoffer.com/impact-of-ai-in-health-and-medicine/",
21365.8:"https://insights.blackcoffer.com/telemedicine-what-patients-like-and-dislike-about-it/",
21827.6:"https://insights.blackcoffer.com/how-we-forecast-future-technologies/",
22289.4:"https://insights.blackcoffer.com/can-robots-tackle-late-life-loneliness/",
22751.2:"https://insights.blackcoffer.com/embedding-care-robots-into-society-socio-technical-considerations/",
23213:"https://insights.blackcoffer.com/management-challenges-for-future-digitalization-of-healthcare-services/",
23674.8:"https://insights.blackcoffer.com/are-we-any-closer-to-preventing-a-nuclear-holocaust/",
24136.6:"https://insights.blackcoffer.com/will-technology-eliminate-the-need-for-animal-testing-in-drug-development/",
24598.4:"https://insights.blackcoffer.com/will-we-ever-understand-the-nature-of-consciousness/",
25060.2:"https://insights.blackcoffer.com/will-we-ever-colonize-outer-space/",
25522:"https://insights.blackcoffer.com/what-is-the-chance-homo-sapiens-will-survive-for-the-next-500-years/",
25983.8:"https://insights.blackcoffer.com/why-does-your-business-need-a-chatbot/",
26445.6:"https://insights.blackcoffer.com/how-you-lead-a-project-or-a-team-without-any-technical-expertise/",
26907.4:"https://insights.blackcoffer.com/can-you-be-great-leader-without-technical-expertise/",
27369.2:"https://insights.blackcoffer.com/how-does-artificial-intelligence-affect-the-environment/",
27831:"https://insights.blackcoffer.com/how-to-overcome-your-fear-of-making-mistakes-2/",
28292.8:"https://insights.blackcoffer.com/is-perfection-the-greatest-enemy-of-productivity/",
28754.6:"https://insights.blackcoffer.com/global-financial-crisis-2008-causes-effects-and-its-solution/",
29216.4:"https://insights.blackcoffer.com/gender-diversity-and-equality-in-the-tech-industry/",
29678.2:"https://insights.blackcoffer.com/how-to-overcome-your-fear-of-making-mistakes/",
30140:"https://insights.blackcoffer.com/how-small-business-can-survive-the-coronavirus-crisis/",
30601.8:"https://insights.blackcoffer.com/impacts-of-covid-19-on-vegetable-vendors-and-food-stalls/",
31063.6:"https://insights.blackcoffer.com/impacts-of-covid-19-on-vegetable-vendors/",
31525.4:"https://insights.blackcoffer.com/impact-of-covid-19-pandemic-on-tourism-aviation-industries/",
31987.2:"https://insights.blackcoffer.com/impact-of-covid-19-pandemic-on-sports-events-around-the-world/",
32449:"https://insights.blackcoffer.com/changing-landscape-and-emerging-trends-in-the-indian-it-ites-industry/",
32910.8:"https://insights.blackcoffer.com/online-gaming-adolescent-online-gaming-effects-demotivated-depression-musculoskeletal-and-psychosomatic-symptoms/",
33372.6:"https://insights.blackcoffer.com/human-rights-outlook/",
33834.4:"https://insights.blackcoffer.com/how-voice-search-makes-your-business-a-successful-business/",
34296.2:"https://insights.blackcoffer.com/how-the-covid-19-crisis-is-redefining-jobs-and-services/",
34758:"https://insights.blackcoffer.com/how-to-increase-social-media-engagement-for-marketers/",
35219.8:"https://insights.blackcoffer.com/impacts-of-covid-19-on-streets-sides-food-stalls/",
35681.6:"https://insights.blackcoffer.com/coronavirus-impact-on-energy-markets-2/",
36143.4:"https://insights.blackcoffer.com/coronavirus-impact-on-the-hospitality-industry-5/",
36605.2:"https://insights.blackcoffer.com/lessons-from-the-past-some-key-learnings-relevant-to-the-coronavirus-crisis-4/",
37067:"https://insights.blackcoffer.com/estimating-the-impact-of-covid-19-on-the-world-of-work-2/",
37528.8:"https://insights.blackcoffer.com/estimating-the-impact-of-covid-19-on-the-world-of-work-3/",
37990.6:"https://insights.blackcoffer.com/travel-and-tourism-outlook/",
38452.4:"https://insights.blackcoffer.com/gaming-disorder-and-effects-of-gaming-on-health/",
38914.2:"https://insights.blackcoffer.com/what-is-the-repercussion-of-the-environment-due-to-the-covid-19-pandemic-situation/",
39376:"https://insights.blackcoffer.com/what-is-the-repercussion-of-the-environment-due-to-the-covid-19-pandemic-situation-2/",
39837.8:"https://insights.blackcoffer.com/impact-of-covid-19-pandemic-on-office-space-and-co-working-industries/",
40299.6:"https://insights.blackcoffer.com/contribution-of-handicrafts-visual-arts-literature-in-the-indian-economy/",
40761.4:"https://insights.blackcoffer.com/how-covid-19-is-impacting-payment-preferences/",
41223.2:"https://insights.blackcoffer.com/how-will-covid-19-affect-the-world-of-work-2/",
41685:"https://insights.blackcoffer.com/lessons-from-the-past-some-key-learnings-relevant-to-the-coronavirus-crisis/",
42146.8:"https://insights.blackcoffer.com/covid-19-how-have-countries-been-responding/",
42608.6:"https://insights.blackcoffer.com/coronavirus-impact-on-the-hospitality-industry-2/",
43070.4:"https://insights.blackcoffer.com/how-will-covid-19-affect-the-world-of-work-3/",
43532.2:"https://insights.blackcoffer.com/coronavirus-impact-on-the-hospitality-industry-3/",
43994:"https://insights.blackcoffer.com/estimating-the-impact-of-covid-19-on-the-world-of-work/",
44455.8:"https://insights.blackcoffer.com/covid-19-how-have-countries-been-responding-2/",
44917.6:"https://insights.blackcoffer.com/how-will-covid-19-affect-the-world-of-work-4/",
45379.4:"https://insights.blackcoffer.com/lessons-from-the-past-some-key-learnings-relevant-to-the-coronavirus-crisis-2/",
45841.2:"https://insights.blackcoffer.com/lessons-from-the-past-some-key-learnings-relevant-to-the-coronavirus-crisis-3/",
46303:"https://insights.blackcoffer.com/coronavirus-impact-on-the-hospitality-industry-4/",
46764.8:"https://insights.blackcoffer.com/why-scams-like-nirav-modi-happen-with-indian-banks/",
47226.6:"https://insights.blackcoffer.com/impact-of-covid-19-on-the-global-economy/",
47688.4:"https://insights.blackcoffer.com/impact-of-covid-19coronavirus-on-the-indian-economy-2/",
48150.2:"https://insights.blackcoffer.com/impact-of-covid-19-on-the-global-economy-2/",
48612:"https://insights.blackcoffer.com/impact-of-covid-19-coronavirus-on-the-indian-economy-3/",
49073.8:"https://insights.blackcoffer.com/should-celebrities-be-allowed-to-join-politics/",
49535.6:"https://insights.blackcoffer.com/how-prepared-is-india-to-tackle-a-possible-covid-19-outbreak/",
49997.4:"https://insights.blackcoffer.com/how-will-covid-19-affect-the-world-of-work/",
50459.2:"https://insights.blackcoffer.com/controversy-as-a-marketing-strategy/",
50921:"https://insights.blackcoffer.com/coronavirus-impact-on-the-hospitality-industry/",
51382.8:"https://insights.blackcoffer.com/coronavirus-impact-on-energy-markets/",
51844.6:"https://insights.blackcoffer.com/what-are-the-key-policies-that-will-mitigate-the-impacts-of-covid-19-on-the-world-of-work/",
52306.4:"https://insights.blackcoffer.com/marketing-drives-results-with-a-focus-on-problems/",
52768.2:"https://insights.blackcoffer.com/continued-demand-for-sustainability/",
}



In [29]:
# output directory
output_directory = "output"
# exist_ok=True creates the directory only when missing, in a single call,
# instead of a separate existence check followed by creation.
os.makedirs(output_directory, exist_ok=True)

# URL_IDs for which nothing was written, reported once the loop finishes.
skipped_url_ids = []

for url_id, url in url_data.items():
    # Extract article text from the URL
    title, article_text = extract_article_text(url)

    # The extractor returns (None, None) on error; an empty title or an empty
    # body is likewise treated as "nothing to save". Report it so a missing
    # output file is never silent.
    if not (title and article_text):
        skipped_url_ids.append(url_id)
        print(f"Skipped {url_id}: no title or article text extracted from {url}")
        continue

    # Create the file name using URL_ID
    filename = os.path.join(output_directory, f"{url_id}.txt")

    # Saving the extracted article to a text file: title, blank line, body.
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(title + '\n\n')
        file.write(article_text)
    print(f"Saved {filename}")

print("Extraction and saving completed.")
if skipped_url_ids:
    print(f"{len(skipped_url_ids)} URL(s) produced no output file: {skipped_url_ids}")


Saved output\123.txt
Saved output\321.txt
Saved output\2345.txt
Saved output\4321.txt
Saved output\432.txt
Saved output\2893.8.txt
Saved output\3355.6.txt
Saved output\3817.4.txt
Saved output\4279.2.txt
Saved output\4741.txt
Saved output\5202.8.txt
Saved output\5664.6.txt
Saved output\6126.4.txt
Saved output\6588.2.txt
Saved output\7050.txt
Saved output\7511.8.txt
Saved output\7973.6.txt
Saved output\8435.4.txt
Saved output\8897.2.txt
Saved output\9359.txt
Saved output\9820.8.txt
Saved output\10282.6.txt
Saved output\10744.4.txt
Saved output\11206.2.txt
Saved output\11668.txt
Saved output\12129.8.txt
Saved output\12591.6.txt
Saved output\13053.4.txt
Saved output\13515.2.txt
Saved output\13977.txt
Saved output\14438.8.txt
Saved output\14900.6.txt
Saved output\15362.4.txt
Saved output\15824.2.txt
Saved output\16286.txt
Saved output\16747.8.txt
Saved output\17209.6.txt
Saved output\17671.4.txt
Saved output\18133.2.txt
Saved output\18595.txt
Saved output\19056.8.txt
Saved output\19518.6.tx

In [20]:
!pip install textstat

Collecting textstat
  Downloading textstat-0.7.3-py3-none-any.whl (105 kB)
     ------------------------------------ 105.1/105.1 kB 608.2 kB/s eta 0:00:00
Collecting pyphen
  Downloading pyphen-0.14.0-py3-none-any.whl (2.0 MB)
     ---------------------------------------- 2.0/2.0 MB 728.7 kB/s eta 0:00:00
Installing collected packages: pyphen, textstat
Successfully installed pyphen-0.14.0 textstat-0.7.3


In [11]:
import os
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords


# Fetch the NLTK resources used below: 'punkt' for word_tokenize and
# 'vader_lexicon' for SentimentIntensityAnalyzer. As the captured output of
# this cell shows, a failed download is only logged and execution continues.
nltk.download('punkt')
nltk.download('vader_lexicon')


stop_words_files = [
    "StopWords_Auditor.txt",
    "StopWords_Currencies.txt",
    "StopWords_DatesandNumbers.txt",
    "StopWords_Generic.txt",
    "StopWords_GenericLong.txt",
    "StopWords_Geographic.txt",
    "StopWords_Names.txt"
]

# Build the stop-word set in lower case. Tokens are compared below as
# word.lower(), so entries must be lower-cased too or any capitalised entry
# in the files would never match.
# NOTE(review): each non-blank line is taken whole; if the files carry
# trailing annotations after the word, those entries will not match — verify
# the file format.
stop_words = set()
for stop_words_file in stop_words_files:
    with open(os.path.join("StopWords", stop_words_file), 'r', encoding='latin-1') as file:
        stop_words.update(
            line.strip().lower()
            for line in file.read().splitlines()
            if line.strip()
        )

# Initialize SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

# list to store sentiment analysis results (one dict per article file)
sentiment_results = []

output_directory = "output"

# sorted() gives a stable row order; os.listdir returns entries in arbitrary order.
for filename in sorted(os.listdir(output_directory)):
    if not filename.endswith(".txt"):
        continue
    filepath = os.path.join(output_directory, filename)

    with open(filepath, 'r', encoding='utf-8') as file:
        text = file.read()

    # Tokenize the text
    words = word_tokenize(text)

    # Remove stop words
    words = [word for word in words if word.lower() not in stop_words]

    # sentiment analysis on the full, unfiltered text
    sentiment = sia.polarity_scores(text)

    positive_score = sentiment['pos']
    negative_score = sentiment['neg']
    # The 0.000001 term keeps both divisions defined when a denominator is 0.
    polarity_score = (positive_score - negative_score) / (positive_score + negative_score + 0.000001)
    # NOTE(review): VADER's 'pos'/'neg' are proportions of the text, not word
    # counts, so dividing their sum by the token count yields very small
    # values — confirm this is the intended definition of subjectivity.
    subjectivity_score = (positive_score + negative_score) / (len(words) + 0.000001)

    # Store the sentiment analysis results in a list
    sentiment_results.append({
        'File Name': filename,
        'Positive Score': positive_score,
        'Negative Score': negative_score,
        'Polarity Score': polarity_score,
        'Subjectivity Score': subjectivity_score
    })

# Creating a DataFrame from the sentiment results list
df = pd.DataFrame(sentiment_results)

# Saving the DataFrame to an Excel file
output_file = "sentiment_analysis_results.xlsx"
df.to_excel(output_file, index=False)

print(f"Sentiment analysis results saved to {output_file}")



[nltk_data] Error loading punkt: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>
[nltk_data] Error loading vader_lexicon: <urlopen error [WinError
[nltk_data]     10060] A connection attempt failed because the
[nltk_data]     connected party did not properly respond after a
[nltk_data]     period of time, or established connection failed
[nltk_data]     because connected host has failed to respond>


Sentiment analysis results saved to sentiment_analysis_results.xlsx


In [24]:


# Fetch the NLTK resources used below; a failed download is only logged
# (see this cell's captured output) and execution continues.
nltk.download('punkt')
nltk.download('vader_lexicon')


stop_words_files = [
    "StopWords_Auditor.txt",
    "StopWords_Currencies.txt",
    "StopWords_DatesandNumbers.txt",
    "StopWords_Generic.txt",
    "StopWords_GenericLong.txt",
    "StopWords_Geographic.txt",
    "StopWords_Names.txt"
]

# Build the stop-word set in lower case: tokens are compared below as
# word.lower(), so capitalised entries would otherwise never match.
stop_words = set()
for stop_words_file in stop_words_files:
    with open(os.path.join("StopWords", stop_words_file), 'r', encoding='latin-1') as file:
        stop_words.update(
            line.strip().lower()
            for line in file.read().splitlines()
            if line.strip()
        )

# Initialize SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

# Create a list to store sentiment analysis and readability results
results = []


output_directory = "output"


def count_syllables(word):
    """Estimate the number of syllables in ``word``.

    A trailing "ed" and then a trailing "es" are removed, after which every
    run of consecutive vowels (a, e, i, o, u, y) counts as one syllable.

    Args:
        word: A single word.

    Returns:
        The estimated syllable count, never less than 1.
    """
    word = re.sub(r'ed$', '', word)
    word = re.sub(r'es$', '', word)
    vowels = "aeiouyAEIOUY"
    syllables = 0
    # Tracked as a boolean: testing an empty "previous character" with
    # `'' not in vowels` is always False, which would skip a leading vowel.
    previous_is_vowel = False
    for char in word:
        is_vowel = char in vowels
        if is_vowel and not previous_is_vowel:
            syllables += 1
        previous_is_vowel = is_vowel
    # Words left without any vowel group (e.g. "red" -> "r") still count as one.
    if syllables == 0:
        syllables = 1
    return syllables


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# Matches the listed pronouns in any letter case; the all-caps "US" is
# filtered out afterwards.
_PRONOUN_PATTERN = re.compile(r'\b(?:I|we|my|ours|us)\b', re.IGNORECASE)

# sorted() gives a stable row order; os.listdir returns entries in arbitrary order.
for filename in sorted(os.listdir(output_directory)):
    if not filename.endswith(".txt"):
        continue
    filepath = os.path.join(output_directory, filename)

    with open(filepath, 'r', encoding='utf-8') as file:
        text = file.read()

    # Tokenize the text
    words = word_tokenize(text)
    sentences = sent_tokenize(text)

    # Remove stop words and punctuation
    cleaned_words = [word.lower() for word in words if word.lower() not in stop_words and word.isalpha()]

    # Calculate readability metrics
    # NOTE(review): this average uses every token, punctuation included,
    # whereas the per-sentence word count below uses cleaned words — confirm.
    average_sentence_length = _safe_ratio(len(words), len(sentences))
    # A word is "complex" when it has more than two syllables, not more than
    # two characters.
    syllables_per_word = [count_syllables(word) for word in cleaned_words]
    complex_word_count = sum(1 for count in syllables_per_word if count > 2)
    percentage_complex_words = _safe_ratio(complex_word_count, len(cleaned_words))
    # NOTE(review): percentage_complex_words is a 0-1 fraction here; the
    # classic Gunning fog formula uses a 0-100 percentage — confirm which
    # definition is required.
    fog_index = 0.4 * (average_sentence_length + percentage_complex_words)

    # Calculate additional readability metrics
    average_word_count_per_sentence = _safe_ratio(len(cleaned_words), len(sentences))

    # Average syllables per cleaned word (the column is "per word", so the
    # total is divided by the word count).
    syllable_count_per_word = _safe_ratio(sum(syllables_per_word), len(cleaned_words))

    # Count personal pronouns, ignoring the abbreviation "US".
    personal_pronoun_count = sum(
        1 for match in _PRONOUN_PATTERN.findall(text) if match != 'US'
    )

    # Calculate average word length
    average_word_length = _safe_ratio(sum(len(word) for word in cleaned_words), len(cleaned_words))

    # Append results to the list
    results.append({
        'File Name': filename,
        'Average Sentence Length': average_sentence_length,
        'Complex Word Count': complex_word_count,
        'Percentage of Complex Words': percentage_complex_words,
        'Fog Index': fog_index,
        'Average Number of Words Per Sentence': average_word_count_per_sentence,
        'Syllable Count Per Word': syllable_count_per_word,
        'Personal Pronoun Count': personal_pronoun_count,
        'Average Word Length': average_word_length
    })

df = pd.DataFrame(results)


existing_df = pd.read_excel("sentiment_analysis_results.xlsx")

# Drop readability columns left by an earlier run of this cell, so re-running
# it replaces them instead of producing duplicated "_x"/"_y" columns.
stale_columns = [
    column for column in df.columns
    if column != 'File Name' and column in existing_df.columns
]
existing_df = existing_df.drop(columns=stale_columns)

# Merge the sentiment analysis and readability results on 'File Name'
merged_df = existing_df.merge(df, on='File Name')

# Save the merged DataFrame to an Excel file
merged_df.to_excel("sentiment_analysis_results.xlsx", index=False)

print("Readability analysis results appended to sentiment_analysis_results.xlsx")


[nltk_data] Error loading punkt: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>
[nltk_data] Error loading vader_lexicon: <urlopen error [WinError
[nltk_data]     10060] A connection attempt failed because the
[nltk_data]     connected party did not properly respond after a
[nltk_data]     period of time, or established connection failed
[nltk_data]     because connected host has failed to respond>


Readability analysis results appended to sentiment_analysis_results.xlsx
