In [1]:
import os
import random
import re
import warnings

import pandas as pd
import requests
import syllables
from bs4 import BeautifulSoup
from nltk.corpus import stopwords as nltk_sw

warnings.filterwarnings("ignore")

In [2]:
df = pd.read_excel("Input.xlsx")
urls = list(df["URL"])
urls[:10], len(urls)

(['https://insights.blackcoffer.com/rising-it-cities-and-its-impact-on-the-economy-environment-infrastructure-and-city-life-by-the-year-2040-2/',
  'https://insights.blackcoffer.com/rising-it-cities-and-their-impact-on-the-economy-environment-infrastructure-and-city-life-in-future/',
  'https://insights.blackcoffer.com/internet-demands-evolution-communication-impact-and-2035s-alternative-pathways/',
  'https://insights.blackcoffer.com/rise-of-cybercrime-and-its-effect-in-upcoming-future/',
  'https://insights.blackcoffer.com/ott-platform-and-its-impact-on-the-entertainment-industry-in-future/',
  'https://insights.blackcoffer.com/the-rise-of-the-ott-platform-and-its-impact-on-the-entertainment-industry-by-2040/',
  'https://insights.blackcoffer.com/rise-of-cyber-crime-and-its-effects/',
  'https://insights.blackcoffer.com/rise-of-internet-demand-and-its-impact-on-communications-and-alternatives-by-the-year-2035-2/',
  'https://insights.blackcoffer.com/rise-of-cybercrime-and-its-effect-

# fetch_web_data

In [3]:
def fetch_web_data(url):
    class_ = ["td-post-content tagdiv-type", "tdb-block-inner td-fix-index"]
    doc = requests.get(url)
    soup = BeautifulSoup(doc.content, "html.parser")
    title = soup.find("h1")
    article = soup.find_all("div", {"class": class_[0]})
    if article:
        res = " "
        for tag in article:
            res += tag.text.strip()
    else:
        article = soup.find_all("div", {"class": class_[1]})
        res = " "
        for tag in article:
            res += tag.text.strip()
    try:
        start = res.index("Introduction")
        stop = res.index("Blackcoffer Insights")
    except:
        start = 0
        stop = -1
    return title.text + "\n" + res[start:stop]


fetch_web_data(random.choice(urls))[:500]

'Will technology eliminate the need for animal testing in drug development?\n Imagine yourself as a being who can speak but no one can comprehend you. You are kept in captivity, in a place where you can hardly breathe. You are finally selected and taken out of that place, it is a happy moment for you. You can finally have your freedom but sadly instead of being free, you are prodded by rods and fed toxic substances against your will. The only thing you can do is scream in vain. Your voice is heard'

# Stop Words

In [4]:
def get_stop_words():
    StopWords_notNames = []
    for file in os.listdir("StopWords/"):
        if file != "StopWords_Names.txt":
            corpus = open(f"StopWords/{file}", "r").read().strip(" ").split("\n")
            res = []
            for txt in corpus:
                if "|" in txt:
                    res.extend(txt.replace(" | ", ",").replace(" ", "").split(","))
            if res != []:
                StopWords_notNames.extend(res)

    StopWords_Names = []
    for file in os.listdir("StopWords/"):
        if file == "StopWords_Names.txt":
            corpus = open(f"StopWords/{file}", "r").read().strip(" ").split("\n")
            for txt in corpus:
                if "|" in txt:
                    res = txt.replace(" | ", ",").replace(" ", "").split(",")
                    if res != None:
                        StopWords_Names.append(res[0])

    stop_words = []
    for file in os.listdir("StopWords/"):
        corpus = open(f"StopWords/{file}", "r").read().strip().split("\n")
        res = []
        for txt in corpus:
            if "|" in txt:
                txt = txt.replace(txt, txt.split("|")[0])
                res.append(txt.strip())
        if res != []:
            stop_words.extend(res)
        stop_words.extend([txt for txt in corpus if "|" not in txt])

    stop_words.extend(StopWords_notNames)
    stop_words.extend(StopWords_Names)
    return stop_words


get_stop_words()[:10]

['ERNST',
 'YOUNG',
 'DELOITTE',
 'TOUCHE',
 'KPMG',
 'PRICEWATERHOUSECOOPERS',
 'PRICEWATERHOUSE',
 'COOPERS',
 'AFGHANI',
 'ARIARY']

In [5]:
def clean_stop_words(text, personalwords=True):
    stop_words=get_stop_words()
    if personalwords == True:
        stop_words.extend(nltk_sw.words("english"))
    words = text.split()
    cleaned_words = [word for word in words if word.lower() not in stop_words]
    cleaned_text = " ".join(cleaned_words)
    cleaned_text = " ".join(re.findall("[a-zA-Z.]+", cleaned_text))
    return cleaned_text


clean_stop_words(fetch_web_data(random.choice(urls)))[:500]

'Coronavirus impact energy markets coronavirus COVID global public health emergency source significant regional increasingly global economic disruption. impacts energy climate world ways. economic downturn puts pressure global oil prices leading Organization Petroleum Exporting Countries OPEC cuts production. hurts demand natural gas time extremely low prices. economic energy climate policymaking environment China consequential energy consumers sources greenhouse gas emissions. temporarily disrup'

# Scores

In [8]:
def get_scores(text):
    def get_subjectivity_score(text, positive_score, negative_score):
   
        # Clean and split text into words
        cleaned_words = [word.strip().lower() for word in text.split() if word.isalpha()]
        total_words = len(cleaned_words)
        
        # Avoid division by zero by adding a small constant (0.000001)
        subjectivity_score = (positive_score + negative_score) / (total_words + 0.000001)
        
        return subjectivity_score


    def get_polarity_score(text):
        positive_words = (
            open("MasterDictionary/positive-words.txt", "r").read().split("\n")
        )
        negative_words = (
            open("MasterDictionary/negative-words.txt", "r").read().split("\n")
        )

        positive_score = 0
        negative_score = 0

        for word in text.split():
            if word.lower() in positive_words:
                positive_score += 1
            elif word.lower() in negative_words:
                negative_score += 1

        polarity_score = (positive_score - negative_score) / (
            (positive_score + negative_score) + 0.000001
        )
        return polarity_score, positive_score, negative_score

    
    polarity_score, positive_score, negative_score = get_polarity_score(text)
    subjectivity_score = get_subjectivity_score(text, positive_score, negative_score)
    
    return positive_score, negative_score, polarity_score, subjectivity_score


get_scores(clean_stop_words(fetch_web_data(random.choice(urls))))

(35, 42, -0.09090908972845338, 0.1574642123569239)

# Analysis_of_readability

In [9]:
def Analysis_of_readability(fetched_article):
    sentences = fetched_article.replace(" ", "").split(".")
    tokens = fetched_article.split(" ")
    total_num_of_sentences = len(sentences)
    total_num_of_words = len(tokens)

    num_complex_words = 0
    for token in sentences:
        if syllables.estimate(token) > 2:
            num_complex_words += 1

    Average_Sentence_Length = total_num_of_words / total_num_of_sentences
    Percentage_of_Complex_words = num_complex_words / total_num_of_words
    Fog_Index = 0.4 * (Average_Sentence_Length + Percentage_of_Complex_words)

    Average_Number_of_Words_Per_Sentence = total_num_of_words / total_num_of_sentences

    total_syllables = sum(syllables.estimate(word) for word in sentences)
    SYLLABLE_PER_WORD = total_syllables / total_num_of_words
    SYLLABLE_PER_WORD

    return (
        num_complex_words,
        Average_Sentence_Length,
        Percentage_of_Complex_words,
        Fog_Index,
        Average_Number_of_Words_Per_Sentence,
        SYLLABLE_PER_WORD,
    )


Analysis_of_readability(clean_stop_words(fetch_web_data(random.choice(urls))))

(55,
 7.418181818181818,
 0.13480392156862744,
 3.0211942959001785,
 7.418181818181818,
 2.0)

# get_personal_pronouns

In [10]:
def get_personal_pronouns(tokens):
    personal_pronouns = [
        "I",
        "me",
        "my",
        "mine",
        "you",
        "your",
        "yours",
        "he",
        "him",
        "his",
        "she",
        "her",
        "hers",
        "it",
        "its",
        "we",
        "us",
        "our",
        "ours",
        "they",
        "them",
        "their",
        "theirs",
    ]
    num_personal_pronouns = sum(
        [1 for word in tokens if word.lower() in personal_pronouns]
    )

    total_chars = sum(len(word) for word in tokens)
    avg_word_length = total_chars / len(tokens)

    return num_personal_pronouns, avg_word_length


corpus = clean_stop_words(fetch_web_data(random.choice(urls)),personalwords=False)

res = re.findall("[A-Za-z]+", corpus)
get_personal_pronouns(res)

(0, 6.829317269076305)

<!-- # 1. All input variables in “Input.xlsx” - columns
# 2. POSITIVE SCORE - pos_score
# 3. NEGATIVE SCORE - neg_score
# 4. POLARITY SCORE - Polarity_Score
# 5. SUBJECTIVITY SCORE - Subjectivity_Score
# 6. AVG SENTENCE LENGTH - Average_Sentence_Length
# 7. PERCENTAGE OF COMPLEX WORDS - Percentage_of_Complex_words
# 8. FOG INDEX - Fog_Index
# 9. AVG NUMBER OF WORDS PER SENTENCE - Average_Number_of_Words_Per_Sentence
# 10. COMPLEX WORD COUNT - num_complex_words
# 11. WORD COUNT - total_num_of_words
# 12. SYLLABLE PER WORD - SYLLABLE_PER_WORD
# 13. PERSONAL PRONOUNS - num_personal_pronouns
# 14. AVG WORD LENGTH - avg_word_length -->

# Main

In [11]:
id_r = []
url_r = []
pos_score_r = []
neg_score_r = []
Polarity_Score_r = []
Polarity_Score_r = []
Subjectivity_Score_r = []
Average_Sentence_Length_r = []
Percentage_of_Complex_words_r = []
Fog_Index_r = []
Average_Number_of_Words_Per_Sentence_r = []
num_complex_words_r = []
total_num_of_words_r = []
SYLLABLE_PER_WORD_r = []
num_personal_pronouns_r = []
avg_word_length_r = []


# Iterating through URLS
for n in range(len(urls)):
    try:
        # fetch_web_data
        fetched_article = fetch_web_data(urls[n])
    except:
        print(f"Page {urls[n]} Not Found....!")
        continue
    index = df.iloc[n]
    id_ = index[0]
    url_ = index[1]

    # clean_stop_words
    tokens = clean_stop_words(fetched_article)
    total_num_of_words = len(tokens)

    pos_score, neg_score, Polarity_Score, Subjectivity_Score = get_scores(tokens)

    (
        num_complex_words,
        Average_Sentence_Length,
        Percentage_of_Complex_words,
        Fog_Index,
        Average_Number_of_Words_Per_Sentence,
        SYLLABLE_PER_WORD,
    ) = Analysis_of_readability(fetched_article)

    tmp=clean_stop_words(fetched_article,personalwords=False)
    res = re.findall("[A-Za-z]+", tmp)
    num_personal_pronouns, avg_word_length = get_personal_pronouns(res)

    # Appending obtained variables into respective lists
    id_r.append(id_)
    url_r.append(url_)
    pos_score_r.append(pos_score)
    neg_score_r.append(neg_score)
    Polarity_Score_r.append(Polarity_Score)
    Subjectivity_Score_r.append(Subjectivity_Score)
    Average_Sentence_Length_r.append(Average_Sentence_Length)
    Percentage_of_Complex_words_r.append(Percentage_of_Complex_words)
    Fog_Index_r.append(Fog_Index)
    Average_Number_of_Words_Per_Sentence_r.append(Average_Number_of_Words_Per_Sentence)
    num_complex_words_r.append(num_complex_words)
    total_num_of_words_r.append(total_num_of_words)
    SYLLABLE_PER_WORD_r.append(SYLLABLE_PER_WORD)
    num_personal_pronouns_r.append(num_personal_pronouns)
    avg_word_length_r.append(avg_word_length)


output = {
    "URL_ID": id_r,
    "URL": url_r,
    "POSITIVE SCORE": pos_score_r,
    "NEGATIVE SCORE": neg_score_r,
    "POLARITY SCORE": Polarity_Score_r,
    "SUBJECTIVITY SCORE": Subjectivity_Score_r,
    "AVG SENTENCE LENGTH": Average_Sentence_Length_r,
    "PERCENTAGE OF COMPLEX WORDS": Percentage_of_Complex_words_r,
    "FOG INDEX": Fog_Index_r,
    "AVG NUMBER OF WORDS PER SENTENCE": Average_Number_of_Words_Per_Sentence_r,
    " COMPLEX WORD COUNT": num_complex_words_r,
    "WORD COUNT ": total_num_of_words_r,
    "SYLLABLE PER WORD ": SYLLABLE_PER_WORD_r,
    "PERSONAL PRONOUNS": num_personal_pronouns_r,
    "AVG WORD LENGTH": avg_word_length_r,
}

output_df = pd.DataFrame(output).set_index("URL_ID")
output_df

Page https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/ Not Found....!
Page https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/ Not Found....!


Unnamed: 0_level_0,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
URL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,31,6,0.675676,0.074597,15.150000,0.065182,6.086073,15.150000,79,4393,1.599010,0,6.597884
blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,58,25,0.397590,0.112772,17.792683,0.056203,7.139554,17.792683,82,6884,1.904044,2,7.213253
blackassign0003,https://insights.blackcoffer.com/internet-dema...,38,22,0.266667,0.102740,18.368421,0.053486,7.368763,18.368421,56,5895,2.154728,1,8.000000
blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,34,68,-0.333333,0.178634,19.961538,0.050096,8.004654,19.961538,52,5639,2.089595,0,7.884127
blackassign0005,https://insights.blackcoffer.com/ott-platform-...,23,8,0.483871,0.089595,16.536585,0.060472,6.638823,16.536585,41,3258,1.840708,1,7.100251
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
blackassign0096,https://insights.blackcoffer.com/what-is-the-r...,28,51,-0.291139,0.143898,21.150943,0.045495,8.478575,21.150943,51,4891,1.796610,1,7.018182
blackassign0097,https://insights.blackcoffer.com/impact-of-cov...,22,32,-0.185185,0.125874,26.875000,0.036279,10.764512,26.875000,39,3587,1.560000,1,6.420833
blackassign0098,https://insights.blackcoffer.com/contribution-...,5,3,0.250000,0.036866,15.480000,0.056848,6.214739,15.480000,22,2005,1.837209,0,6.936255
blackassign0099,https://insights.blackcoffer.com/how-covid-19-...,16,2,0.777778,0.058252,18.285714,0.054688,7.336161,18.285714,35,2620,1.684375,3,6.375000


# Exporting the output_df to excel

In [12]:
output_df.to_excel("Output Data.xlsx")