## Import required libraries 

In [1]:
import pandas as pd
import requests
import os
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
import syllables
import warnings
warnings.filterwarnings('ignore')

In [2]:
cwd = os.getcwd()
cwd

'C:\\Users\\adity\\OneDrive\\Desktop\\analysis'

## Process the data from Input.csv

In [3]:
df = pd.read_excel("Input.xlsx")
df.head()

Unnamed: 0,URL_ID,URL
0,37,https://insights.blackcoffer.com/ai-in-healthc...
1,38,https://insights.blackcoffer.com/what-if-the-c...
2,39,https://insights.blackcoffer.com/what-jobs-wil...
3,40,https://insights.blackcoffer.com/will-machine-...
4,41,https://insights.blackcoffer.com/will-ai-repla...


In [4]:
df.URL[0]

'https://insights.blackcoffer.com/ai-in-healthcare-to-improve-patient-outcomes/'

In [5]:
df.URL_ID.dtypes

dtype('int64')

# Fetch the stop words from stop_word.txt and store them into a set

In [6]:
stop_words = set()

stop_word_dir = '.\\stop_words'

for filename in os.listdir(stop_word_dir):
    file_path = os.path.join(stop_word_dir, filename)
    if os.path.isfile(file_path) and filename.endswith('.txt'):
        with open(file_path, 'r') as f:
            words = f.read().split()
            print(words)
            lowercase_words = []
            for word in words:
                lowercase_words.append(word.casefold())
            stop_words.update(lowercase_words)

print(stop_words)

['ERNST', 'YOUNG', 'DELOITTE', 'TOUCHE', 'KPMG', 'PRICEWATERHOUSECOOPERS', 'PRICEWATERHOUSE', 'COOPERS']
['AFGHANI', '|', 'Afghanistan', 'ARIARY', '|', 'Madagascar', 'BAHT', '|', 'Thailand', 'BALBOA', '|', 'Panama', 'BIRR', '|', 'Ethiopia', 'BOLIVAR', '|', 'Venezuela', 'BOLIVIANO', '|', 'Bolivia', 'CEDI', '|', 'Ghana', 'COLON', '|', 'Costa', 'Rica', 'CÓRDOBA', '|', 'Nicaragua', 'DALASI', '|', 'Gambia', 'DENAR', '|', 'Macedonia', '(Former', 'Yug.', 'Rep.)', 'DINAR', '|', 'Algeria', 'DIRHAM', '|', 'Morocco', 'DOBRA', '|', 'São', 'Tom', 'and', 'Príncipe', 'DONG', '|', 'Vietnam', 'DRAM', '|', 'Armenia', 'ESCUDO', '|', 'Cape', 'Verde', 'EURO', '|', 'Belgium', 'FLORIN', '|', 'Aruba', 'FORINT', '|', 'Hungary', 'GOURDE', '|', 'Haiti', 'GUARANI', '|', 'Paraguay', 'GULDEN', '|', 'Netherlands', 'Antilles', 'HRYVNIA', '|', 'Ukraine', 'KINA', '|', 'Papua', 'New', 'Guinea', 'KIP', '|', 'Laos', 'KONVERTIBILNA', 'MARKA', '|', 'Bosnia-Herzegovina', 'KORUNA', '|', 'Czech', 'Republic', 'KRONA', '|', 'Swe

## Count of vowels in a word

In [7]:
def count_syllables(word):
    return syllables.estimate(word)

## Make a dict of all the pos and neg words and assign scores to them 

In [8]:
with open('positive-words.txt', 'r') as f:
    pos_words = f.read().split()
with open('negative-words.txt', 'r') as f:
    neg_words = f.read().split()
    
custom_lexicon = {}

pos_words = [word for word in pos_words if word.casefold() not in stop_words]
neg_words = [word for word in neg_words if word.casefold() not in stop_words]

for word in pos_words:
    custom_lexicon[word] = 1
    
for word in neg_words:
    custom_lexicon[word] = -1

## Sentiment Analysis and result calculations

In [9]:
def get_results(word_tokens, no_of_words, no_of_sentence, filtered_word_tokens, no_of_filtered_words, no_of_filtered_sentence, complex_words):
    sia = SentimentIntensityAnalyzer()
    sia.lexicon.update(custom_lexicon)

    pos_score = 0
    neg_score = 0

    for word in filtered_word_tokens:
        sentiment_scores = sia.polarity_scores(word)
        pos_score += sentiment_scores['pos']
        neg_score += sentiment_scores['neg']

    polarity_score = (pos_score-neg_score)/ ((pos_score + neg_score) + 0.000001)

    subjectivity_score = (pos_score + neg_score)/ ((no_of_filtered_words) + 0.000001)

    average_sentence_length = no_of_words/no_of_sentence

    percentage_of_complex_words = len(complex_words)/no_of_words

    fog_index = 0.4 * (average_sentence_length + percentage_of_complex_words)

    average_no_of_words_per_sentence = no_of_words/no_of_sentence
    
    syllable_count = sum([count_syllables(word) for word in word_tokens])
        
    syllable_count_per_word = syllable_count/no_of_words
    
    total_chars = sum([len(word) for word in word_tokens])
    
    avg_word_len = total_chars/no_of_words
    
    personal_pronouns = ['I', 'we', 'my', 'mine', 'our', 'ours', 'us']
    
    count_pronoun = 0
    
    for word in word_tokens:
        if word!='US' and word.lower() in personal_pronouns:
            count_pronoun+=1
    
    return pos_score, neg_score, polarity_score, subjectivity_score, average_sentence_length, percentage_of_complex_words, fog_index, average_no_of_words_per_sentence, len(complex_words), no_of_filtered_words, syllable_count_per_word, avg_word_len, count_pronoun


In [10]:
final_results = pd.DataFrame(columns = ['URL_ID', 'URL', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE', 'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX', 'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT', 'WORD COUNT','SYLLABLE COUNT PER WORD', 'AVG WORD LEN', 'PRONOUN COUNT'])

## Print results

In [11]:
def print_results(URL_ID, URL, pos_score, neg_score, polarity_score, subjectivity_score, average_sentence_length, percentage_of_complex_words, fog_index, average_no_of_words_per_sentence, complex_words_count, no_of_filtered_words, syllable_count_per_word, avg_word_len, count_pronoun):
    print("URL_ID: ", URL_ID)
    print("URL: ", URL)
    print("pos_score: ", pos_score)
    print("neg_score: ", neg_score)
    print("polarity_score: ", polarity_score)
    print("subjectivity_score: ", subjectivity_score)
    print("average_sentence_length: ", average_sentence_length)
    print("percentage_of_complex_words: ", percentage_of_complex_words)
    print("fog_index: ", fog_index)
    print("average_no_of_words_per_sentence: ", average_no_of_words_per_sentence)
    print("complex_words_count: ", complex_words_count)
    print("subjectivity_score: ", subjectivity_score)
    print("word_count: ", no_of_filtered_words)
    print("avg_word_len: ", avg_word_len)
    print("count_pronoun: ", count_pronoun)
    print()

## convert textual data into word tokens

In [12]:

def tokenize(text, URL_ID, URL):
    global final_results
    word_tokens = word_tokenize(text)
    sentence_tokens = sent_tokenize(text)

    no_of_words = len(word_tokens)
    no_of_sentence = len(sentence_tokens)

    filtered_word_tokens = [word for word in word_tokens if not word.lower() in stop_words]
    filtered_sent_tokens = [word for word in sentence_tokens if not word.lower() in stop_words]

    complex_words = [word for word in word_tokens if count_syllables(word) >= 3]

    no_of_filtered_words = len(filtered_word_tokens)
    no_of_filtered_sentence = len(filtered_sent_tokens)

    ps, ns, pos, ss, asl, pocw, fi, aws, cwc, wc, scpw, awl, cp = get_results(word_tokens, no_of_words, no_of_sentence, filtered_word_tokens, no_of_filtered_words, no_of_filtered_sentence, complex_words)
    
    print_results(URL_ID, URL, ps, ns, pos, ss, asl, pocw, fi, aws, cwc, wc, scpw, awl, cp)

    final_results = final_results.append({'URL_ID': URL_ID, 'URL': URL, 'POSITIVE SCORE' : ps, 'NEGATIVE SCORE': ns, 'POLARITY SCORE': pos, 'SUBJECTIVITY SCORE' : ss, 'AVG SENTENCE LENGTH': asl, 'PERCENTAGE OF COMPLEX WORDS': pocw, 'FOG INDEX': fi, 'AVG NUMBER OF WORDS PER SENTENCE': aws, 'COMPLEX WORD COUNT': cwc, 'WORD COUNT': wc, 'SYLLABLE COUNT PER WORD': scpw, 'AVG WORD LEN': awl, 'PRONOUN COUNT': cp} , ignore_index=True)

## Data Scraping using BeautifulSoup

In [13]:
path = '.\\text_data\\'

for index, row in df.iterrows():
    id = row['URL_ID']
    filename = str(id) + '.txt'
    url = row['URL']
    
    try:
        response = requests.get(url)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.content, 'html.parser')
        title = soup.find('h1', class_='entry-title').get_text()
        paragraphs = soup.find_all('p')

        with open(path + filename, mode='w', encoding='utf-8') as f:
            f.write(title + '\n')
            for p in paragraphs:
                f.write(p.get_text() + '\n')
        
        f.close()
        with open(path + filename, mode='r', encoding='utf-8') as f:
            text = f.read()
            tokenize(text, id, url)
    except requests.exceptions.RequestException as e:
        pass

URL_ID:  37
URL:  https://insights.blackcoffer.com/ai-in-healthcare-to-improve-patient-outcomes/
pos_score:  102.0
neg_score:  40.0
polarity_score:  0.4366197152350724
subjectivity_score:  0.12251941318160532
average_sentence_length:  26.64
percentage_of_complex_words:  0.26176176176176175
fog_index:  10.760704704704706
average_no_of_words_per_sentence:  26.64
complex_words_count:  523
subjectivity_score:  0.12251941318160532
word_count:  1159
avg_word_len:  5.163163163163163
count_pronoun:  4

URL_ID:  38
URL:  https://insights.blackcoffer.com/what-if-the-creation-is-taking-over-the-creator/
pos_score:  95.0
neg_score:  49.0
polarity_score:  0.31944444222608026
subjectivity_score:  0.18922470408774678
average_sentence_length:  20.4875
percentage_of_complex_words:  0.16473459426479561
fog_index:  8.26089383770592
average_no_of_words_per_sentence:  20.4875
complex_words_count:  270
subjectivity_score:  0.18922470408774678
word_count:  761
avg_word_len:  4.293471629042099
count_pronoun: 

URL_ID:  54
URL:  https://insights.blackcoffer.com/all-you-need-to-know-about-online-marketing/
pos_score:  36.0
neg_score:  1.0
polarity_score:  0.94594592037984
subjectivity_score:  0.08545034622297841
average_sentence_length:  18.88888888888889
percentage_of_complex_words:  0.23647058823529413
fog_index:  7.6501437908496746
average_no_of_words_per_sentence:  18.88888888888889
complex_words_count:  201
subjectivity_score:  0.08545034622297841
word_count:  433
avg_word_len:  4.621176470588235
count_pronoun:  0

URL_ID:  55
URL:  https://insights.blackcoffer.com/evolution-of-advertising-industry/
pos_score:  26.0
neg_score:  5.0
polarity_score:  0.677419332986473
subjectivity_score:  0.07013574644765669
average_sentence_length:  20.175
percentage_of_complex_words:  0.265179677819083
fog_index:  8.176071871127634
average_no_of_words_per_sentence:  20.175
complex_words_count:  214
subjectivity_score:  0.07013574644765669
word_count:  442
avg_word_len:  4.884758364312268
count_pronoun:  1

URL_ID:  71
URL:  https://insights.blackcoffer.com/will-technology-eliminate-the-need-for-animal-testing-in-drug-development/
pos_score:  10.0
neg_score:  27.0
polarity_score:  -0.4594594470416366
subjectivity_score:  0.22981366316885923
average_sentence_length:  18.90909090909091
percentage_of_complex_words:  0.15144230769230768
fog_index:  7.624213286713287
average_no_of_words_per_sentence:  18.90909090909091
complex_words_count:  63
subjectivity_score:  0.22981366316885923
word_count:  161
avg_word_len:  4.336538461538462
count_pronoun:  7

URL_ID:  72
URL:  https://insights.blackcoffer.com/will-we-ever-understand-the-nature-of-consciousness/
pos_score:  45.0
neg_score:  49.0
polarity_score:  -0.04255319103666818
subjectivity_score:  0.0919765165440543
average_sentence_length:  26.64864864864865
percentage_of_complex_words:  0.23073022312373226
fog_index:  10.751751548708953
average_no_of_words_per_sentence:  26.64864864864865
complex_words_count:  455
subjectivity_score:  0.0919765

URL_ID:  87
URL:  https://insights.blackcoffer.com/impact-of-covid-19-pandemic-on-tourism-aviation-industries/
pos_score:  29.0
neg_score:  51.0
polarity_score:  -0.27499999656250007
subjectivity_score:  0.11527377505003779
average_sentence_length:  24.642857142857142
percentage_of_complex_words:  0.19492753623188405
fog_index:  9.935113871635611
average_no_of_words_per_sentence:  24.642857142857142
complex_words_count:  269
subjectivity_score:  0.11527377505003779
word_count:  694
avg_word_len:  4.583333333333333
count_pronoun:  7

URL_ID:  88
URL:  https://insights.blackcoffer.com/impact-of-covid-19-pandemic-on-sports-events-around-the-world/
pos_score:  113.0
neg_score:  125.0
polarity_score:  -0.050420167855377444
subjectivity_score:  0.10818181813264463
average_sentence_length:  20.384615384615383
percentage_of_complex_words:  0.1552201257861635
fog_index:  8.215934204160618
average_no_of_words_per_sentence:  20.384615384615383
complex_words_count:  617
subjectivity_score:  0.1081

URL_ID:  103
URL:  https://insights.blackcoffer.com/what-is-the-repercussion-of-the-environment-due-to-the-covid-19-pandemic-situation/
pos_score:  12.0
neg_score:  30.0
polarity_score:  -0.4285714183673472
subjectivity_score:  0.10370370344764518
average_sentence_length:  20.771428571428572
percentage_of_complex_words:  0.1939477303988996
fog_index:  8.38615052073099
average_no_of_words_per_sentence:  20.771428571428572
complex_words_count:  141
subjectivity_score:  0.10370370344764518
word_count:  405
avg_word_len:  4.491059147180192
count_pronoun:  7

URL_ID:  104
URL:  https://insights.blackcoffer.com/what-is-the-repercussion-of-the-environment-due-to-the-covid-19-pandemic-situation-2/
pos_score:  41.0
neg_score:  76.0
polarity_score:  -0.29914529658850175
subjectivity_score:  0.18810289358825902
average_sentence_length:  24.28
percentage_of_complex_words:  0.23393739703459637
fog_index:  9.805574958813839
average_no_of_words_per_sentence:  24.28
complex_words_count:  284
subjectiv

URL_ID:  119
URL:  https://insights.blackcoffer.com/coronavirus-impact-on-the-hospitality-industry-4/
pos_score:  12.0
neg_score:  16.0
polarity_score:  -0.14285713775510223
subjectivity_score:  0.13023255753380206
average_sentence_length:  25.066666666666666
percentage_of_complex_words:  0.23404255319148937
fog_index:  10.120283687943264
average_no_of_words_per_sentence:  25.066666666666666
complex_words_count:  88
subjectivity_score:  0.13023255753380206
word_count:  215
avg_word_len:  4.949468085106383
count_pronoun:  1

URL_ID:  120
URL:  https://insights.blackcoffer.com/why-scams-like-nirav-modi-happen-with-indian-banks/
pos_score:  87.0
neg_score:  110.0
polarity_score:  -0.11675126844288697
subjectivity_score:  0.14969604851846804
average_sentence_length:  27.136363636363637
percentage_of_complex_words:  0.20979899497487436
fog_index:  10.938465052535406
average_no_of_words_per_sentence:  27.136363636363637
complex_words_count:  501
subjectivity_score:  0.14969604851846804
word_

URL_ID:  135
URL:  https://insights.blackcoffer.com/should-people-wear-fabric-gloves-seeking-evidence-regarding-the-differential-transfer-of-covid-19-or-coronaviruses-generally-between-surfaces/
pos_score:  8.0
neg_score:  9.0
polarity_score:  -0.05882352595155729
subjectivity_score:  0.10119047558815193
average_sentence_length:  23.9375
percentage_of_complex_words:  0.12532637075718014
fog_index:  9.625130548302872
average_no_of_words_per_sentence:  23.9375
complex_words_count:  48
subjectivity_score:  0.10119047558815193
word_count:  168
avg_word_len:  4.164490861618799
count_pronoun:  3

URL_ID:  136
URL:  https://insights.blackcoffer.com/why-is-there-a-severe-immunological-and-inflammatory-explosion-in-those-affected-by-sarms-covid-19/
pos_score:  7.0
neg_score:  20.0
polarity_score:  -0.48148146364883465
subjectivity_score:  0.13636363567493112
average_sentence_length:  21.714285714285715
percentage_of_complex_words:  0.18859649122807018
fog_index:  8.761152882205513
average_no_of

In [14]:
final_results.head()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE COUNT PER WORD,AVG WORD LEN,PRONOUN COUNT
0,37,https://insights.blackcoffer.com/ai-in-healthc...,102.0,40.0,0.43662,0.122519,26.64,0.261762,10.760705,26.64,523,1159,1.874875,5.163163,4
1,38,https://insights.blackcoffer.com/what-if-the-c...,95.0,49.0,0.319444,0.189225,20.4875,0.164735,8.260894,20.4875,270,761,1.589384,4.293472,8
2,39,https://insights.blackcoffer.com/what-jobs-wil...,91.0,46.0,0.328467,0.135644,22.470588,0.251832,9.088968,22.470588,481,1010,1.832461,4.896335,3
3,40,https://insights.blackcoffer.com/will-machine-...,90.0,35.0,0.44,0.160875,19.293478,0.172958,7.786574,19.293478,307,777,1.64507,4.420282,22
4,41,https://insights.blackcoffer.com/will-ai-repla...,89.0,31.0,0.483333,0.125654,25.103896,0.193999,10.119158,25.103896,375,955,1.685463,4.578376,20


In [15]:
final_results.to_csv('output.csv', index=False)