In [1]:
import pandas as pd
import requests
import os
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
import syllables
import warnings
warnings.filterwarnings('ignore')

In [2]:
cwd = os.getcwd()
cwd

'C:\\Users\\adity\\OneDrive\\Desktop\\analysis'

In [3]:
df = pd.read_excel("Input.xlsx")
df.head()

Unnamed: 0,URL_ID,URL
0,37,https://insights.blackcoffer.com/ai-in-healthc...
1,38,https://insights.blackcoffer.com/what-if-the-c...
2,39,https://insights.blackcoffer.com/what-jobs-wil...
3,40,https://insights.blackcoffer.com/will-machine-...
4,41,https://insights.blackcoffer.com/will-ai-repla...


In [4]:
df.URL[0]

'https://insights.blackcoffer.com/ai-in-healthcare-to-improve-patient-outcomes/'

In [5]:
df.URL_ID.dtypes

dtype('int64')

In [6]:
stop_words = set()

stop_word_dir = '.\\stop_words'

# Iterate over all files in the stop word directory
for filename in os.listdir(stop_word_dir):
    file_path = os.path.join(stop_word_dir, filename)
    if os.path.isfile(file_path) and filename.endswith('.txt'):
        with open(file_path, 'r') as f:
            # Read the stop words from the file and add them to the set
            words = f.read().split()
            print(words)
            lowercase_words = []
            for word in words:
                lowercase_words.append(word.casefold())
            stop_words.update(lowercase_words)

print(stop_words)

['ERNST', 'YOUNG', 'DELOITTE', 'TOUCHE', 'KPMG', 'PRICEWATERHOUSECOOPERS', 'PRICEWATERHOUSE', 'COOPERS']
['AFGHANI', '|', 'Afghanistan', 'ARIARY', '|', 'Madagascar', 'BAHT', '|', 'Thailand', 'BALBOA', '|', 'Panama', 'BIRR', '|', 'Ethiopia', 'BOLIVAR', '|', 'Venezuela', 'BOLIVIANO', '|', 'Bolivia', 'CEDI', '|', 'Ghana', 'COLON', '|', 'Costa', 'Rica', 'CÓRDOBA', '|', 'Nicaragua', 'DALASI', '|', 'Gambia', 'DENAR', '|', 'Macedonia', '(Former', 'Yug.', 'Rep.)', 'DINAR', '|', 'Algeria', 'DIRHAM', '|', 'Morocco', 'DOBRA', '|', 'São', 'Tom', 'and', 'Príncipe', 'DONG', '|', 'Vietnam', 'DRAM', '|', 'Armenia', 'ESCUDO', '|', 'Cape', 'Verde', 'EURO', '|', 'Belgium', 'FLORIN', '|', 'Aruba', 'FORINT', '|', 'Hungary', 'GOURDE', '|', 'Haiti', 'GUARANI', '|', 'Paraguay', 'GULDEN', '|', 'Netherlands', 'Antilles', 'HRYVNIA', '|', 'Ukraine', 'KINA', '|', 'Papua', 'New', 'Guinea', 'KIP', '|', 'Laos', 'KONVERTIBILNA', 'MARKA', '|', 'Bosnia-Herzegovina', 'KORUNA', '|', 'Czech', 'Republic', 'KRONA', '|', 'Swe

In [7]:
def count_syllables(word):
    return syllables.estimate(word)

In [8]:
with open('positive-words.txt', 'r') as f:
    pos_words = f.read().split()
with open('negative-words.txt', 'r') as f:
    neg_words = f.read().split()
    
custom_lexicon = {}

pos_words = [word for word in pos_words if word.casefold() not in stop_words]
neg_words = [word for word in neg_words if word.casefold() not in stop_words]

for word in pos_words:
    custom_lexicon[word] = 1
    
for word in neg_words:
    custom_lexicon[word] = -1

In [9]:
def get_results(no_of_words, no_of_sentence, filtered_word_tokens, no_of_filtered_words, no_of_filtered_sentence, complex_words):
    sia = SentimentIntensityAnalyzer()
    sia.lexicon.update(custom_lexicon)

    pos_score = 0
    neg_score = 0

    for word in filtered_word_tokens:
        sentiment_scores = sia.polarity_scores(word)
        pos_score += sentiment_scores['pos']
        neg_score += sentiment_scores['neg']

    polarity_score = (pos_score-neg_score)/ ((pos_score + neg_score) + 0.000001)

    subjectivity_score = (pos_score + neg_score)/ ((no_of_filtered_words) + 0.000001)

    average_sentence_length = no_of_words/no_of_sentence

    percentage_of_complex_words = len(complex_words)/no_of_words

    fog_index = 0.4 * (average_sentence_length + percentage_of_complex_words)

    average_no_of_words_per_sentence = no_of_words/no_of_sentence
    
    print(f"pos_score : {pos_score}, neg_score : {neg_score}, polarity_score: {polarity_score}, subjectivity_score : {subjectivity_score}, average_sentence_length: {average_sentence_length}, percentage_of_complex_words : {percentage_of_complex_words}, fog_index : {fog_index}, average_no_of_words_per_sentence : {average_no_of_words_per_sentence}")
    
    return pos_score, neg_score, polarity_score, subjectivity_score, average_sentence_length, percentage_of_complex_words, fog_index, average_no_of_words_per_sentence, len(complex_words), no_of_filtered_words


In [10]:
final_results = pd.DataFrame(columns = ['URL_ID', 'URL', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE', 'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX', 'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT', 'WORD COUNT'])

In [11]:

def tokenize(text, URL_ID, URL):
    global final_results
    word_tokens = word_tokenize(text)
    sentence_tokens = sent_tokenize(text)

    no_of_words = len(word_tokens)
    no_of_sentence = len(sentence_tokens)

    filtered_word_tokens = [word for word in word_tokens if not word.lower() in stop_words]
    filtered_sent_tokens = [word for word in sentence_tokens if not word.lower() in stop_words]

    complex_words = [word for word in word_tokens if count_syllables(word) >= 3]

    no_of_filtered_words = len(filtered_word_tokens)
    no_of_filtered_sentence = len(filtered_sent_tokens)

    ps, ns, pos, ss, asl, pocw, fi, aws, cwc, wc = get_results(no_of_words, no_of_sentence, filtered_word_tokens, no_of_filtered_words, no_of_filtered_sentence, complex_words)
    f.close()
    
    final_results = final_results.append({'URL_ID': URL_ID, 'URL': URL, 'POSITIVE SCORE' : ps, 'NEGATIVE SCORE': ns, 'POLARITY SCORE': pos, 'SUBJECTIVITY SCORE' : ss, 'AVG SENTENCE LENGTH': asl, 'PERCENTAGE OF COMPLEX WORDS': pocw, 'FOG INDEX': fi, 'AVG NUMBER OF WORDS PER SENTENCE': aws, 'COMPLEX WORD COUNT': cwc, 'WORD COUNT': wc} , ignore_index=True)

In [12]:
path = '.\\text_data\\'

for index, row in df.iterrows():
    id = row['URL_ID']
    filename = str(id) + '.txt'
    url = row['URL']
    
    try:
        response = requests.get(url)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.content, 'html.parser')
        title = soup.find('h1', class_='entry-title').get_text()
        paragraphs = soup.find_all('p')

        with open(path + filename, mode='w', encoding='utf-8') as f:
            f.write(title + '\n')
            for p in paragraphs:
                f.write(p.get_text() + '\n')
        
        f.close()
        with open(path + filename, mode='r', encoding='utf-8') as f:
            text = f.read()
            tokenize(text, id, url)
    except requests.exceptions.RequestException as e:
        print(f'Error: {e}')

pos_score : 102.0, neg_score : 40.0, polarity_score: 0.4366197152350724, subjectivity_score : 0.12251941318160532, average_sentence_length: 26.64, percentage_of_complex_words : 0.26176176176176175, fog_index : 10.760704704704706, average_no_of_words_per_sentence : 26.64
pos_score : 95.0, neg_score : 49.0, polarity_score: 0.31944444222608026, subjectivity_score : 0.18922470408774678, average_sentence_length: 20.4875, percentage_of_complex_words : 0.16473459426479561, fog_index : 8.26089383770592, average_no_of_words_per_sentence : 20.4875
pos_score : 91.0, neg_score : 46.0, polarity_score: 0.3284671508871011, subjectivity_score : 0.13564356422213508, average_sentence_length: 22.470588235294116, percentage_of_complex_words : 0.2518324607329843, fog_index : 9.08896827841084, average_no_of_words_per_sentence : 22.470588235294116
pos_score : 90.0, neg_score : 35.0, polarity_score: 0.43999999648000004, subjectivity_score : 0.16087516066811433, average_sentence_length: 19.293478260869566, per

KeyboardInterrupt: 

In [None]:
final_results.to_csv('output.csv', index=False)