In [9]:
#loading all neccessary modules
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as BS

from nltk import re, word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import SyllableTokenizer
from textstat.textstat import textstatistics

In [10]:
# opening the file
data=pd.read_excel('Input.xlsx')

In [None]:
# this function crawls the websites and extract the data into .txt files
def text_extraction(dataframe):
    df=dataframe
    for i in range(0,len(df)):
        url=df.iloc[i,1]
        filename=df.iloc[i,0]
        response=requests.get(url)
        parsed=BS(response.text,'html.parser')
        art=str(parsed.article)
        if art!='None':
            para=BS(art,'html.parser').find_all('p')
            title=parsed.title.text
            txt=BS(str(para),'html.parser').text
            title=title.replace('- Blackcoffer Insights','')
            article=title+'.\n'+txt

            f=open(f'{filename}.txt','wb')
            f.write(article.encode('utf-8'))
            f.close()
        else:
            f=open(f'{filename}.txt','w')
            f.close()

In [None]:
#to crawl the web pages and extract the data into a text file
text_extraction(data)

In [None]:
stop_words_files=['StopWords_Auditor.txt','StopWords_Currencies.txt','StopWords_DatesandNumbers.txt',
                  'StopWords_Generic.txt','StopWords_GenericLong.txt','StopWords_Geographic.txt','StopWords_Names.txt']

stop_words=''
for x in stop_words_files:  # stopwords are extracted from given files
    with open(x) as stop:
        txt=stop.read()
        stop.close()
        stop_words=stop_words+txt

stop_words_list=stop_words.lower().replace('|','').split()
stop_words=list(set(stop_words_list+stopwords.words('english')))

In [None]:
#negative and positive words are extracted from given files
with open('positive-words.txt') as ps:
    ps_word=ps.read()
    ps.close()
with open('negative-words.txt') as ng:
    ng_word=ng.read()
    ng.close()

pos_words_list=ps_word.split()
neg_words_list=ng_word.split()

In [None]:
#this function cleans the text and generate tokens(nltk module is used for natural language processing)
def tokens(text, stopwords):
    stop_words= stopwords
    txt=re.sub(r'[^a-zA-Z\s]',' ',text,).replace('US','united states').replace('IT','Information Technology').lower()
    tokens=word_tokenize(txt)
    filtered_words=[word for word in tokens if word not in stop_words]
    return filtered_words

In [None]:
# sentimental analysis is done using this function
def sentimental_anlysis(tokens,pos,neg):
    positive_words=pos
    negative_words=neg

    pos_score=len([word for word in tokens if word in positive_words])   #1
    neg_score=len([word for word in tokens if word in negative_words])    #2
    pol_score=round((pos_score - neg_score)/ ((pos_score + neg_score) + 0.000001),2)  #3
    sub_score=round((pos_score + neg_score)/ (len(tokens) + 0.000001),2)   #4

    return [pos_score,neg_score,pol_score,sub_score]

In [None]:
#text analysis for readability is done using nltk and textstat modules
def Text_analysis(text):

    new_text=re.sub(r'[^.a-zA-Z\s]','',text)

    sentences=new_text.split('.')
    total_sentences=len(sentences)
    words=new_text.replace('.','').split()
    total_words=len(words)

    syllable_count={
        x: textstatistics().syllable_count(text=x) for x in words
    }
    complex_words=[]
    for x in words:
        if syllable_count[x]>2:
            complex_words.append(x)
    COMPLEX_WORD_COUNT=len(complex_words)  #9

    AVG_SENTENCE_LENGTH=textstatistics().avg_sentence_length(new_text)  #5
    PERCENTAGE_OF_COMPLEX_WORDS=round((COMPLEX_WORD_COUNT / total_words)*100,2)  #6
    FOG_INDEX=round(0.4 * (AVG_SENTENCE_LENGTH + PERCENTAGE_OF_COMPLEX_WORDS),2) #7

    AVG_NUMBER_OF_WORDS_PER_SENTENCE= round(total_words / total_sentences,2)  #8

    WORD_COUNT=len([x for x in words if x.lower() not in stopwords.words('english')]) #10

    num=0
    for x in words:
        num=num+textstatistics().syllable_count(x)
    SYLLABLE_PER_WORD= round(num/total_words,2)  #11

    PP=re.findall(r' I | you[rs] | he | she | it | we | they | me | him | her | us | them ', new_text)
    PERSONAL_PRONOUNS=len(PP)   #12

    AVG_WORD_LENGTH=textstatistics().avg_character_per_word(new_text)  #13

    return [AVG_SENTENCE_LENGTH, PERCENTAGE_OF_COMPLEX_WORDS, FOG_INDEX, AVG_NUMBER_OF_WORDS_PER_SENTENCE,
            COMPLEX_WORD_COUNT, WORD_COUNT, SYLLABLE_PER_WORD, PERSONAL_PRONOUNS, AVG_WORD_LENGTH]

In [None]:
columns=['URL_ID','POSITIVE_SCORE','NEGATIVE_SCORE','POLARITY_SCORE','SUBJECTIVITY_SCORE',
    'AVG_SENTENCE_LENGTH','PERCENTAGE_OF_COMPLEX_WORDS','FOG_INDEX','AVG_NUMBER_OF_WORDS_PER_SENTENCE',
    'COMPLEX_WORD_COUNT','WORD_COUNT','SYLLABLE_PER_WORD','PERSONAL_PRONOUNS','AVG_WORD_LENGTH']
# an empty datagrame is created for storing furthur calculated data
output=pd.DataFrame(columns=columns)

In [None]:
# all functions are applied here to fill the empty dataframe with calculated data
for i in data['URL_ID']:
    f=open(f'{i}.txt','rb')
    text=f.read().decode()
    f.close()

    if text != '':
        tk=tokens(text,stop_words)
        SA=sentimental_anlysis(tk,pos_words_list,neg_words_list)
        TA=Text_analysis(text)
        output.loc[len(output.index)]=[i]+SA+TA

    else:
        output.loc[len(output.index)]=[i]+[0,0,0,0,0,0,0,0,0,0,0,0,0]


In [None]:
Output_data=pd.merge(data,output,on='URL_ID')
#the data is now stored in an excel file 'Output_data.xlsx'
Output_data.to_excel('Output_data.xlsx')

In [None]:
Output_data

Unnamed: 0,URL_ID,URL,POSITIVE_SCORE,NEGATIVE_SCORE,POLARITY_SCORE,SUBJECTIVITY_SCORE,AVG_SENTENCE_LENGTH,PERCENTAGE_OF_COMPLEX_WORDS,FOG_INDEX,AVG_NUMBER_OF_WORDS_PER_SENTENCE,COMPLEX_WORD_COUNT,WORD_COUNT,SYLLABLE_PER_WORD,PERSONAL_PRONOUNS,AVG_WORD_LENGTH
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,2,1,0.33,0.02,14.0,11.68,10.27,12.54,41,182,1.49,2,4.62
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,49,31,0.22,0.13,17.5,21.66,15.66,17.27,303,835,1.76,15,5.55
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,36,25,0.18,0.10,18.6,29.08,19.07,18.26,308,664,2.02,15,6.18
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,35,74,-0.36,0.19,20.0,28.23,19.29,19.58,293,657,1.95,11,6.04
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,21,9,0.40,0.09,17.0,19.12,14.45,16.19,130,410,1.72,7,5.59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,blackassign0096,https://insights.blackcoffer.com/what-is-the-r...,25,54,-0.37,0.15,21.8,19.87,16.67,20.98,221,642,1.68,4,5.30
96,blackassign0097,https://insights.blackcoffer.com/impact-of-cov...,21,35,-0.25,0.15,28.2,13.23,16.57,26.17,142,524,1.50,26,4.74
97,blackassign0098,https://insights.blackcoffer.com/contribution-...,1,0,1.00,0.01,24.7,20.95,18.26,24.67,31,99,1.69,1,5.36
98,blackassign0099,https://insights.blackcoffer.com/how-covid-19-...,11,3,0.57,0.06,17.7,10.34,11.22,17.16,55,305,1.49,6,4.84


In [7]:
pip install textstat


Defaulting to user installation because normal site-packages is not writeable
Collecting textstat
  Downloading textstat-0.7.4-py3-none-any.whl.metadata (14 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.0-py3-none-any.whl.metadata (3.2 kB)
Downloading textstat-0.7.4-py3-none-any.whl (105 kB)
Downloading pyphen-0.17.0-py3-none-any.whl (2.1 MB)
   ---------------------------------------- 0.0/2.1 MB ? eta -:--:--
   ----- ---------------------------------- 0.3/2.1 MB ? eta -:--:--
   --------------- ------------------------ 0.8/2.1 MB 2.2 MB/s eta 0:00:01
   ------------------------- -------------- 1.3/2.1 MB 2.2 MB/s eta 0:00:01
   ----------------------------------- ---- 1.8/2.1 MB 2.2 MB/s eta 0:00:01
   ---------------------------------------- 2.1/2.1 MB 2.2 MB/s eta 0:00:00
Installing collected packages: pyphen, textstat
Successfully installed pyphen-0.17.0 textstat-0.7.4
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: C:\Users\Admin\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip
