<a href="https://colab.research.google.com/github/GSP31/GSP31/blob/main/Textual_metrics_analysis_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install requests beautifulsoup4



In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import scipy as sp
import requests
import re
import string
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

%matplotlib inline
import nltk
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Load the spreadsheet that lists the articles to process; the processing loop
# reads its 'URL_ID' and 'URL' columns.
input_file = pd.read_excel(io='Input.xlsx')

In [None]:
# define stop words lists
def _read_word_set(path, encoding=None):
    """Read a word-list file into a set of lowercase tokens.

    The file is split on whitespace. Entries are lowercased because the
    processing loop lowercases every token before testing membership, so an
    entry stored in any other case could never match.

    Args:
        path: Path of the word-list file.
        encoding: Text encoding passed to ``open``; ``None`` uses the platform
            default, as the original code did.

    Returns:
        set[str]: The lowercased tokens found in the file.
    """
    # The context manager closes the file handle as soon as it has been read.
    with open(path, encoding=encoding) as handle:
        return {token.lower() for token in handle.read().split()}


stopwords_auditor = _read_word_set('StopWords_Auditor.txt')
stopwords_currencies = _read_word_set('StopWords_Currencies.txt', encoding='ISO-8859-1')
stopwords_datesandnumbers = _read_word_set('StopWords_DatesandNumbers.txt')
stopwords_generic = _read_word_set('StopWords_Generic.txt')
stopwords_genericlong = _read_word_set('StopWords_GenericLong.txt')
stopwords_geographic = _read_word_set('StopWords_Geographic.txt')
stopwords_names = _read_word_set('StopWords_Names.txt')

# NOTE: this rebinds the name `stopwords` imported from nltk.corpus in the import
# cell; from here on `stopwords` is the combined custom set, not the NLTK corpus.
stopwords = stopwords_auditor.union(
    stopwords_currencies,
    stopwords_datesandnumbers,
    stopwords_generic,
    stopwords_genericlong,
    stopwords_geographic,
    stopwords_names,
)

In [None]:
# define positive and negative words lists
# Each file is split on whitespace into a set. Entries are lowercased because the
# tokens they are compared against are lowercased first, and the files are opened
# in a `with` block so the handles are closed after reading.
with open('positive-words.txt') as positive_file:
    positive_words = {entry.lower() for entry in positive_file.read().split()}
with open('negative-words.txt', encoding='ISO-8859-1') as negative_file:
    negative_words = {entry.lower() for entry in negative_file.read().split()}


In [None]:
# define function to calculate syllable count per word
def syllable_count(word):
    """Estimate the number of syllables in ``word`` with a vowel-group heuristic.

    Every maximal run of consecutive vowels (a, e, i, o, u) counts as one
    syllable. One syllable is subtracted when the word ends in "es" or "ed",
    and any non-empty word is reported as having at least one syllable.
    Matching is case-insensitive.

    Args:
        word (str): The word to analyse.

    Returns:
        int: The estimated syllable count; 0 for an empty string.
    """
    # An empty token has no syllables (indexing word[0] would raise IndexError).
    if not word:
        return 0

    vowels = "aeiou"
    word = word.lower()  # so upper-case vowels are counted as well

    count = 0
    previous_is_vowel = False
    for char in word:
        is_vowel = char in vowels
        # Count only the first vowel of each consecutive vowel group.
        if is_vowel and not previous_is_vowel:
            count += 1
        previous_is_vowel = is_vowel

    # Endings "es"/"ed" are treated as not adding a syllable.
    if word.endswith(('es', 'ed')):
        count -= 1

    # Words without a counted vowel group still count as one syllable.
    return max(count, 1)

In [None]:
# loop through the URLs and compute variables

# Personal pronouns to count; matched case-insensitively against the raw text.
PERSONAL_PRONOUN_RE = re.compile(r'\b(I|we|my|ours|us)\b', re.IGNORECASE)
# Upper bound for a single HTTP request so one unresponsive site cannot hang the run.
REQUEST_TIMEOUT_SECONDS = 30


def _ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _extract_article_text(html):
    """Return the paragraph text of the first <article> tag, or None if there is none.

    The stripped text of each <p> inside the article is followed by a single space.
    """
    article = BeautifulSoup(html, 'html.parser').find('article')
    if article is None:
        return None
    return ''.join(p.get_text().strip() + ' ' for p in article.find_all('p'))


def _clean_tokens(text):
    """Tokenize ``text`` into lowercase words without stop words or punctuation."""
    kept = []
    for token in word_tokenize(text):
        token = token.lower()
        if token in stopwords:
            continue
        # string.punctuation is a str, so `token in string.punctuation` is a
        # substring test that lets multi-character punctuation tokens through
        # (e.g. the quote tokens `` and ''); check every character instead.
        if all(char in string.punctuation for char in token):
            continue
        kept.append(token)
    return kept


def _count_personal_pronouns(text):
    """Count I/we/my/ours/us in ``text``, skipping the upper-case abbreviation "US"."""
    return sum(1 for match in PERSONAL_PRONOUN_RE.finditer(text) if match.group() != 'US')


def _text_metrics(article_text):
    """Compute the sentiment and readability metrics for one article.

    Args:
        article_text (str): Raw article text as extracted from the page.

    Returns:
        dict: Metric name -> value, in the column order of the output sheet.
        Ratios whose denominator is zero (no words or no sentences) are 0.0.
    """
    # Tokenize and clean once; every word-level metric reuses this list instead of
    # re-tokenizing the re-joined cleaned text for each computation.
    tokens = _clean_tokens(article_text)
    word_count = len(tokens)
    num_sentences = len(sent_tokenize(article_text))

    # calculate positive and negative scores
    pos_score = sum(1 for token in tokens if token in positive_words)
    neg_score = sum(1 for token in tokens if token in negative_words)

    # calculate polarity and subjectivity scores (the small constant avoids
    # division by zero)
    polarity_score = (pos_score - neg_score) / ((pos_score + neg_score) + 0.000001)
    subj_score = (pos_score + neg_score) / (word_count + 0.000001)

    # calculate average sentence length, percentage of complex words, and fog index
    avg_sent_len = _ratio(word_count, num_sentences)
    syllables = [syllable_count(token) for token in tokens]
    # A word is "complex" when it has more than two syllables.
    complex_word_count = sum(1 for n_syllables in syllables if n_syllables > 2)
    percent_complex_words = _ratio(complex_word_count, word_count) * 100
    fog_index = 0.4 * (avg_sent_len + percent_complex_words)

    # The average number of words per sentence uses the same formula as the
    # average sentence length (words / sentences), so the value is reused; it is
    # still reported as its own output column.
    avg_words_per_sent = avg_sent_len

    # calculate syllable per word and average word length
    syll_per_word = _ratio(sum(syllables), word_count)
    avg_word_len = _ratio(sum(len(token) for token in tokens), word_count)

    # Pronouns are counted on the raw text: the cleaned tokens are lowercased (so
    # "I" could never match) and may already have had these words removed as
    # stop words.
    personal_pronouns = _count_personal_pronouns(article_text)

    return {
        'Positive Score': pos_score,
        'Negative Score': neg_score,
        'Polarity Score': polarity_score,
        'Subjectivity Score': subj_score,
        'Average Sentence Length': avg_sent_len,
        'Percentage of Complex Words': percent_complex_words,
        'Fog Index': fog_index,
        'Average Number of Words per Sentence': avg_words_per_sent,
        'Complex Word Count': complex_word_count,
        'Word Count': word_count,
        'Syllables per Word': syll_per_word,
        'Personal Pronouns': personal_pronouns,
        'Average Word Length': avg_word_len
    }


output = []
for _, row in input_file.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        # A single unreachable URL should not abort the whole run.
        print(f'Request failed for URL {url}: {exc}')
        continue
    # check if the article tag is present and extract article text
    article_text = _extract_article_text(response.content)
    if article_text is None:
        print(f'Article not found for URL {url}')
        continue
    # save article text to file; an explicit encoding keeps non-ASCII characters
    # from failing on platforms whose default encoding cannot represent them
    with open(f'{url_id}.txt', 'w', encoding='utf-8') as f:
        f.write(article_text)
    # append results to output list
    output.append({'URL_ID': url_id, 'URL': url, **_text_metrics(article_text)})

Article not found for URL https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/
Article not found for URL https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/
Article not found for URL https://insights.blackcoffer.com/ensuring-growth-through-insurance-technology/


In [None]:
# Turn the per-article metric records into a table and write it to an Excel
# workbook without the DataFrame index column.
output_file = pd.DataFrame(data=output)
output_file.to_excel(excel_writer='Output.xlsx', index=False)