<a href="https://colab.research.google.com/github/AishwaryaSushant/WebScraping_and_TextAnalysis_NLP/blob/main/Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Code by Aishwarya Sushant


In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import string
import re

# Spreadsheet of pages to analyse; the scraping loop reads its
# 'URL_ID' and 'URL' columns.
input_df = pd.read_excel('/content/Input.xlsx')

# Per-URL results, keyed by URL_ID
data_dict = {}

# Empty frame that carries the expected output column layout
output_df = pd.DataFrame(
    columns=[
        'URL_ID',
        'URL',
        'POSITIVE SCORE',
        'NEGATIVE SCORE',
        'POLARITY SCORE',
        'SUBJECTIVITY SCORE',
        'AVG SENTENCE LENGTH',
        'PERCENTAGE OF COMPLEX WORDS',
        'FOG INDEX',
        'AVG NUMBER OF WORDS PER SENTENCE',
        'COMPLEX WORD COUNT',
        'WORD COUNT',
        'SYLLABLE PER WORD',
        'PERSONAL PRONOUNS',
        'AVG WORD LENGTH',
    ],
)

# Download the 'punkt' resource used by the NLTK tokenizers imported above
nltk.download('punkt')

# Gunning FOG readability index
def fog_index(avg_sentence_length, percentage_of_complex_words):
    """Return 0.4 times the sum of the two readability inputs.

    Args:
        avg_sentence_length: average number of words per sentence.
        percentage_of_complex_words: share of complex words, in percent.
    """
    combined = avg_sentence_length + percentage_of_complex_words
    return 0.4 * combined

# Function to calculate the percentage of complex words
def percentage_complex_words(complex_word_count, word_count):
    """Return complex words as a percentage of all words.

    Args:
        complex_word_count: number of words classified as complex.
        word_count: total number of words in the text.

    Returns:
        ``complex_word_count / word_count * 100`` as a float, or ``0.0``
        when ``word_count`` is zero (an empty text has no complex words,
        and dividing would raise ``ZeroDivisionError``).
    """
    if not word_count:
        return 0.0
    return (complex_word_count / word_count) * 100

# Function to count syllables in a word
def count_syllables(word):
    """Estimate the number of syllables in ``word``.

    Every maximal run of consecutive vowels (a, e, i, o, u; case-insensitive)
    counts as one syllable. A trailing "es" or "ed" is treated as silent and
    one syllable is subtracted for it, but only when that final "e" was
    counted as its own vowel run (i.e. it is not preceded by another vowel).

    Args:
        word: the token to analyse.

    Returns:
        The estimated syllable count, never less than 1.
    """
    vowels = "aeiou"
    lowered = word.lower()

    count = 0
    # Track the previous character with a boolean: comparing against an
    # empty-string sentinel would fail, because "" is a substring of every
    # string and a leading vowel would never be counted.
    prev_is_vowel = False
    for char in lowered:
        is_vowel = char in vowels
        if is_vowel and not prev_is_vowel:
            count += 1
        prev_is_vowel = is_vowel

    if lowered.endswith(('es', 'ed')):
        # Only discount the suffix "e" if it opened a vowel run of its own;
        # in words such as "agreed" it is part of the preceding run and was
        # never counted separately.
        if len(lowered) < 3 or lowered[-3] not in vowels:
            count -= 1

    return max(count, 1)

# Load a word list (one entry per line) from a text file
def read_words_from_file(file_path):
    """Return the lines of ``file_path`` with surrounding whitespace removed.

    The file is decoded as ISO-8859-1. Every line yields one list entry, in
    file order; blank lines are kept as empty strings.
    """
    with open(file_path, encoding='ISO-8859-1') as handle:
        return [raw_line.strip() for raw_line in handle]

# Stop-word lists, one file per category
stopword_paths = [
    '/content/StopWords_Auditor.txt',
    '/content/StopWords_Currencies.txt',
    '/content/StopWords_DatesandNumbers.txt',
    '/content/StopWords_Generic.txt',
    '/content/StopWords_GenericLong.txt',
    '/content/StopWords_Names.txt',
]

# Merge every stop-word file into a single set for fast membership tests
SW = set()
for stopword_path in stopword_paths:
    SW |= set(read_words_from_file(stopword_path))

# Sentiment lexicons
positive_words_path = '/content/positive-words.txt'
negative_words_path = '/content/negative-words.txt'
positive_words = read_words_from_file(positive_words_path)
negative_words = read_words_from_file(negative_words_path)

# Map each lexicon word that is not a stop word to its sentiment label.
# Positive entries are inserted first, so a word present in both lexicons
# ends up labelled 'negative'.
Master_dictionary = {}
for label, lexicon in (('positive', positive_words), ('negative', negative_words)):
    Master_dictionary.update(
        {entry: label for entry in lexicon if entry not in SW}
    )

# Function to calculate personal pronouns
# Compiled once: a single alternation replaces one regex scan per pronoun.
_PERSONAL_PRONOUN_RE = re.compile(
    r'\b(?:I|me|my|mine|we|us|our|ours)\b', re.IGNORECASE
)


def count_personal_pronouns(text):
    """Count first-person pronouns in ``text``.

    Whole-word, case-insensitive matches of I, me, my, mine, we, us, our and
    ours are counted. The all-capitals token "US" is skipped because it is
    the country abbreviation rather than the pronoun "us".

    Args:
        text: the text to scan.

    Returns:
        The number of pronoun occurrences as an int.
    """
    return sum(
        1
        for match in _PERSONAL_PRONOUN_RE.finditer(text)
        if match.group() != 'US'
    )

# Loop through each URL in input data: fetch the page, extract the article
# text and compute the sentiment / readability metrics for it.
# Relies on names defined earlier in this cell: input_df, data_dict,
# Master_dictionary, count_syllables, count_personal_pronouns,
# percentage_complex_words and fog_index.
for _, row in input_df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']

    # Send a GET request to the URL. The timeout keeps one unresponsive host
    # from stalling the whole run; a failed request skips only this URL.
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Error: Could not fetch {url}: {exc}")
        continue

    # Parse HTML with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Pages without an <article> element are reported and skipped
    if soup.find('article') is None:
        print("Error: Article not available")
        continue

    # A page may lack a <title> tag; fall back to an empty title
    title = soup.title.text if soup.title is not None else ''

    # Collect the text of every element matching either content class
    para = [
        element.text
        for element in soup.find_all(class_='td-post-content tagdiv-type')
    ]
    para += [
        element.text
        for element in soup.find_all(class_='tdb-block-inner td-fix-index')
    ]
    text = ' '.join(para)

    # Tokenize once; punctuation-only tokens are not words and would
    # otherwise inflate the word count and every ratio derived from it.
    words = [
        token for token in word_tokenize(text)
        if any(ch.isalnum() for ch in token)
    ]
    total_words = len(words)
    total_sentences = len(sent_tokenize(text))

    # Nothing to score (and every ratio below would divide by zero)
    if total_words == 0 or total_sentences == 0:
        print(f"Error: No article text found for {url}")
        continue

    # Positive / negative scores from Master_dictionary. The exact token is
    # tried first, then its lower-case form, so capitalised words can match.
    positive_score = 0
    negative_score = 0
    for token in words:
        label = Master_dictionary.get(
            token, Master_dictionary.get(token.lower())
        )
        if label == 'positive':
            positive_score += 1
        elif label == 'negative':
            negative_score += 1

    # The small constant keeps the denominators non-zero
    polarity_score = (positive_score - negative_score) / (positive_score + negative_score + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (total_words + 0.000001)

    # Both sentence-length metrics use the same words-per-sentence formula
    avg_sentence_length = total_words / total_sentences
    avg_words_per_sentence = total_words / total_sentences

    # Syllables are counted for every word. For the FOG index a complex word
    # is one with more than two syllables.
    syllable_counts = [count_syllables(token) for token in words]
    total_syllables = sum(syllable_counts)
    complex_word_count = sum(1 for syllables in syllable_counts if syllables > 2)
    complex_percentage = percentage_complex_words(complex_word_count, total_words)

    fog = fog_index(avg_sentence_length, complex_percentage)

    syllables_per_word = total_syllables / total_words

    # Average word length is measured in characters, not syllables
    avg_word_length = sum(len(token) for token in words) / total_words

    personal_pronouns = count_personal_pronouns(text)

    # Store the result. URL_ID is kept as a regular field because the
    # DataFrame index is not written to the output file.
    data_dict[url_id] = {
        'URL_ID': url_id,
        'URL': url,
        'title': title,
        'article_text': para,
        'POSITIVE SCORE': positive_score,
        'NEGATIVE SCORE': negative_score,
        'POLARITY SCORE': polarity_score,
        'SUBJECTIVITY SCORE': subjectivity_score,
        'AVG SENTENCE LENGTH': avg_sentence_length,
        'PERCENTAGE OF COMPLEX WORDS': complex_percentage,
        'FOG INDEX': fog,
        'AVG NUMBER OF WORDS PER SENTENCE': avg_words_per_sentence,
        'COMPLEX WORD COUNT': complex_word_count,
        'WORD COUNT': total_words,
        'SYLLABLE PER WORD': syllables_per_word,
        'PERSONAL PRONOUNS': personal_pronouns,
        'AVG WORD LENGTH': avg_word_length
    }

# Summary of how many input rows produced a result
print(f"Processed {len(data_dict)} of {len(input_df)} URLs")

# Convert the data_dict to a DataFrame and save it
output_df = pd.DataFrame.from_dict(data_dict, orient='index')
output_df.to_excel('/content/Output.xlsx', index=False)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Error: Article not available
Error: Article not available
1


In [None]:
import nltk
# Download the NLTK 'words' corpus (the data behind nltk.corpus.words)
nltk.download('words')


[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True