In [12]:
#pip install openpyxl
#pip install selenium
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('wordnet')
#nltk.download('stopwords')


    

In [13]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk import SyllableTokenizer 
from nltk.corpus import stopwords
from openpyxl import Workbook
from openpyxl import load_workbook
import re
import string
from selenium.common.exceptions import WebDriverException
import requests


In [14]:
def count_words(tokens):
    """
    Count the tokens that remain after stripping punctuation and dropping stop words.

    Args:
    - tokens (iterable of str): The tokens to examine.

    Returns:
    - int: The number of tokens that are non-empty once their punctuation is
      removed and whose lower-cased form is not an NLTK English stop word.
    """
    english_stop_words = set(stopwords.words('english'))

    # One translation table is enough for every token.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    cleaned_tokens = (token.translate(strip_punctuation) for token in tokens)

    return sum(
        1
        for cleaned in cleaned_tokens
        if cleaned != '' and cleaned.lower() not in english_stop_words
    )

In [15]:

def write_record_to_excel(wb, record):
    """
    Append one record as a new row of the workbook's active worksheet.

    The workbook is only modified in memory; this function does not save it.

    Args:
    - wb (openpyxl.Workbook): The workbook whose active sheet receives the row.
    - record (tuple): The cell values of the row, in column order.

    Returns:
    - None
    """
    wb.active.append(record)

In [16]:
def create_excel_file(file_path, headers):
    """
    Create a new Excel workbook whose first row holds the given headers.

    Args:
    - file_path (str): Where the workbook is saved.
    - headers (list of str): The column titles written as the first row.

    Returns:
    - None
    """
    new_workbook = Workbook()
    new_workbook.active.append(headers)
    new_workbook.save(file_path)

In [29]:
def create_text_file(url,output_text_file):
    """
    Save the heading and body text of the web page at ``url`` to a text file.

    A Chrome WebDriver session is opened for the page. The text of the first
    ``<h1>`` element is written as the first line, followed by the text of
    every ``div.td-post-content`` element (joined with newlines).

    Args:
    - url (str): The address of the page to open.
    - output_text_file (str): The path of the text file to write.

    Returns:
    - None

    Raises:
    - Any exception raised by Selenium while loading the page or locating the
      ``<h1>`` element is propagated; the browser is closed first.
    """
    driver = webdriver.Chrome()
    try:
        # Open the URL in the webdriver
        driver.get(url)

        # Extract article heading
        article_heading = driver.find_element(By.TAG_NAME, "h1").text

        # Extract article content
        content_elements = driver.find_elements(By.CSS_SELECTOR, "div.td-post-content")
        article_content = '\n'.join(element.text for element in content_elements)
    finally:
        # Always close the browser, even when scraping fails, so that a failed
        # page does not leave a Chrome process running.
        driver.quit()

    # Write the article heading and content to the file.
    # Characters that cp437 cannot represent are dropped (errors='ignore').
    with open(output_text_file, "w", encoding="cp437", errors='ignore') as file:
        file.write(f"{article_heading}")
        file.write(f"\n{article_content}")

In [18]:
def get_stopwords(stopwords_folder_path):
    """
    Collect the whitespace-separated words of every ``.txt`` file in a folder.

    Args:
    - stopwords_folder_path (str): The folder holding the stop-word files.

    Returns:
    - stop_words_list (list): Every word found, in the order the files are
      listed by ``os.listdir`` and then in reading order within each file.
      Duplicates are kept.
    """
    stop_words_list = []

    text_file_names = (
        name for name in os.listdir(stopwords_folder_path) if name.endswith(".txt")
    )
    for name in text_file_names:
        with open(os.path.join(stopwords_folder_path, name), "r") as handle:
            # Splitting the whole content on whitespace yields the same words
            # as splitting it line by line.
            stop_words_list += handle.read().split()

    return stop_words_list

In [19]:
def tokanize_text(output_text_file):
    """
    Tokenize the text stored in a file with NLTK's ``word_tokenize``.

    The file is read as cp437; bytes that cannot be decoded are ignored.

    Args:
    - output_text_file (str): The path of the text file to read.

    Returns:
    - tokens (list): The tokens produced by ``word_tokenize``.
    """
    with open(output_text_file, 'r', encoding="cp437", errors='ignore') as source:
        file_text = source.read()

    return word_tokenize(file_text)

In [20]:
def sylabble_analysis(tokens):
    """
    Compute syllable statistics for a list of tokens.

    Tokens ending in "es" or "ed" are not passed to the syllable tokenizer, so
    they add nothing to the syllable or complex-word counts. They still count
    towards the total number of tokens used as the denominator.

    Args:
    - tokens (list): The list of tokens.

    Returns:
    - list of three values, in this order:
      - percentage_of_complex_words (float): complex words / total tokens * 100.
      - complex_word_count (int): The number of words with more than 2 syllables.
      - syllable_per_word (float): The number of syllables / total tokens.
      All three are 0 when ``tokens`` is empty.
    """
    splitter = SyllableTokenizer()
    token_total = len(tokens)

    # Nothing to measure: report zeros instead of dividing by zero.
    if token_total == 0:
        return [0, 0, 0]

    syllables_per_word = [
        len(splitter.tokenize(word))
        for word in tokens
        if not word.endswith(("es", "ed"))
    ]
    syllable_total = sum(syllables_per_word)
    complex_total = sum(1 for syllables in syllables_per_word if syllables > 2)

    return [
        complex_total / token_total * 100,
        complex_total,
        syllable_total / token_total,
    ]

In [21]:
def personal_pronouns(output_text_file):
    """
    Count the personal pronouns "I", "we", "my", "ours" and "us" in a text file.

    Matching is case-insensitive and restricted to whole words. The all-caps
    spelling "US" is skipped because it is taken to be the country name, not
    the pronoun; as a consequence an upper-case pronoun "US" (for example in an
    all-caps heading) is not counted either.

    Args:
    - output_text_file (str): The path of the text file (read as cp437,
      undecodable bytes ignored).

    Returns:
    - pronoun_count (int): The number of personal pronouns found; never negative.
    """
    with open(output_text_file, 'r', encoding="cp437", errors='ignore') as file:
        text = file.read()

    # "us" must be part of the pattern: previously it was never counted, yet
    # every "us"/"US" occurrence was subtracted from the total.
    pronoun_regex = re.compile(r'\b(?:I|we|my|ours|us)\b', re.IGNORECASE)

    # Count every match except the exact upper-case "US" (the country name).
    pronoun_count = sum(
        1 for match in pronoun_regex.finditer(text) if match.group() != 'US'
    )
    return pronoun_count

In [22]:
def calculate_average_word_length(tokens):
    """
    Compute the average word length: total characters / total number of tokens.

    Args:
    - tokens (list): The list of tokens.

    Returns:
    - average_word_length (float): The sum of the token lengths divided by the
      number of tokens, or 0 when there are no tokens.
    """
    character_total = sum(map(len, tokens))
    word_total = len(tokens)

    # An empty token list would divide by zero; report 0 instead.
    return character_total / word_total if word_total > 0 else 0

In [23]:
def calculate_average_sentence_length(output_text_file, tokens=None):
    """
    Compute the average sentence length: number of words / number of sentences.

    Args:
    - output_text_file (str): The path of the text file (read as cp437,
      undecodable bytes ignored).
    - tokens (list, optional): The word tokens of that text. When omitted, the
      text is tokenized here with ``word_tokenize``, so the function no longer
      depends on a module-level ``tokens`` variable being defined.

    Returns:
    - average_sentence_length (float): The number of tokens divided by the
      number of sentences found by ``nltk.sent_tokenize``, or 0 when the text
      contains no sentence.
    """
    # Read the content of the text file
    with open(output_text_file, 'r', encoding="cp437", errors='ignore') as file:
        text = file.read()

    if tokens is None:
        tokens = word_tokenize(text)

    sentences = nltk.sent_tokenize(text)

    # Avoid division by zero for an empty text.
    if len(sentences) > 0:
        return len(tokens) / len(sentences)
    return 0
    

In [24]:
def calculate_average_word_per_sentence(output_text_file, tokens=None):
    """
    Compute the average number of words per sentence: tokens / sentences.

    Args:
    - output_text_file (str): The path of the text file (read as cp437,
      undecodable bytes ignored).
    - tokens (list, optional): The word tokens of that text. When omitted, the
      text is tokenized here with ``word_tokenize``, so the function no longer
      depends on a module-level ``tokens`` variable being defined.

    Returns:
    - average_word_per_sentence (float): The number of tokens divided by the
      number of sentences found by ``nltk.sent_tokenize``, or 0 when the text
      contains no sentence.
    """
    average_word_per_sentence = 0

    # Read the content of the text file
    with open(output_text_file, 'r', encoding="cp437", errors='ignore') as file:
        text = file.read()

    if tokens is None:
        tokens = word_tokenize(text)

    sentences = nltk.sent_tokenize(text)
    if len(sentences) > 0:
        average_word_per_sentence = len(tokens) / len(sentences)

    # Return the value computed here (the previous code returned the unrelated
    # name average_sentence_length, which is not defined in this function).
    return average_word_per_sentence
    

In [25]:
def sentiment_analysis(file_path_positive,file_path_negative,filtered_tokens):
    """
    Score a list of tokens against a positive and a negative word dictionary.

    Tokens are compared with the dictionary words exactly as given
    (case-sensitive).

    Args:
    - file_path_positive (str): The text file holding the positive dictionary.
    - file_path_negative (str): The text file holding the negative dictionary.
    - filtered_tokens (list): The list of tokens to score.

    Returns:
    A tuple ``(positive_score, negative_score, polarity_score, subjectivity_score)``:
    - positive_score (int): +1 for each token found in the positive dictionary.
    - negative_score (int): -1 for each token found in the negative dictionary,
      i.e. zero or a negative number.
    - polarity_score (float): (P - N) / ((P + N) + 0.000001), where P and N are
      the numbers of positive and negative tokens. Ranges from -1 to +1.
    - subjectivity_score (float): (P + N) / (total tokens + 0.000001).
      Ranges from 0 to 1.
    """
    # Load each dictionary into a set so that every lookup is O(1).
    with open(file_path_positive, 'r') as file:
        positive_words = set(word_tokenize(file.read()))

    with open(file_path_negative, 'r') as file:
        negative_words = set(word_tokenize(file.read()))

    # Count the tokens found in each dictionary.
    positive_hits = 0
    negative_hits = 0
    for token in filtered_tokens:
        if token in positive_words:
            positive_hits += 1
        if token in negative_words:
            negative_hits += 1

    positive_score = positive_hits
    negative_score = -negative_hits

    # Both formulas need the magnitude of the negative score. Using the signed
    # value turned P - N into P + N (and vice versa), which produced polarity
    # values outside [-1, 1] and a near-zero denominator whenever P equalled N.
    polarity_score = (positive_hits - negative_hits) / ((positive_hits + negative_hits) + 0.000001)
    subjectivity_score = (positive_hits + negative_hits) / (len(filtered_tokens) + 0.000001)

    return(positive_score,negative_score,polarity_score,subjectivity_score)

In [30]:
#from openpyxl import load_workbook

if __name__ == "__main__":
    # Batch driver: for every (url_id, url) row of the input workbook, save the
    # article text, compute the text metrics and append one row to the output
    # workbook. URLs that cannot be fetched are recorded with all metrics at zero.

    #input file name storing url and url_id
    input_file_path="C:/archana/pythoncode1/20211030 Test Assignment/Input.xlsx"

    #destination folder name for storing text files
    folder_name="C:/archana/pythoncode1/20211030 Test Assignment/results"

    #folder storing stop-word files
    stopwords_folder_path = "C:/archana/pythoncode1/20211030 Test Assignment/StopWords"

    # Path to the text files containing positive and negative words
    file_path_positive = "C:/archana/pythoncode1/20211030 Test Assignment/MasterDictionary/positive-words.txt"
    file_path_negative = "C:/archana/pythoncode1/20211030 Test Assignment/MasterDictionary/negative-words.txt"

    #Path to the excel file to store output metrics
    output_file_path = "C:/archana/pythoncode1/20211030 Test Assignment/results/Output Data Structure.xlsx"

    #seconds to wait for a URL to answer before treating it as unavailable
    request_timeout = 30

    #column headers of the output workbook
    headers = ['URL_ID', 'URL','POSITIVE SCORE','NEGATIVE SCORE','POLARITY SCORE','SUBJECTIVITY SCORE','AVG SENTENCE LENGTH',
              'PERCENTAGE OF COMPLEX WORDS','FOG INDEX','AVG NUMBER OF WORDS PER SENTENCE','COMPLEX WORD COUNT','WORD COUNT',
              'SYLLABLE PER WORD','PERSONAL PRONOUNS','AVG WORD LENGTH']

    # Make sure the results folder exists before any file is written into it
    os.makedirs(folder_name, exist_ok=True)

    # The stop words are the same for every URL: load them once, and keep a set
    # so that filtering the tokens does not scan the whole list for each token
    stop_words_list=get_stopwords(stopwords_folder_path)
    stop_words_lookup=set(stop_words_list)

    # Create the output workbook with its header row if it does not exist yet,
    # then load it once; it is saved after every record
    if not os.path.exists(output_file_path):
        create_excel_file(output_file_path, headers)
    wb = load_workbook(output_file_path)

    # Load the input workbook and select its active worksheet
    workbook = load_workbook(input_file_path)
    worksheet = workbook.active

    # Iterate over rows and read 'url_id' and 'url' from each row
    for row in worksheet.iter_rows(min_row=2, values_only=True):  # Assuming the headers are in the first row
        url_id, url = row[0], row[1]

        # Skip blank rows: there is nothing to fetch
        if not url:
            continue

        #Initialize all output metrics to zero
        sentiment_score=[0,0,0,0]
        average_sentence_length=0
        syllable=[0,0,0]
        fog_index=0
        average_word_per_sentence=0
        word_count=0
        personal_pronoun_count=0
        average_word_length=0

        #text of the given url is written to the file named after url_id
        file_name = f"{url_id}.txt"
        output_text_file = os.path.join(folder_name, file_name)

        #check that the url exists; a network failure counts as "not available"
        try:
            response = requests.head(url, timeout=request_timeout)
            url_available = response.status_code == 200
        except requests.RequestException as error:
            print(f"Could not reach {url}: {error}")
            url_available = False

        if url_available:
            # Save the extracted article text from url to output_text_file
            try:
                create_text_file(url,output_text_file)
            except WebDriverException as error:
                # One page that cannot be scraped must not abort the whole batch
                print(f"Could not extract the article from {url}: {error}")
                url_available = False

        #compute metrics if the article text was saved
        if url_available:

            #tokenize text of output_text_file
            tokens=tokanize_text(output_text_file)

            #delete useless stop_words
            filtered_tokens = [token for token in tokens if token not in stop_words_lookup]

            #Sentiment analysis of filtered tokens
            sentiment_score=sentiment_analysis(file_path_positive,file_path_negative,filtered_tokens)

            #count personal pronouns in the output_text_file
            personal_pronoun_count=personal_pronouns(output_text_file)

            #count syllable and complex words
            syllable=sylabble_analysis(tokens)

            #count total cleaned words
            word_count=count_words(filtered_tokens)

            #calculate average word length
            average_word_length=calculate_average_word_length(tokens)

            #calculate average sentence length
            average_sentence_length=calculate_average_sentence_length(output_text_file)

            #calculate average word per sentence
            average_word_per_sentence=calculate_average_word_per_sentence(output_text_file)

            #calculate Fog Index
            fog_index = 0.4 * (average_sentence_length + syllable[0])

        #write output metrics to 'Output Data Structure.xlsx file'
        record = (url_id, url,sentiment_score[0],sentiment_score[1],sentiment_score[2],sentiment_score[3],average_sentence_length,
                 syllable[0],fog_index,average_word_per_sentence,syllable[1],word_count,syllable[2],personal_pronoun_count,
                  average_word_length)

        # Write the record to the Excel file
        write_record_to_excel(wb, record)

        # Save after every record so that finished rows survive a later failure
        wb.save(output_file_path)

    print("Record has been written to the Excel file.")


Record has been written to the Excel file.
