## Outline of the program

## We will define various functions according to the task we want to perform.

### The tasks are as follows:


#### Step 1)  Importing the excel file containing the list of urls
#### Step 2)  Iterating through the URLs, scrape the main article text from the webpage at each URL.
#### Step 3)  Clean the scraped text by removing HTML tags.
#### Step 4)  Using Natural Language Processing, tokenize the text.
#### Step 5)  Remove the stopwords with the help of the given list of stopwords.
#### Step 6)  Perform sentiment analysis by comparing the text with the given master dictionary.
#### Step 7)  Calculate the score for the various parameters asked for.
#### Step 8)  Write the output one by one, in the given format, to an Excel sheet.

## Given below is the list of functions with their definitions.

In [3]:
import pandas as pd                                          # Importing pandas library
import requests                                              # Importing requests library
from bs4 import BeautifulSoup                               # Importing BeautifulSoup from bs4
import spacy                                                 # Importing spacy library
from textblob import TextBlob                               # Importing TextBlob from textblob
import syllapy                                              # Importing syllapy
import os


def url_links(file_path):
    """
    Load the spreadsheet that lists the URLs to process.

    Args:
    - file_path (str): Location of the Excel workbook holding the URLs.

    Returns:
    - pandas.DataFrame: The workbook contents exactly as pandas reads them.
    """
    urls_frame = pd.read_excel(file_path)    # Delegate all parsing to pandas
    return urls_frame


# Module-level locations of the assignment files (absolute paths on the F: drive).
file_path = 'F:/Blackcoffer assignment/input.xlsx'    # Excel workbook listing the URLs to analyse
positive_words = 'F:/Blackcoffer assignment/MasterDictionary-20230812T205219Z-001/MasterDictionary/positive-words.txt'    # Path to the positive-word list (a path, not the words themselves)
negative_words = 'F:/Blackcoffer assignment/MasterDictionary-20230812T205219Z-001/MasterDictionary/negative-words.txt'    # Path to the negative-word list (a path, not the words themselves)
output_file = 'F:/Blackcoffer assignment/output.xlsx'    # Excel workbook the results are written to



def merge_stopword_files(folder_path, All_stop_words_path):
    """
    Merges every '.txt' stop word file found in a folder into one combined file.

    Args:
    - folder_path (str): Folder containing the individual stop word files.
    - All_stop_words_path (str): Path of the combined file to (re)create.

    Returns:
    - None. The combined file is written to All_stop_words_path.
    """
    # The combined file may live inside folder_path and also ends in '.txt'.
    # It must never be treated as an input, otherwise a second run would read
    # the very file it is in the middle of rewriting.
    merged_file_key = os.path.normcase(os.path.abspath(All_stop_words_path))

    stopword_files = []
    for filename in sorted(os.listdir(folder_path)):    # Sorted so the merge order is reproducible
        if not filename.endswith('.txt'):               # Only text files are stop word lists
            continue
        candidate = os.path.join(folder_path, filename)
        if os.path.normcase(os.path.abspath(candidate)) == merged_file_key:
            continue                                    # Skip the output of a previous run
        stopword_files.append(candidate)

    # Copy raw bytes: the merge then works whatever encoding the lists use and
    # leaves decoding to whoever reads the combined file.
    with open(All_stop_words_path, 'wb') as merged_file:
        for source_path in stopword_files:
            with open(source_path, 'rb') as source_file:
                merged_file.write(source_file.read())
            # Newline separator so the last word of one file and the first
            # word of the next never end up on the same line.
            merged_file.write(b'\n')
# Folder that holds the individual stop word '.txt' files
folder_path = 'F:/Blackcoffer assignment/StopWords-20230812T205218Z-001/StopWords'
# Path of the combined stop word file.
# NOTE: this file sits inside folder_path itself and also ends in '.txt'.
All_stop_words_path = 'F:/Blackcoffer assignment/StopWords-20230812T205218Z-001/StopWords/AllStopWords.txt'
# Build the combined file. This call is at module level, so it runs every time
# this code is executed or imported, not only when main() is called.
merge_stopword_files(folder_path, All_stop_words_path)




def scrape_text(url, timeout=30):
    """
    Scrapes the article container(s) from a given URL.

    Args:
    - url (str): The URL of the webpage to scrape.
    - timeout (float): Seconds to wait for the server before giving up.
      Without a timeout a single unresponsive site would block the whole run.

    Returns:
    - list: List-like collection of the matching <div> elements (still HTML
      tags, not plain text). Empty when neither layout is found on the page.

    Raises:
    - requests.exceptions.RequestException: If the page cannot be fetched,
      including when the timeout expires.
    """
    # The HTTP status is deliberately not checked: an error page simply
    # contains none of the article containers and yields an empty result.
    page_html = requests.get(url, timeout=timeout).text
    soup = BeautifulSoup(page_html, 'lxml')                  # Parse the page with the lxml parser

    # Two page layouts are supported; try the primary article container first
    # and fall back to the alternative one only when nothing matched.
    article = soup.find_all('div', class_='td-post-content tagdiv-type')
    if not article:
        article = soup.find_all('div', class_='tdb-block-inner td-fix-index')
    return article






def clean_the_text(article):
    """
    Cleans the extracted content, i.e. removes the HTML tags and returns plain text.

    Args:
    - article (list): Elements scraped from the webpage; each one must offer
      a BeautifulSoup-style get_text(separator=..., strip=...) method.

    Returns:
    - str: Cleaned text on a single line ('' when article is empty).
    """
    block_texts = []
    for block in article:
        # A space separator keeps the text of neighbouring tags apart; with
        # the default empty separator "...end.</p><p>Next..." collapses into
        # "end.Next", which corrupts word and sentence counts later on.
        block_text = block.get_text(separator=' ', strip=True)
        if block_text:                                       # Skip blocks that hold no text at all
            block_texts.append(block_text)

    # Blocks are joined with a space for the same reason, and newlines are
    # replaced once at the end instead of on every loop iteration.
    cleaned_article = ' '.join(block_texts).replace('\n', ' ')
    return cleaned_article

_SPACY_MODEL = None    # Shared spaCy pipeline, loaded on first use


def _get_spacy_model():
    """Returns the shared 'en_core_web_lg' pipeline, loading it only once."""
    global _SPACY_MODEL
    if _SPACY_MODEL is None:
        _SPACY_MODEL = spacy.load('en_core_web_lg')
    return _SPACY_MODEL


def tokenize_article(cleaned_article, stopword_file):
    """
    Tokenizes the cleaned text content and drops the stop words.

    Args:
    - cleaned_article (str): Cleaned text content.
    - stopword_file (str): Path to the file containing stop words, one per line.

    Returns:
    - list: Lower-cased tokens that are not stop words. Punctuation tokens are
      kept, exactly as the spaCy tokenizer produces them.
    """
    with open(stopword_file, 'r') as stopwords_file:
        # Tokens are lower-cased before the comparison below, so the stop
        # words must be lower-cased too or upper-case entries could never
        # match. A set gives constant-time lookups.
        stopwords = {line.strip().lower() for line in stopwords_file.read().splitlines()}
    stopwords.discard('')                                    # Blank lines are not stop words

    # Loading the model is expensive; reuse one instance across all calls.
    nlp = _get_spacy_model()
    doc = nlp(cleaned_article)

    filtered_tokens = []
    for token in doc:
        lowered = token.text.lower()                         # Lower-case once per token
        if lowered not in stopwords:
            filtered_tokens.append(lowered)
    return filtered_tokens

def calculate_scores(filtered_tokens, positive_words_file, negative_words_file):
    """
    Computes the sentiment scores of a token list against two word lists.

    Args:
    - filtered_tokens (list): Tokens left after stop word removal.
    - positive_words_file (str): Path to the positive word list (one word per line).
    - negative_words_file (str): Path to the negative word list (one word per line).

    Returns:
    - tuple: (positive score, negative score, polarity score, subjectivity score).
    """
    # Load both lexicons, positive first, each as a set of its lines
    lexicons = []
    for lexicon_path in (positive_words_file, negative_words_file):
        with open(lexicon_path, 'r', encoding='latin-1') as lexicon_handle:
            lexicons.append(set(lexicon_handle.read().splitlines()))
    positive_lexicon, negative_lexicon = lexicons

    # Each score is the number of tokens found in the corresponding lexicon
    positive_score = len([token for token in filtered_tokens if token in positive_lexicon])
    negative_score = len([token for token in filtered_tokens if token in negative_lexicon])

    # The small constant keeps the division defined when no token matched
    score_difference = positive_score - negative_score
    score_total = positive_score + negative_score + 1e-10
    polarity_score = score_difference / score_total

    # Subjectivity is taken from TextBlob, run on the re-joined tokens
    rejoined_text = ' '.join(filtered_tokens)
    subjectivity_score = TextBlob(rejoined_text).sentiment.subjectivity

    return positive_score, negative_score, polarity_score, subjectivity_score

def analyze_text(filtered_tokens):
    """
    Computes readability statistics for a list of tokens.

    Args:
    - filtered_tokens (list): List of filtered tokens.

    Returns:
    - tuple: (average sentence length, fraction of complex words, FOG index,
      average number of words per sentence, complex word count, word count,
      average syllables per word, personal pronoun count, average word length).
      All nine values are 0 when filtered_tokens is empty.
    """
    # Check for empty input first: it avoids needless sentence detection and
    # every ratio below divides by the number of tokens.
    if not filtered_tokens:
        return 0, 0, 0, 0, 0, 0, 0, 0, 0

    personal_pronoun_set = {'i', 'me', 'my', 'mine', 'myself', 'we', 'us', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves'}

    word_count = len(filtered_tokens)
    num_sentences = len(TextBlob(' '.join(filtered_tokens)).sentences)
    # Guard the divisions below in case no sentence is detected at all
    # (for example when the tokens hold nothing but whitespace).
    num_sentences = max(num_sentences, 1)

    avg_sentence_length = word_count / num_sentences
    # A word counts as "complex" here when it is longer than six characters
    complex_word_count = sum(1 for word in filtered_tokens if len(word) > 6)
    # Despite the name this is a fraction between 0 and 1, not multiplied by 100
    percentage_complex_words = complex_word_count / word_count
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    # Same quantity as avg_sentence_length; reported under both names
    avg_num_words_per_sentence = avg_sentence_length
    syllables_per_word = sum(syllapy.count(word) for word in filtered_tokens) / word_count
    personal_pronouns = sum(1 for word in filtered_tokens if word.lower() in personal_pronoun_set)
    avg_word_length = sum(len(word) for word in filtered_tokens) / word_count

    return avg_sentence_length, percentage_complex_words, fog_index, avg_num_words_per_sentence, complex_word_count, word_count, syllables_per_word, personal_pronouns, avg_word_length

def process_urls(input_file_path, output_file_path, All_stop_words_path, positive_words_file, negative_words_file):
    """
    Processes every URL of the input workbook and saves the results to an output workbook.

    Args:
    - input_file_path (str): Path to the input Excel file; must have 'URL_ID' and 'URL' columns.
    - output_file_path (str): Path to save the output Excel file.
    - All_stop_words_path (str): Path to the file containing stop words.
    - positive_words_file (str): Path to the file containing positive words.
    - negative_words_file (str): Path to the file containing negative words.

    Returns:
    - None. One row per input URL is written to output_file_path.
    """
    output_columns = ['URL_ID', 'URL', 'Positive Score', 'Negative Score', 'Polarity Score', 'Subjectivity Score', 'Avg Sentence Length', 'Percentage of Complex Words', 'FOG Index', 'Avg Number of Words per Sentence', 'Complex Word Count', 'Word Count', 'Syllable per Word', 'Personal Pronouns', 'Avg Word Length']
    # Names of the nine values returned by analyze_text, in the order it returns them
    analysis_columns = output_columns[6:]

    input_df = url_links(input_file_path)    # Reading input Excel file

    # Rows are collected in a plain list and turned into a DataFrame once at
    # the end; growing a DataFrame with pd.concat inside the loop re-copies
    # all earlier rows on every iteration.
    rows = []
    for url_id, url in zip(input_df['URL_ID'], input_df['URL']):
        article_text = scrape_text(url)                                          # Scrape the article container(s)
        cleaned_text = clean_the_text(article_text)                              # Strip the HTML tags
        filtered_tokens = tokenize_article(cleaned_text, All_stop_words_path)    # Tokenize and drop stop words
        positive_score, negative_score, polarity_score, subjectivity_score = calculate_scores(filtered_tokens, positive_words_file, negative_words_file)
        analysis_results = analyze_text(filtered_tokens)

        row_data = {
            'URL_ID': url_id,
            'URL': url,
            'Positive Score': positive_score,
            'Negative Score': negative_score,
            'Polarity Score': polarity_score,
            'Subjectivity Score': subjectivity_score,
        }
        row_data.update(zip(analysis_columns, analysis_results))
        rows.append(row_data)

    # Passing the columns explicitly fixes their order and still yields the
    # header row when there were no URLs at all.
    output_df = pd.DataFrame(rows, columns=output_columns)
    output_df.to_excel(output_file_path, index=False)    # Save the results to the Excel file

In [None]:
def main():
    """
    Main function to execute the entire script.

    Defines the locations of the input, dictionary and output files and runs
    the processing of all URLs with them.
    """
    file_path = 'F:/Blackcoffer assignment/input.xlsx'    # Path to the input Excel file containing URLs
    All_stop_words_path = 'F:/Blackcoffer assignment/StopWords-20230812T205218Z-001/StopWords/AllStopWords.txt'    # Path to the file containing stop words
    positive_words_file = 'F:/Blackcoffer assignment/MasterDictionary-20230812T205219Z-001/MasterDictionary/positive-words.txt'    # Path to the file containing positive words
    negative_words_file = 'F:/Blackcoffer assignment/MasterDictionary-20230812T205219Z-001/MasterDictionary/negative-words.txt'    # Path to the file containing negative words
    output_file = 'F:/Blackcoffer assignment/output.xlsx'    # Path to save the output Excel file

    # Pass the paths defined above rather than same-valued module-level
    # globals, so this function does not depend on names defined elsewhere.
    process_urls(file_path, output_file, All_stop_words_path, positive_words_file, negative_words_file)


if __name__ == "__main__":
    main()    # Run the pipeline only when executed as a script
