#  **Data Extraction**

In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


# Read the Excel file listing the articles to download (columns URL_ID and URL)
df = pd.read_excel('/content/drive/MyDrive/20211030 Test Assignment/Input.xlsx')

# Use ONE session for the whole crawl so connections are actually reused
# between requests; creating a session per row gives no benefit.
with requests.Session() as session:
    # Iterate through each row and extract the article text
    for index, row in df.iterrows():
        url_id = row['URL_ID']
        url = row['URL']

        # Send a GET request to the URL. The timeout stops one unresponsive
        # host from hanging the crawl, and network failures skip the row
        # instead of aborting the loop. HTTP error statuses are deliberately
        # not raised: removed pages are reported below as "Title not found".
        try:
            response = session.get(url, timeout=30)
        except requests.RequestException as error:
            print(f"Request failed for URL_ID: {url_id} ({error})")
            continue

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the article title element
        title_element = soup.find('h1')
        if title_element is None:
            print(f"Title not found for URL_ID: {url_id}")
            continue
        # Strip the surrounding whitespace/newlines that the markup leaves
        # around the heading text.
        title = title_element.get_text().strip()

        # The article body lives in <p> tags under the .tagdiv-type container
        p_tags = soup.select('.tagdiv-type p')

        # Extract the text content of each <p> element
        paragraphs = [p.get_text() for p in p_tags]

        # Check if any paragraphs were extracted
        if not paragraphs:
            print(f"Article not found for URL_ID: {url_id}")
            continue

        # Create a text file named after the URL_ID and write the article.
        # The encoding is explicit because the content is not ASCII-only.
        file_name = f'{url_id}.txt'
        with open(file_name, 'w', encoding='utf-8') as file:
            # Write the title framed by bullet characters (U+2022)
            file.write(f'\u2022\u2022\u2022\u2022\u2022 {title} \u2022\u2022\u2022\u2022\u2022\n\n')

            # Write the article content, one paragraph per line
            file.write('\n'.join(paragraphs))


Title not found for URL_ID: 44
Title not found for URL_ID: 57
Title not found for URL_ID: 144


Further investigation showed that the three titles that were not found (URL_ID 44, 57 and 144) are missing because their pages have been removed.


# Data Analysis

In [6]:
# URL IDs whose pages had no title when they were scraped
url_ids_to_remove = [44, 57, 144]

# Drop those rows: build a boolean mask of the rows to discard, then keep
# every row for which the mask is False
has_missing_title = df['URL_ID'].isin(url_ids_to_remove)
df = df[~has_missing_title]


In [7]:
    import os
    # Define the path to the StopWords folder
    stopwords_folder = '/content/drive/MyDrive/20211030 Test Assignment/StopWords'

    # Stop words are stored lower-cased because the analysis cell looks tokens
    # up with `word.lower()`; entries kept in upper case would never match.
    stop_words = set()

    # Iterate through each file in the StopWords folder
    for file_name in os.listdir(stopwords_folder):
        file_path = os.path.join(stopwords_folder, file_name)

        # Skip sub-directories (or anything else that is not a regular file)
        if not os.path.isfile(file_path):
            continue

        with open(file_path, encoding='latin-1') as file:
            for line in file:
                # Some lines carry an annotation after a "|" separator; keep
                # only the part before it, then trim and normalise the case.
                word = line.split('|')[0].strip().lower()

                # Blank lines would otherwise add '' to the set
                if word:
                    stop_words.add(word)


In [9]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import re

MASTER_DICTIONARY_DIR = '/content/drive/MyDrive/20211030 Test Assignment/MasterDictionary'

# A token counts as a word when it consists only of word characters
WORD_PATTERN = re.compile(r'^\w+$')
# A run of consecutive vowels (including "y") approximates one syllable
VOWEL_GROUP_PATTERN = re.compile(r'[aeiouy]+')
PERSONAL_PRONOUNS = {'i', 'we', 'my', 'ours', 'us'}


def load_word_set(path):
    """Return the non-empty lines of the latin-1 text file at `path`, lower-cased, as a set."""
    with open(path, encoding='latin-1') as word_file:
        stripped_lines = (line.strip() for line in word_file)
        return {line.lower() for line in stripped_lines if line}


def count_syllables(word):
    """Estimate the number of syllables in `word` as its number of vowel groups.

    A trailing "es" or "ed" is not counted as a syllable of its own, so one is
    subtracted for such words (never going below one vowel group).
    """
    lowered = word.lower()
    syllables = len(VOWEL_GROUP_PATTERN.findall(lowered))
    if lowered.endswith(('es', 'ed')) and syllables > 1:
        syllables -= 1
    return syllables


def is_personal_pronoun(token):
    """Return True for i/we/my/ours/us in any case, except the all-caps "US".

    "US" is excluded because in running text it is normally the country
    abbreviation rather than the pronoun.
    """
    return token != 'US' and token.lower() in PERSONAL_PRONOUNS


# Load the positive and negative dictionaries once: they do not depend on the
# article, and load_word_set closes the files after reading them.
positive_words = load_word_set(f'{MASTER_DICTIONARY_DIR}/positive-words.txt')
negative_words = load_word_set(f'{MASTER_DICTIONARY_DIR}/negative-words.txt')

results = []
# Iterate through each row and perform the analysis
for index, row in df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    filename = f"{url_id}.txt"

    # Read the text from the file
    with open(filename, 'r', encoding='utf-8') as file:
        text = file.read()

    # Tokenise the raw text once and reuse the tokens for every metric
    tokens = word_tokenize(text)

    # SENTIMENT ANALYSIS
    # Cleaning using Stop Words Lists
    cleaned_text = ' '.join(token for token in tokens if token.lower() not in stop_words)
    cleaned_tokens = word_tokenize(cleaned_text)

    # Count the cleaned tokens found in the positive / negative dictionaries
    positive_score = sum(1 for token in cleaned_tokens if token.lower() in positive_words)
    negative_score = sum(1 for token in cleaned_tokens if token.lower() in negative_words)

    # Derived variables (the small constant avoids division by zero)
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (len(cleaned_tokens) + 0.000001)

    # ANALYSIS OF READABILITY
    sentences = sent_tokenize(text)
    words = [token for token in tokens if WORD_PATTERN.match(token)]
    word_count = len(words)
    sentence_count = len(sentences)

    # Every ratio below divides by one of these counts
    if word_count == 0 or sentence_count == 0:
        print(f"No analysable text for URL_ID: {url_id}")
        continue

    average_sentence_length = word_count / sentence_count

    # Complex words are words with more than two syllables (not more than
    # two characters)
    syllable_counts = [count_syllables(word) for word in words]
    complex_words = [word for word, syllables in zip(words, syllable_counts) if syllables > 2]
    # NOTE: stored as a fraction in [0, 1], not multiplied by 100
    percentage_of_complex_words = len(complex_words) / word_count
    fog_index = 0.4 * (average_sentence_length + percentage_of_complex_words)

    # Average Words per Sentence (same ratio as the average sentence length)
    average_words_per_sentence = word_count / sentence_count

    # Average number of syllables per word in the article
    syllables_per_word = sum(syllable_counts) / word_count

    # Personal Pronouns in the article
    personal_pronouns = sum(1 for token in tokens if is_personal_pronoun(token))

    # Average Word Length: characters of words only, so that the numerator
    # and the denominator cover the same tokens (punctuation excluded)
    average_word_length = sum(len(word) for word in words) / word_count

    # Store the analysis results in a dictionary
    analysis_result = {
        'URL_ID': url_id,
        'URL': url,
        'POSITIVE SCORE': positive_score,
        'NEGATIVE SCORE': negative_score,
        'POLARITY SCORE': polarity_score,
        'SUBJECTIVITY SCORE': subjectivity_score,
        'AVG SENTENCE LENGTH': average_sentence_length,
        'PERCENTAGE OF COMPLEX WORDS': percentage_of_complex_words,
        'FOG INDEX': fog_index,
        'AVG NUMBER OF WORDS PER SENTENCE': average_words_per_sentence,
        'COMPLEX WORD COUNT': len(complex_words),
        'WORD COUNT': word_count,
        'SYLLABLE PER WORD': syllables_per_word,
        'PERSONAL PRONOUNS': personal_pronouns,
        'AVG WORD LENGTH': average_word_length
    }

    # Append the result to the list
    results.append(analysis_result)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [11]:
# Build one table from the per-article result dictionaries (one row each)
output_df = pd.DataFrame(data=results)

# Write the table to an Excel workbook, leaving out the positional index column
output_df.to_excel(
    'Output Data Structure.xlsx',
    index=False,
)