In [1]:
# Import necessary packages
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re

In [2]:
# Download NLTK resources
# 'punkt' provides the tokenizer data used by word_tokenize and 'stopwords' the corpus read
# through stopwords.words('english'). Packages that are already present are reported as
# up-to-date rather than downloaded again.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\AFZAL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AFZAL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Read the URL file into a pandas DataFrame
# The scraping loop further down reads the 'URL' and 'URL_ID' columns of every row.
df = pd.read_excel('Input.xlsx')

In [5]:
# Ensure the output directory exists
# output_dir is relative to the current working directory; the scraping loop writes one
# "<URL_ID>.txt" file into it. exist_ok=True makes re-running this cell harmless.
output_dir = 'TitleText'
os.makedirs(output_dir, exist_ok=True)

In [29]:
# Track the URLs that failed during scraping
# Holds URL_ID values (not the URLs themselves); it is used later to drop the matching
# rows from the output table.
failed_urls = []

In [30]:
# Loop through each row in the DataFrame
# For every (URL_ID, URL) pair: download the page, take the first <h1> as the title and the
# text of all <p> tags as the article body, then save both to "<output_dir>/<URL_ID>.txt".
# A URL_ID whose page cannot be fetched or parsed is reported, recorded in failed_urls and
# skipped, so one bad page never stops the run.

# The request header never changes, so build it once instead of on every iteration
header = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"}

# Upper bound (seconds) for a single request so an unresponsive server cannot hang the loop
REQUEST_TIMEOUT_SECONDS = 30

for index, row in df.iterrows():
    url = row['URL']
    url_id = row['URL_ID']

    # Make a request to the URL
    try:
        response = requests.get(url, headers=header, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
    except requests.RequestException:
        # RequestException is the base class of connection, timeout, URL and HTTP status
        # errors; unlike a bare "except:" it lets KeyboardInterrupt stop the loop.
        print(f"Can't get response for URL_ID {url_id}")
        failed_urls.append(url_id)
        continue

    # Create a BeautifulSoup object
    try:
        soup = BeautifulSoup(response.content, 'html.parser')
    except Exception:
        print(f"Can't parse page for URL_ID {url_id}")
        failed_urls.append(url_id)
        continue

    # Find title
    # soup.find returns None when the page has no <h1>; check for that explicitly instead
    # of relying on the AttributeError that calling .get_text() on None would raise.
    heading = soup.find('h1')
    if heading is None:
        print(f"Can't get title for URL_ID {url_id}")
        failed_urls.append(url_id)
        continue
    title = heading.get_text()

    # Find text
    try:
        # Join paragraphs with a newline so the last word of one paragraph is not fused
        # with the first word of the next one when the text is tokenised later.
        article = "\n".join(p.get_text() for p in soup.find_all('p'))
    except Exception:
        print(f"Can't get text for URL_ID {url_id}")
        failed_urls.append(url_id)
        continue

    # Write title and text to the file
    file_name = f'{output_dir}/{url_id}.txt'
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(title + '\n' + article)



Can't get response for URL_ID blackassign0036
Can't get response for URL_ID blackassign0049


In [31]:
# Directories
# text_dir is listed and read by the analysis cells below, stopwords_dir holds the stop-word
# list files and sentiment_dir the positive/negative word lists.
# NOTE(review): these are absolute paths on one specific machine, so the cells below only run
# where this exact folder layout exists. text_dir presumably points at the same folder the
# scraping loop writes to through the relative output_dir ('TitleText') - confirm, and
# consider deriving all three from one configurable base directory.
text_dir = "C:/Users/AFZAL/Downloads/20211030 Test Assignment/TitleText"
stopwords_dir = "C:/Users/AFZAL/Downloads/20211030 Test Assignment/StopWords"
sentiment_dir = "C:/Users/AFZAL/Downloads/20211030 Test Assignment/MasterDictionary"

In [33]:
# Load all stop words from the stopwords directory and store in a set
# Entries are stored lower-cased because the lookup that uses this set compares against
# word.lower(); an entry kept in upper or mixed case could never match any token.
stop_words = set()
for file in os.listdir(stopwords_dir):
    with open(os.path.join(stopwords_dir, file), 'r', encoding='ISO-8859-1') as f:
        for line in f.read().splitlines():
            # NOTE(review): some stop-word lists annotate entries as "WORD | comment".
            # Keeping only the text before the first '|' is a no-op for plain one-word
            # lines - confirm against the actual list files.
            entry = line.split('|', 1)[0].strip().lower()
            if entry:
                stop_words.add(entry)

In [34]:
# Build `docs`: one list of stop-word-free tokens per article file found in text_dir
# NOTE(review): os.listdir returns entries in an unspecified order, while later cells assign
# the per-file results to output rows by position - confirm the two orders agree.
docs = []
for text_file in os.listdir(text_dir):
    article_path = os.path.join(text_dir, text_file)
    with open(article_path, 'r', encoding='utf-8') as handle:
        # Split the raw article into NLTK word tokens
        tokens = word_tokenize(handle.read())
    # Keep only the tokens whose lower-cased form is not a stop word
    docs.append([token for token in tokens if token.lower() not in stop_words])

In [35]:
# Read the positive and negative word lists from the sentiment directory into two sets
pos = set()
neg = set()

# Map each dictionary file name to the set its lines belong in; other files are ignored
word_lists = {'positive-words.txt': pos, 'negative-words.txt': neg}

for file in os.listdir(sentiment_dir):
    target = word_lists.get(file)
    if target is None:
        continue
    with open(os.path.join(sentiment_dir, file), 'r', encoding='ISO-8859-1') as f:
        target.update(f.read().splitlines())

In [36]:
# Per-document accumulators: matched words, their counts and the derived scores
positive_words, negative_words = [], []
positive_score, negative_score = [], []
polarity_score, subjectivity_score = [], []

In [37]:
# Score every document against the positive and negative word sets
for doc in docs:
    pos_hits = [token for token in doc if token.lower() in pos]
    neg_hits = [token for token in doc if token.lower() in neg]
    n_pos, n_neg = len(pos_hits), len(neg_hits)

    positive_words.append(pos_hits)
    negative_words.append(neg_hits)
    positive_score.append(n_pos)
    negative_score.append(n_neg)

    # The 0.000001 term keeps both ratios defined when the denominator would otherwise be 0
    polarity_score.append((n_pos - n_neg) / ((n_pos + n_neg) + 0.000001))
    subjectivity_score.append((n_pos + n_neg) / (len(doc) + 0.000001))

In [38]:
# Functions to calculate various readability metrics
def _count_vowels(word):
    """Return how many letters of ``word`` are vowels (a, e, i, o, u; case-insensitive)."""
    return sum(1 for letter in word if letter.lower() in 'aeiou')


def measure(file):
    """Compute readability metrics for one article file stored under ``text_dir``.

    Punctuation other than periods is removed first. Sentences are the non-empty
    segments between periods. Words are the whitespace-separated tokens of the
    text with periods removed, minus English stop words (the same tokenisation
    ``cleaned_words`` uses). A word is "complex" when it has more than two
    vowels; the vowel count (after dropping a trailing "es"/"ed") is used as the
    syllable estimate.

    Args:
        file: Name of the text file inside ``text_dir``.

    Returns:
        tuple: ``(avg_sentence_len, percent_complex_words, fog_index,
        complex_word_count, avg_syllable_word_count)``. ``percent_complex_words``
        is a fraction between 0 and 1. When the file has no words left after
        filtering, every value is zero instead of raising ZeroDivisionError.
    """
    with open(os.path.join(text_dir, file), 'r', encoding='utf-8') as f:
        text = f.read()

    # Keep periods for now: they are the only sentence delimiter used below
    text = re.sub(r'[^\w\s.]', '', text)

    # Count only segments that contain text; a plain split('.') would also count the empty
    # piece after the final period and the pieces between consecutive periods.
    num_sentences = sum(1 for sentence in text.split('.') if sentence.strip())

    # Build the stop-word set once; calling stopwords.words() for every token is needlessly slow
    english_stop_words = set(stopwords.words('english'))

    # Drop periods before splitting so "the." is recognised as a stop word and "houses."
    # still ends with "es" for the suffix rule below.
    words = [word for word in text.replace('.', '').split() if word.lower() not in english_stop_words]
    num_words = len(words)
    if num_words == 0:
        return 0.0, 0.0, 0.0, 0, 0.0

    complex_words = [word for word in words if _count_vowels(word) > 2]

    syllable_count = 0
    syllable_word_total = 0
    for word in words:
        # A trailing "es"/"ed" is dropped before counting vowels
        if word.endswith(('es', 'ed')):
            word = word[:-2]
        vowels = _count_vowels(word)
        if vowels >= 1:
            syllable_word_total += 1
            syllable_count += vowels

    # num_sentences is at least 1 here: any remaining word lies in a non-empty segment
    avg_sentence_len = num_words / num_sentences
    syllables_per_word = syllable_count / syllable_word_total if syllable_word_total else 0.0
    percent_complex_words = len(complex_words) / num_words
    fog = 0.4 * (avg_sentence_len + percent_complex_words)

    return avg_sentence_len, percent_complex_words, fog, len(complex_words), syllables_per_word

# Per-file accumulators for the five values returned by measure()
avg_sentence_length, percentage_of_complex_words = [], []
fog_index, complex_word_count = [], []
avg_syllable_word_count = []

In [39]:
# Run measure() on every article file and fan its five results out into the metric lists
for file in os.listdir(text_dir):
    sentence_len, complex_ratio, fog, n_complex, syllables_per_word = measure(file)
    avg_sentence_length.append(sentence_len)
    percentage_of_complex_words.append(complex_ratio)
    fog_index.append(fog)
    complex_word_count.append(n_complex)
    avg_syllable_word_count.append(syllables_per_word)

In [40]:
# Function to calculate word count and average word length
def cleaned_words(file):
    """Return the word count and average word length of one article file.

    All characters other than word characters and whitespace are removed, the
    text is split on whitespace and English stop words are dropped.

    Args:
        file: Name of the text file inside ``text_dir``.

    Returns:
        tuple: ``(word_count, average_word_length)``. When no words remain the
        result is ``(0, 0.0)`` instead of raising ZeroDivisionError.
    """
    with open(os.path.join(text_dir, file), 'r', encoding='utf-8') as f:
        text = f.read()

    text = re.sub(r'[^\w\s]', '', text)

    # Build the stop-word set once; calling stopwords.words() for every token is needlessly slow
    english_stop_words = set(stopwords.words('english'))
    words = [word for word in text.split() if word.lower() not in english_stop_words]
    if not words:
        return 0, 0.0

    length = sum(len(word) for word in words)
    return len(words), length / len(words)

# Per-file accumulators for the two values returned by cleaned_words()
word_count, average_word_length = [], []

In [41]:
# Collect the word count and mean word length of every article file
for file in os.listdir(text_dir):
    n_words, mean_word_len = cleaned_words(file)
    word_count.append(n_words)
    average_word_length.append(mean_word_len)


In [42]:
# Function to count personal pronouns
def count_personal_pronouns(file):
    """Count personal pronouns in one article file stored under ``text_dir``.

    The words "I", "we", "my", "ours" and "us" are matched as whole words
    regardless of capitalisation, so sentence-initial forms such as "We" or
    "My" are counted too. The all-caps token "US" is skipped so the country
    abbreviation is not mistaken for the pronoun "us".

    Args:
        file: Name of the text file inside ``text_dir``.

    Returns:
        int: Total number of personal-pronoun occurrences in the file.
    """
    with open(os.path.join(text_dir, file), 'r', encoding='utf-8') as f:
        text = f.read()

    personal_pronouns = ["I", "we", "my", "ours", "us"]
    # One whole-word alternation; \b on both sides keeps "bus" or "music" from matching
    pattern = re.compile(r"\b(?:" + "|".join(personal_pronouns) + r")\b", re.IGNORECASE)
    return sum(1 for match in pattern.finditer(text) if match.group(0) != "US")

# One personal-pronoun count per article file, appended in os.listdir(text_dir) order
pp_count = []

In [43]:
# Count the personal pronouns of every article file, one entry per file
for file in os.listdir(text_dir):
    pp_count.append(count_personal_pronouns(file))


In [44]:
# Load the output structure and populate it with the calculated values
# The sheet must contain a 'URL_ID' column (used to drop failed rows); the computed metrics
# are written by position into the columns from index 2 onwards.
output_df = pd.read_excel('Output Data Structure.xlsx')


In [46]:
# Remove rows corresponding to failed URLs
# .copy() makes the filtered result an independent DataFrame: its columns are assigned to
# further down, and writing into a frame derived by boolean indexing would otherwise raise
# pandas' SettingWithCopyWarning.
output_df = output_df[~output_df['URL_ID'].isin(failed_urls)].copy()


In [47]:
# List of variables to be added to the output DataFrame
# Order matters: entry i is written to column i + 2 of output_df by position.
# NOTE(review): avg_sentence_length appears twice (5th and 8th entries), so the same values
# end up in two output columns - presumably the template has two columns that share this
# definition; confirm against the output structure.
variables = [
    positive_score, negative_score, polarity_score, subjectivity_score,
    avg_sentence_length, percentage_of_complex_words, fog_index,
    avg_sentence_length, complex_word_count, word_count,
    avg_syllable_word_count, pp_count, average_word_length
]


In [48]:
# Every metric list must supply exactly one value per remaining output row; stop otherwise
for var in variables:
    if len(var) == len(output_df):
        continue
    raise ValueError(f"Length mismatch: variable length {len(var)} does not match DataFrame length {len(output_df)}")

In [49]:
# Fill the metric columns by position; they start at column 2, after the first two columns
for column_position, values in enumerate(variables, start=2):
    output_df.iloc[:, column_position] = values


In [50]:
# Save the DataFrame to disk
# index=False leaves the DataFrame index out of the file, so the CSV holds only the columns.
output_df.to_csv('Output_Data.csv', index=False)