In [1]:
#imports
import os
import re
import time
import nltk
import requests
import pandas as pd
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize, sent_tokenize

In [3]:
file_path = 'Input.xlsx'
df = pd.read_excel(file_path)

In [5]:
df.head()

Unnamed: 0,URL_ID,URL
0,bctech2011,https://insights.blackcoffer.com/ml-and-ai-bas...
1,bctech2012,https://insights.blackcoffer.com/streamlined-i...
2,bctech2013,https://insights.blackcoffer.com/efficient-dat...
3,bctech2014,https://insights.blackcoffer.com/effective-man...
4,bctech2015,https://insights.blackcoffer.com/streamlined-t...


#### Extracting articles from the articles

In [9]:
def extract_article(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'lxml')
        
        # Extract the title
        title = soup.find('h1').get_text(strip=True) if soup.find('h1') else 'No Title Found'
        
        # Extract the article text
        paragraphs = soup.find_all('p')
        article_text = '\n'.join([para.get_text(strip=True) for para in paragraphs])
        
        return f"{title}\n\n{article_text}"
    except Exception as e:
        return str(e)

# Process each URL and save the content
for index, row in df.iterrows():
    start_time = time.time()
    url_id = row['URL_ID']
    url = row['URL']
    article_content = extract_article(url)
    
    # Save the article content to a text file
    with open(f'{url_id}.txt', 'w', encoding='utf-8') as file:
        file.write(article_content)

print("Articles have been successfully extracted and saved.")
time_difference = time.time() - start_time
print(f'Scrapping time: %.2f seconds.' % time_difference) 

Articles have been successfully extracted and saved.
Scrapping time: 2.94 seconds.


#### Step 1: Sentiment Analysis

##### 1.1 Cleaning using Stop Words lists

Start by loading the stopwords and cleaning the text.

In [14]:
def load_stopwords(file_paths):
    """This function loads the list of Stop Words."""
    stopwords = set()
    for file_path in file_paths:
        with open(file_path, 'r') as file:
            words = file.read().splitlines()
            stopwords.update(word.strip().lower() for word in words if word.strip())
    return stopwords

##### 1.2 Modify the Text Cleaning Function

Use the combined of custom stop words set for cleaning the text.

In [65]:
def clean_text(text, stop_words):
    tokens = word_tokenize(text.lower())
    cleaned_tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return cleaned_tokens

def process_file(file_path, stop_words):
    with open(file_path, 'r',encoding="utf8") as file:
        text = file.read()
    cleaned_tokens = clean_text(text, stop_words)
    return cleaned_tokens, text

##### 1.3 Creating a Dictionary of Positive and Negative Words

Load your positive and negative words dictionary.

In [24]:
def load_dictionary(file_path):
    with open(file_path, 'r') as file:
        words = file.read().splitlines()
    return set(words)
positive_words = load_dictionary('MasterDictionary/positive-words.txt')
negative_words = load_dictionary('MasterDictionary/negative-words.txt')

##### 1.4 Extracting Derived Variables

In [27]:
def calculate_sentiment_scores(tokens, positive_words, negative_words):
    positive_score = sum(1 for word in tokens if word in positive_words)
    negative_score = sum(1 for word in tokens if word in negative_words)

    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (len(tokens) + 0.000001)

    return positive_score, negative_score, polarity_score, subjectivity_score

#### Step 2: Analysis of Readability

Gunning Fog Index Calculation

In [31]:
def gunning_fog_index(text):
    sentences = nltk.sent_tokenize(text)
    words = word_tokenize(text)
    
    complex_words = [word for word in words if len([char for char in word if char.lower() in 'aeiou']) > 2]
    
    average_sentence_length = len(words) / len(sentences)
    percentage_complex_words = len(complex_words) / len(words)
    
    fog_index = 0.4 * (average_sentence_length + percentage_complex_words)
    
    return fog_index  

#### Step 3: Average Number of Words Per Sentence

In [34]:
def average_words_per_sentence(tokens):
    total_chars = sum(len(word) for word in tokens if word.isalpha())
    return total_chars / len(tokens) if tokens else 0

#### Step 4: Complex Word Count

In [37]:
def count_complex_words(tokens):
    complex_words = [word for word in tokens if len([char for char in word if char.lower() in 'aeiou']) > 2]
    return len(complex_words)

#### Step 5: Word Count

In [40]:
def word_count(text, stop_words):
    tokens = clean_text(text, stop_words)
    return len(tokens)

#### Step 6: Syllable Count Per Word

In [43]:
def syllable_count(word):
    word = [x.lower() for x in word]
    syllables = len([char for char in word if char in 'aeiou'])
    for x in word:
        if x.endswith(('es', 'ed')):
            syllables -= 1
        return max(1, syllables)

#### Step 7: Personal Pronouns Count

In [46]:
def count_personal_pronouns(text):
    pronouns = re.findall(r'\b(I|we|my|ours|us)\b', text, re.I)
    return len(pronouns)

#### Step 8: Average Word Length

In [49]:
def average_word_length(tokens):
    total_chars = sum(len(word) for word in tokens if word.isalpha())
    return total_chars / len(tokens) if tokens else 0

#### Step 9: Average length of Sentence

In [52]:
def average_sentence_length(text):
    # Split the text into sentences
    sentences = sent_tokenize(text)
    
    # Split each sentence into words and count them
    sentence_lengths = [len(word_tokenize(sentence)) for sentence in sentences]
    
    # Calculate the average sentence length
    if len(sentence_lengths) > 0:
        avg_sentence_length = sum(sentence_lengths) / len(sentence_lengths)
    else:
        avg_sentence_length = 0
    
    return avg_sentence_length

#### Step 10: Analysis of the files

In [55]:
def analyze_file(file_path, stop_words,text_id):
    # Process the file and get cleaned tokens and original text
    cleaned_tokens, original_text = process_file(file_path, stop_words)
    
    # Calculate sentiment analysis scores
    positive_score, negative_score, polarity_score, subjectivity_score = calculate_sentiment_scores(cleaned_tokens, positive_words, negative_words)
    
    # Calculate readability metrics
    fog_index = gunning_fog_index(original_text)
    average_words = average_words_per_sentence(original_text)
    complex_word_count = count_complex_words(cleaned_tokens)
    syllable_per_word = syllable_count(cleaned_tokens)
    avg_sent_len = average_sentence_length(original_text)
    
    # Calculate other metrics
    total_words = len(cleaned_tokens)
    pronoun_count = count_personal_pronouns(original_text)
    avg_word_length = average_word_length(cleaned_tokens)
    perc_complex_words = complex_word_count/total_words
    
    # Create a dictionary with all the metrics
    result = {
        'Text ID': text_id,
        'Positive Score': positive_score,
        'Negative Score': negative_score,
        'Polarity Score': polarity_score,
        'Subjectivity Score': subjectivity_score,
        'Avg Sentence Length':avg_sent_len,
        'Percentage of Complex Words':perc_complex_words,
        'Fog Index': fog_index,
        'Avg Number Words per Sentence': average_words,
        'Complex Word Count': complex_word_count,
        'Word Count': total_words,
        'Syllable per Word':syllable_per_word,
        'Personal Pronouns': pronoun_count,
        'Average Word Length': avg_word_length
    }
    
    # Convert to DataFrame to facilitate appending to Excel
    result_df = pd.DataFrame([result])
    
    return result_df

In [69]:
def main():
    stopword_files = [
    r'StopWords\StopWords_Auditor.txt',
    r'StopWords\StopWords_Currencies.txt',
    r'StopWords\StopWords_DatesandNumbers.txt',
    r'StopWords\StopWords_Generic.txt',
    r'StopWords\StopWords_GenericLong.txt',
    r'StopWords\StopWords_Geographic.txt',
    r'StopWords\StopWords_Names.txt'
    ]
    custom_stopwords = load_stopwords(stopword_files)

    # List of files to process
    text_files = [
        'bctech2011.txt','bctech2012.txt','bctech2013.txt','bctech2014.txt','bctech2015.txt','bctech2016.txt','bctech2017.txt','bctech2018.txt',
        'bctech2019.txt','bctech2020.txt','bctech2021.txt','bctech2022.txt','bctech2023.txt','bctech2024.txt','bctech2025.txt','bctech2026.txt',
        'bctech2027.txt','bctech2028.txt','bctech2029.txt','bctech2030.txt','bctech2031.txt','bctech2032.txt','bctech2033.txt','bctech2034.txt',
        'bctech2035.txt','bctech2036.txt','bctech2037.txt','bctech2038.txt','bctech2039.txt','bctech2040.txt','bctech2041.txt','bctech2042.txt',
        'bctech2043.txt','bctech2044.txt','bctech2045.txt','bctech2046.txt','bctech2047.txt','bctech2048.txt','bctech2049.txt','bctech2050.txt',
        'bctech2051.txt','bctech2052.txt','bctech2053.txt','bctech2054.txt','bctech2055.txt','bctech2056.txt','bctech2057.txt','bctech2058.txt',
        'bctech2059.txt','bctech2060.txt','bctech2061.txt','bctech2062.txt','bctech2063.txt','bctech2064.txt','bctech2065.txt','bctech2066.txt',
        'bctech2067.txt','bctech2068.txt','bctech2069.txt','bctech2070.txt','bctech2071.txt','bctech2072.txt','bctech2073.txt','bctech2074.txt',
        'bctech2075.txt','bctech2076.txt','bctech2077.txt','bctech2078.txt','bctech2079.txt','bctech2080.txt','bctech2081.txt','bctech2082.txt',
        'bctech2083.txt','bctech2084.txt','bctech2085.txt','bctech2086.txt','bctech2087.txt','bctech2088.txt','bctech2089.txt','bctech2090.txt',
        'bctech2091.txt','bctech2092.txt','bctech2093.txt','bctech2094.txt','bctech2095.txt','bctech2096.txt','bctech2097.txt','bctech2098.txt',
        'bctech2099.txt','bctech2100.txt','bctech2101.txt','bctech2102.txt','bctech2103.txt','bctech2104.txt','bctech2105.txt','bctech2106.txt',
        'bctech2107.txt','bctech2108.txt','bctech2109.txt','bctech2110.txt','bctech2111.txt','bctech2112.txt','bctech2113.txt','bctech2114.txt',
        'bctech2115.txt','bctech2116.txt','bctech2117.txt','bctech2118.txt','bctech2119.txt','bctech2120.txt','bctech2121.txt','bctech2122.txt',
        'bctech2123.txt','bctech2124.txt','bctech2125.txt','bctech2126.txt','bctech2127.txt','bctech2128.txt','bctech2129.txt','bctech2130.txt',
        'bctech2131.txt','bctech2132.txt','bctech2133.txt','bctech2134.txt','bctech2135.txt','bctech2136.txt','bctech2137.txt','bctech2138.txt',
        'bctech2139.txt','bctech2140.txt','bctech2141.txt','bctech2142.txt','bctech2143.txt','bctech2144.txt','bctech2145.txt','bctech2146.txt',
        'bctech2147.txt','bctech2148.txt','bctech2149.txt','bctech2150.txt','bctech2151.txt','bctech2152.txt','bctech2153.txt','bctech2154.txt',
        'bctech2155.txt','bctech2156.txt','bctech2157.txt']

    # Initialize an empty DataFrame to store results
    all_results_df = pd.DataFrame()

    # Process each file
    for idx, file_path in enumerate(text_files, start=1):
        result_df = analyze_file(file_path, custom_stopwords, text_id=idx)
        all_results_df = pd.concat([all_results_df, result_df], ignore_index=True)

    #Load the Outout file
    output_df = pd.read_excel('Output Data Structure.xlsx')
    output_df = output_df.loc[:, ['URL_ID', 'URL']]

    #Final df
    final_df = pd.concat([output_df,all_results_df],axis=1)
    final_df = final_df.drop('Text ID',axis=1)
    

    # Save all results to Excel
    output_file_path = 'Updated_Output_final.xlsx'
    with pd.ExcelWriter(output_file_path, engine='openpyxl') as writer:
        final_df.to_excel(writer, sheet_name='Results', index=False)

    print(f"All analysis results have been saved to {output_file_path}")

if __name__ == "__main__":
    main()

All analysis results have been saved to Updated_Output_final.xlsx
