In [1]:
# Import necessary packages
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
import re

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chatt\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chatt\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the input workbook that lists the articles to analyse

df = pd.read_excel(io="Input.xlsx")
df

Unnamed: 0,URL_ID,URL
0,Netclan20241017,https://insights.blackcoffer.com/ai-and-ml-bas...
1,Netclan20241018,https://insights.blackcoffer.com/enhancing-fro...
2,Netclan20241019,https://insights.blackcoffer.com/roas-dashboar...
3,Netclan20241020,https://insights.blackcoffer.com/efficient-pro...
4,Netclan20241021,https://insights.blackcoffer.com/development-o...
...,...,...
142,Netclan20241159,https://insights.blackcoffer.com/population-an...
143,Netclan20241160,https://insights.blackcoffer.com/google-lsa-ap...
144,Netclan20241161,https://insights.blackcoffer.com/healthcare-da...
145,Netclan20241162,https://insights.blackcoffer.com/budget-sales-...


In [4]:
# Load Stop words and Master Dictionary

# Folder containing stopwords files
stopwords_folder = 'StopWords'

# Load stopwords from all .txt files in the folder.
# Entries are lower-cased because clean_text() lower-cases every token before
# testing membership, so an entry stored in upper case could never match.
# NOTE(review): files are read with the platform default encoding; pass an
# explicit encoding= if the word lists contain non-ASCII characters.
stop_words = set()
for filename in os.listdir(stopwords_folder):
    if filename.endswith('.txt'):
        with open(os.path.join(stopwords_folder, filename), 'r') as file:
            for line in file.read().splitlines():
                # Presumably some lists annotate entries as "WORD | note" -
                # TODO confirm. Keeping only the part before "|" is harmless
                # otherwise: tokens produced by clean_text() contain letters
                # only, so a whole line containing "|" could never match.
                entry = line.split('|', 1)[0].strip().lower()
                if entry:
                    stop_words.add(entry)

# Read the sentiment dictionaries inside context managers so the file handles
# are closed deterministically, and lower-case the entries for the same reason
# as the stop words above.
with open('MasterDictionary/positive-words.txt') as file:
    positive_words = {word.lower() for word in file.read().split()}
with open('MasterDictionary/negative-words.txt') as file:
    negative_words = {word.lower() for word in file.read().split()}

In [5]:
# Defining the utility functions

def clean_text(text):
    """Return the lower-cased tokens of ``text`` with stop words removed.

    Every character that is neither an ASCII letter nor whitespace is deleted
    (not replaced by a space), the remainder is lower-cased and split with
    ``word_tokenize``, and tokens found in the module-level ``stop_words``
    set are dropped.

    Args:
        text: Raw text to clean.

    Returns:
        list[str]: Remaining tokens, in their original order.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(letters_only.lower())
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

Calculating the sentiment scores

In [6]:
def calculate_sentiment_scores(cleaned_words):
    """Score a token list against the positive and negative dictionaries.

    Args:
        cleaned_words: Sequence of tokens (as produced by ``clean_text``).

    Returns:
        tuple: ``(positive_score, negative_score, polarity_score,
        subjectivity_score)`` where the first two are counts of tokens found
        in ``positive_words`` / ``negative_words``, polarity is
        ``(pos - neg) / (pos + neg + 1e-6)`` and subjectivity is
        ``(pos + neg) / (len(cleaned_words) + 1e-6)``. The ``1e-6`` term keeps
        both divisions defined when a denominator would otherwise be zero.
    """
    positive_score = 0
    negative_score = 0
    for token in cleaned_words:
        if token in positive_words:
            positive_score += 1
        if token in negative_words:
            negative_score += 1

    matched = positive_score + negative_score
    polarity_score = (positive_score - negative_score) / (matched + 1e-6)
    subjectivity_score = matched / (len(cleaned_words) + 1e-6)
    return positive_score, negative_score, polarity_score, subjectivity_score

Analysis of Readability

In [7]:
from nltk.tokenize import sent_tokenize

def analyze_readability(text):
    """Compute sentence-length and complex-word readability metrics for ``text``.

    Args:
        text: Raw (uncleaned) text.

    Returns:
        tuple: ``(avg_sentence_length, percentage_complex_words, fog_index)``.
        ``avg_sentence_length`` is words per sentence,
        ``percentage_complex_words`` is the fraction (0-1) of words for which
        ``syllable_count`` returns more than 2, and ``fog_index`` is
        ``0.4 * (avg_sentence_length + percentage_complex_words)``.
        All three are ``0.0`` when the text contains no sentences or no words.
    """
    sentences = sent_tokenize(text)
    # word_tokenize emits punctuation marks as separate tokens; keep only the
    # tokens containing at least one letter or digit so that punctuation does
    # not inflate the word count (and thereby distort both ratios below).
    words = [
        token for token in word_tokenize(text)
        if any(char.isalnum() for char in token)
    ]
    num_sentences = len(sentences)
    num_words = len(words)

    # Empty or punctuation-only text would otherwise raise ZeroDivisionError.
    if num_sentences == 0 or num_words == 0:
        return 0.0, 0.0, 0.0

    num_complex_words = sum(1 for word in words if syllable_count(word) > 2)
    avg_sentence_length = num_words / num_sentences
    percentage_complex_words = num_complex_words / num_words
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    return avg_sentence_length, percentage_complex_words, fog_index


In [8]:
# Syllable Count Per Word

def syllable_count(word):
    word = word.lower()
    count = len(re.findall(r'[aeiouy]', word))
    if word.endswith(('es', 'ed')) and len(word) > 2:
        count -= 1
    return max(count, 1)



In [9]:
# Personal Pronouns

def personal_pronouns_count(text):
    """Count occurrences of the personal pronouns I, we, my, ours and us.

    Matching is case-insensitive and restricted to whole words. The exact
    all-caps token ``US`` is excluded so that the country abbreviation is not
    counted as the pronoun "us". The trade-off is that a genuine pronoun
    written in all capitals (e.g. "CONTACT US") is skipped as well.

    Args:
        text: Raw text to scan.

    Returns:
        int: Number of pronoun occurrences found.
    """
    matches = re.findall(r'\b(?:I|we|my|ours|us)\b', text, re.IGNORECASE)
    return sum(1 for match in matches if match != 'US')


In [10]:
# Average Word Length

def average_word_length(words):
    """Return the mean number of characters per word.

    Args:
        words: Sequence of strings.

    Returns:
        The total character count divided by the number of words, or ``0``
        when ``words`` is empty.
    """
    char_total = 0
    for token in words:
        char_total += len(token)
    if not words:
        return 0
    return char_total / len(words)


In [11]:
# Function to extract text from a url

def extract_text_from_url(url, timeout=30):
    """Download a page and return its first ``<h1>`` plus all ``<p>`` text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection, which would hang the whole batch.

    Returns:
        str | None: ``"<title> <paragraph> <paragraph> ..."`` with each piece
        stripped, or ``None`` when the response status is not 200 or any
        exception is raised while fetching/parsing (the error is printed).
    """
    try:
        page = requests.get(url, timeout=timeout)
        if page.status_code != 200:
            return None
        soup = BeautifulSoup(page.text, 'html.parser')
        # Look the heading up once instead of searching the document twice.
        heading = soup.find('h1')
        title = heading.text.strip() if heading is not None else ''
        paragraphs = [p.text.strip() for p in soup.find_all('p')]
        content = title + ' ' + ' '.join(paragraphs)
        return content
    except Exception as e:
        # Best effort: report the failure and let the caller skip this URL.
        print(f"Error fetching URL {url}: {e}")
        return None
    
    

In [12]:
# Main function to process a URL and return analysis

def process_url(url):
    """Fetch one article and compute its sentiment and readability metrics.

    Args:
        url: Address of the article to analyse.

    Returns:
        dict | None: One record keyed by output column name, or ``None`` when
        no text could be extracted. ``'URL_ID'`` and ``'URL'`` are left as
        ``None`` placeholders for the caller to fill in.
    """
    text = extract_text_from_url(url)
    # The extractor joins title and paragraphs with spaces, so a page without
    # any <h1>/<p> text yields a whitespace-only (truthy) string; treat that
    # the same as a failed fetch instead of analysing empty content.
    if not text or not text.strip():
        return None

    cleaned_words = clean_text(text)
    positive_score, negative_score, polarity_score, subjectivity_score = calculate_sentiment_scores(cleaned_words)
    avg_sentence_length, percentage_complex_words, fog_index = analyze_readability(text)
    num_words = len(cleaned_words)
    num_complex_words = sum(1 for word in cleaned_words if syllable_count(word) > 2)
    total_syllables = sum(syllable_count(word) for word in cleaned_words)
    # Every token may have been removed as a stop word; avoid dividing by zero.
    syllables_per_word = total_syllables / num_words if num_words else 0
    pronoun_count = personal_pronouns_count(text)
    avg_word_length = average_word_length(cleaned_words)

    return {
        'URL_ID': None,
        'URL': None,
        'Positive Score': positive_score,
        'Negative Score': negative_score,
        'Polarity Score': polarity_score,
        'Subjectivity Score': subjectivity_score,
        'Average Sentence Length': avg_sentence_length,
        'Percentage of Complex Words': percentage_complex_words,
        'Fog Index': fog_index,
        # Same value as 'Average Sentence Length', emitted under a second header.
        'AVG NUMBER OF WORDS PER SENTENCE': avg_sentence_length,
        'COMPLEX WORD COUNT': num_complex_words,
        'Word Count': num_words,
        'Syllable Count Per Word': syllables_per_word,
        'Personal Pronouns': pronoun_count,
        'Average Word Length': avg_word_length
    }



In [13]:
# Read the input DataFrame and process URLs
def analyze_urls(input_df, output_file):
    """Analyse every URL in ``input_df`` and write the metrics to an Excel file.

    Args:
        input_df: DataFrame with ``'URL_ID'`` and ``'URL'`` columns.
        output_file: Path of the Excel workbook to write.

    Returns:
        None. Progress and failures are printed; URLs for which
        ``process_url`` returns a falsy result are skipped.
    """
    results = []

    for _, row in input_df.iterrows():
        url_id = row['URL_ID']
        url = row['URL']
        print(f"Processing URL_ID {url_id}: {url}")

        analysis = process_url(url)
        if analysis:
            analysis['URL_ID'] = url_id
            analysis['URL'] = url
            results.append(analysis)
        else:
            print(f"Failed to process URL_ID {url_id}: {url}")

    column_order = ['URL_ID', 'URL', 'Positive Score', 'Negative Score', 'Polarity Score', 'Subjectivity Score',
                    'Average Sentence Length', 'Percentage of Complex Words', 'Fog Index',
                    'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT',
                    'Word Count', 'Syllable Count Per Word', 'Personal Pronouns', 'Average Word Length']

    # Build the frame with the explicit column order: this guarantees the sheet
    # layout regardless of dict key order, and still writes the header row when
    # no URL could be processed.
    results_df = pd.DataFrame(results, columns=column_order)

    results_df.to_excel(output_file, index=False)


In [15]:
if __name__ == "__main__":

    # Workbook that receives one row of metrics per successfully processed URL
    output_excel = "Output Data Structure.xlsx"

    # Run the analysis over every row of the input DataFrame loaded earlier
    analyze_urls(input_df=df, output_file=output_excel)


Processing URL_ID Netclan20241017: https://insights.blackcoffer.com/ai-and-ml-based-youtube-analytics-and-content-creation-tool-for-optimizing-subscriber-engagement-and-content-strategy/
Processing URL_ID Netclan20241018: https://insights.blackcoffer.com/enhancing-front-end-features-and-functionality-for-improved-user-experience-and-dashboard-accuracy-in-partner-hospital-application/
Processing URL_ID Netclan20241019: https://insights.blackcoffer.com/roas-dashboard-for-campaign-wise-google-ads-budget-tracking-using-google-ads-ap/
Processing URL_ID Netclan20241020: https://insights.blackcoffer.com/efficient-processing-and-analysis-of-financial-data-from-pdf-files-addressing-formatting-inconsistencies-and-ensuring-data-integrity-for-a-toyota-dealership-management-firm/
Processing URL_ID Netclan20241021: https://insights.blackcoffer.com/development-of-ea-robot-for-automated-trading/
Processing URL_ID Netclan20241022: https://insights.blackcoffer.com/ai-and-ml-based-youtube-analytics-and-c