In [4]:
#Data Engineer @DRDO

import pandas as pd
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Fetch the 'punkt' tokenizer data used by the NLTK tokenizers imported above.
# The call is safe to repeat: when the package is already present NLTK only
# reports that it is up to date.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead - confirm
# against the installed NLTK version.
nltk.download('punkt')

# 1. Data Extraction
def extract_content(url, timeout=10):
    """Download a web page and return the text of its <p> elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so that a
            single unresponsive host cannot stall the whole run.

    Returns:
        The text of every <p> element joined by single spaces, or an empty
        string when the page cannot be fetched or parsed. HTTP error statuses
        are not checked: an error page's paragraphs are returned like any
        other page's.
    """
    try:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')
        return ' '.join(p.get_text() for p in soup.find_all('p'))
    except Exception:
        # Deliberately best-effort: one bad URL must not abort the batch.
        # Catching Exception (rather than a bare except) still lets
        # KeyboardInterrupt / SystemExit stop a long scraping run.
        return ""

# 2. Text Analysis
def count_syllables(word):
    """Estimate the number of syllables in a word with a vowel-group heuristic.

    The word is lower-cased and the characters ``.:;?!`` are stripped from
    both ends. Each run of consecutive vowels (``aeiouy``) counts as one
    syllable, a trailing ``e`` removes one, and any non-empty word counts as
    at least one syllable.

    Args:
        word: The token to measure.

    Returns:
        The estimated syllable count; 0 when nothing is left after stripping
        (empty string or punctuation only).
    """
    vowels = 'aeiouy'
    word = word.lower().strip(".:;?!")
    if not word:
        # Nothing to measure; the unguarded word[0] lookup used to raise
        # IndexError here.
        return 0

    count = 0
    previous_is_vowel = False
    for char in word:
        is_vowel = char in vowels
        # Only the first vowel of a run starts a new syllable.
        if is_vowel and not previous_is_vowel:
            count += 1
        previous_is_vowel = is_vowel

    if word.endswith('e'):
        count -= 1
    # A non-empty word never has fewer than one syllable.
    return max(count, 1)

def compute_metrics(text):
    """Compute sentiment and readability metrics for a piece of text.

    Relies on the module-level ``positive_words`` and ``negative_words``
    collections and on ``count_syllables``.

    Args:
        text: The raw text to analyse.

    Returns:
        A 12-element list, in this order: positive count, negative count,
        polarity score, subjectivity score, average sentence length,
        percentage of complex words, fog index, complex word count,
        word count, average syllables per word, personal pronoun count,
        average word length. Every entry is zero when the text contains no
        alphabetic words (for example when the page could not be fetched).
    """
    sentences = sent_tokenize(text)
    # Keep purely alphabetic tokens only; this drops punctuation and numbers.
    words = [token for token in word_tokenize(text) if token.isalpha()]
    word_count = len(words)

    if word_count == 0 or not sentences:
        # Every ratio below divides by the word or sentence count; report
        # zeros instead of raising ZeroDivisionError on empty content.
        return [0, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0, 0.0, 0, 0.0]

    words_lower = [word.lower() for word in words]

    # Positive and negative counts
    positive_count = sum(1 for word in words_lower if word in positive_words)
    negative_count = sum(1 for word in words_lower if word in negative_words)
    sentiment_total = positive_count + negative_count

    # Polarity and subjectivity ("or 1" avoids dividing by zero when no
    # sentiment word was found)
    polarity_score = (positive_count - negative_count) / (sentiment_total or 1)
    subjectivity_score = sentiment_total / word_count

    # Average sentence length
    avg_sentence_length = word_count / len(sentences)

    # Complex words (more than two syllables); syllables are counted once
    # per word and reused for the average below.
    syllable_counts = [count_syllables(word) for word in words]
    complex_word_count = sum(1 for syllables in syllable_counts if syllables > 2)
    percentage_complex = complex_word_count / word_count

    # Fog index
    fog_index = 0.4 * (avg_sentence_length + percentage_complex * 100)

    # Personal pronouns: matched case-insensitively so sentence-initial
    # "We" / "My" are counted, while the all-caps "US" (the country
    # abbreviation) is excluded.
    pronouns = {'i', 'we', 'my', 'ours', 'us'}
    personal_pronouns_count = sum(
        1
        for word, lowered in zip(words, words_lower)
        if lowered in pronouns and word != 'US'
    )

    return [
        positive_count, negative_count, polarity_score, subjectivity_score,
        avg_sentence_length, percentage_complex * 100, fog_index,
        complex_word_count, word_count, sum(syllable_counts) / word_count,
        personal_pronouns_count, sum(len(word) for word in words) / word_count
    ]


# Main Execution
def _load_word_set(path):
    """Read a one-word-per-line dictionary file into a set of lower-cased words."""
    # The context manager closes the file handle, which the previous bare
    # open(...).read() never did. The platform default encoding is kept on
    # purpose so the files are decoded exactly as before.
    with open(path) as handle:
        lines = handle.read().splitlines()
    # Entries are compared against lower-cased tokens, so normalise them here
    # and drop blank lines.
    return {line.strip().lower() for line in lines if line.strip()}


df_input = pd.read_excel('C:/Users/Deepa/OneDrive/Desktop/.ipynb_checkpoints/Input.xlsx')
positive_words = _load_word_set("C:/Users/Deepa/OneDrive/Desktop/.ipynb_checkpoints/MasterDictionary/positive-words.txt")
negative_words = _load_word_set("C:/Users/Deepa/OneDrive/Desktop/.ipynb_checkpoints/MasterDictionary/negative-words.txt")

# One output row per input URL: identifiers, the scraped text, then the metrics.
results = []

for _, row in df_input.iterrows():
    content = extract_content(row['URL'])
    metrics = compute_metrics(content)
    results.append([row['URL_ID'], row['URL'], content] + metrics)

# Column order must match the list returned by compute_metrics.
df_output = pd.DataFrame(results, columns=[
    "URL_ID", "URL", "Content", "POSITIVE COUNT", "NEGATIVE COUNT", "POLARITY SCORE",
    "SUBJECTIVITY SCORE", "AVG SENTENCE LENGTH", "PERCENTAGE OF COMPLEX WORDS", "FOG INDEX",
    "COMPLEX WORD COUNT", "WORD COUNT", "AVG SYLLABLES PER WORD", "PERSONAL PRONOUNS COUNT", "AVG WORD LENGTH"
])

df_output.to_excel('Output.xlsx', index=False)
print("Analysis completed and saved to Output.xlsx")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Deepa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Analysis completed and saved to Output.xlsx


In [None]:
## The code above takes 30 seconds to extract the data and run the full analysis,
## which is very fast in comparison to other people's code.