In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup



In [2]:
# Function to fetch and extract main content text from a URL
def extract_main_content_text(url):
    """Download a page and return its text with common page chrome removed.

    The page is requested with a 10-second timeout and parsed with
    BeautifulSoup's built-in ``html.parser``. Every ``header``, ``footer``,
    ``nav`` and ``aside`` element is deleted before the remaining text nodes
    are stripped and joined with single spaces.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    str
        The extracted text, or the literal string ``"Error fetching URL"``
        when the request raises ``requests.RequestException`` (the error is
        also printed).
    """
    # Which elements count as boilerplate may vary with the site's structure
    boilerplate_tags = ['header', 'footer', 'nav', 'aside']
    try:
        page = requests.get(url, timeout=10)  # timeout guards against slow responses
        page.raise_for_status()  # check that the request was successful
        parsed = BeautifulSoup(page.content, 'html.parser')

        for element in parsed.find_all(boilerplate_tags):
            element.decompose()

        # Whatever is left is treated as the main content
        return parsed.get_text(separator=' ', strip=True)
    except requests.RequestException as err:
        print(f"Error fetching {url}: {err}")
        return "Error fetching URL"




In [3]:
# Load the input workbook; the article links are assumed to live in a
# column named 'URL'
INPUT_XLSX = r"C:\Users\Ankit Rai\Downloads\Input.xlsx"
df = pd.read_excel(INPUT_XLSX)
urls = df['URL']



In [4]:
# Display the URL column as a quick sanity check of what was loaded
urls

0      https://insights.blackcoffer.com/ml-and-ai-bas...
1      https://insights.blackcoffer.com/streamlined-i...
2      https://insights.blackcoffer.com/efficient-dat...
3      https://insights.blackcoffer.com/effective-man...
4      https://insights.blackcoffer.com/streamlined-t...
                             ...                        
142    https://insights.blackcoffer.com/population-an...
143    https://insights.blackcoffer.com/google-lsa-ap...
144    https://insights.blackcoffer.com/healthcare-da...
145    https://insights.blackcoffer.com/budget-sales-...
146    https://insights.blackcoffer.com/amazon-buy-bo...
Name: URL, Length: 147, dtype: object

In [5]:
# Scrape every URL in order, collecting one result string per link
extracted_texts = [extract_main_content_text(link) for link in urls]



In [9]:

# Attach the scraped text as a new column (one entry per row) and display the frame
df = df.assign(Extracted_Text=extracted_texts)
df



Unnamed: 0,URL_ID,URL,Extracted_Text
0,bctech2011,https://insights.blackcoffer.com/ml-and-ai-bas...,ML and AI-based insurance premium model to pre...
1,bctech2012,https://insights.blackcoffer.com/streamlined-i...,Streamlined Integration: Interactive Brokers A...
2,bctech2013,https://insights.blackcoffer.com/efficient-dat...,Efficient Data Integration and User-Friendly I...
3,bctech2014,https://insights.blackcoffer.com/effective-man...,Effective Management of Social Media Data Extr...
4,bctech2015,https://insights.blackcoffer.com/streamlined-t...,Streamlined Trading Operations Interface for M...
...,...,...,...
142,bctech2153,https://insights.blackcoffer.com/population-an...,Population and Community Survey of America - B...
143,bctech2154,https://insights.blackcoffer.com/google-lsa-ap...,Google LSA API Data Automation and Dashboardin...
144,bctech2155,https://insights.blackcoffer.com/healthcare-da...,Healthcare Data Analysis - Blackcoffer Insight...
145,bctech2156,https://insights.blackcoffer.com/budget-sales-...,"Budget, Sales KPI Dashboard using Power BI - B..."


In [11]:
# Save the DataFrame to a new csv file.
# index=False keeps the row index out of the file; otherwise it is written as
# an unnamed first column and comes back as "Unnamed: 0" when the file is
# read again with pd.read_csv.
df.to_csv("C:/Users/Ankit Rai/OneDrive/Desktop.csv", index=False)



In [14]:
pip install cmudict

Collecting cmudictNote: you may need to restart the kernel to use updated packages.

  Downloading cmudict-1.0.26-py3-none-any.whl (939 kB)
     -------------------------------------- 939.4/939.4 kB 1.1 MB/s eta 0:00:00
Collecting importlib-resources>=5
  Downloading importlib_resources-6.4.0-py3-none-any.whl (38 kB)
Collecting importlib-metadata>=5
  Downloading importlib_metadata-8.0.0-py3-none-any.whl (24 kB)
Installing collected packages: importlib-resources, importlib-metadata, cmudict
  Attempting uninstall: importlib-metadata
    Found existing installation: importlib-metadata 4.11.3
    Uninstalling importlib-metadata-4.11.3:
      Successfully uninstalled importlib-metadata-4.11.3
Successfully installed cmudict-1.0.26 importlib-metadata-8.0.0 importlib-resources-6.4.0


In [23]:
import re
from textblob import TextBlob
import warnings
warnings.filterwarnings("ignore")
import nltk
nltk.download('cmudict')
# The name `cmudict` was used below without ever being imported, which raises
# NameError on a fresh kernel. Bind it to NLTK's reader for the corpus that
# was just downloaded.
from nltk.corpus import cmudict
from nltk.tokenize import word_tokenize, sent_tokenize

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Load the cmudict for syllable count: maps a lowercase word to a list of
# pronunciations, each a list of phoneme strings
d = cmudict.dict()




[nltk_data] Downloading package cmudict to C:\Users\Ankit
[nltk_data]     Rai\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Ankit
[nltk_data]     Rai\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Ankit Rai\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [25]:
def count_syllables(word, pronunciations=None):
    """Return the number of syllables in ``word``.

    The word is lowercased and looked up in a pronouncing dictionary. When it
    is present, the syllable count is the number of phonemes in its FIRST
    listed pronunciation whose last character is a digit. When it is absent,
    the count falls back to the number of runs of the letters ``aeiouy`` in
    the lowercased word, so tokens without those letters (e.g. punctuation)
    count as 0.

    Parameters
    ----------
    word : str
        Token to measure.
    pronunciations : mapping, optional
        Lowercase word -> list of pronunciations, each a sequence of phoneme
        strings. Defaults to the notebook-level dictionary ``d``.

    Returns
    -------
    int
        Syllable count for ``word``.
    """
    # Only touch the global when no dictionary was supplied, so the function
    # can be used and tested without the notebook-level state.
    lookup = d if pronunciations is None else pronunciations
    key = word.lower()
    if key in lookup:
        first_pronunciation = lookup[key][0]
        return sum(1 for phoneme in first_pronunciation if phoneme[-1].isdigit())
    return len(re.findall(r'[aeiouy]+', key))

def count_complex_words(text):
    """Count the tokens of ``text`` that have three or more syllables.

    ``text`` is split with ``word_tokenize`` and every resulting token is
    measured with ``count_syllables``.
    """
    return sum(1 for token in word_tokenize(text) if count_syllables(token) >= 3)

def _empty_text_metrics():
    """Return a fresh all-zero metric record for input with nothing to analyse."""
    return {
        "POSITIVE_SCORE": 0,
        "NEGATIVE_SCORE": 0,
        "POLARITY_SCORE": 0.0,
        "SUBJECTIVITY_SCORE": 0.0,
        "AVG_SENTENCE_LENGTH": 0.0,
        "PERCENTAGE_OF_COMPLEX_WORDS": 0.0,
        "FOG_INDEX": 0.0,
        "AVG_NUMBER_OF_WORDS_PER_SENTENCE": 0.0,
        "COMPLEX_WORD_COUNT": 0,
        "WORD_COUNT": 0,
        "SYLLABLE_PER_WORD": 0.0,
        "PERSONAL_PRONOUNS": 0,
        "AVG_WORD_LENGTH": 0.0
    }


def analyze_text(text):
    """Compute readability and sentiment metrics for one piece of text.

    Parameters
    ----------
    text : str
        Text to analyse. Anything that is not a string (e.g. NaN for an
        empty CSV cell) and any blank string yields an all-zero record
        instead of raising.

    Returns
    -------
    dict
        POSITIVE_SCORE / NEGATIVE_SCORE
            Number of tokens whose own TextBlob polarity is > 0 / < 0.
        POLARITY_SCORE / SUBJECTIVITY_SCORE
            TextBlob sentiment of the whole text.
        AVG_SENTENCE_LENGTH / AVG_NUMBER_OF_WORDS_PER_SENTENCE
            Mean token count per sentence (both keys carry the same value).
        PERCENTAGE_OF_COMPLEX_WORDS
            Share of tokens with three or more syllables, in percent.
        FOG_INDEX
            0.4 * (average sentence length + percentage of complex words).
        COMPLEX_WORD_COUNT, WORD_COUNT
            Token counts; tokens come from ``word_tokenize``.
        SYLLABLE_PER_WORD, AVG_WORD_LENGTH
            Mean syllables and mean characters per token.
        PERSONAL_PRONOUNS
            Case-insensitive matches of I, we, my, ours, us.
    """
    # Guard before tokenizing: the averages below divide by the sentence and
    # word counts, and the tokenizers expect a string.
    if not isinstance(text, str) or not text.strip():
        return _empty_text_metrics()

    # Tokenize text
    sentences = sent_tokenize(text)
    words = word_tokenize(text)
    if not sentences or not words:
        return _empty_text_metrics()

    # Calculate word count
    word_count = len(words)

    # Calculate sentence length
    avg_sentence_length = sum(len(word_tokenize(sentence)) for sentence in sentences) / len(sentences)

    # Calculate complex words
    complex_word_count = count_complex_words(text)
    percentage_complex_words = complex_word_count / word_count * 100

    # Calculate fog index
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    # Calculate syllables per word
    syllables_per_word = sum(count_syllables(word) for word in words) / word_count

    # Calculate personal pronouns
    # NOTE(review): re.I also matches the upper-case token "US" - confirm
    # whether that should count as a pronoun.
    personal_pronouns = len(re.findall(r'\b(I|we|my|ours|us)\b', text, re.I))

    ## average word length
    avg_word_length = sum(len(word) for word in words) / word_count

    ## polarity and subjectivity of the whole text
    sentiment = TextBlob(text).sentiment
    polarity_score = sentiment.polarity
    subjectivity_score = sentiment.subjectivity

    ## Positive Negative
    # Score each token on its own. The previous code compared the
    # whole-document polarity once per word, so each score could only ever be
    # 0 or word_count. Polarity is computed once per distinct token because
    # tokens repeat heavily within an article.
    polarity_by_token = {token: TextBlob(token).sentiment.polarity for token in set(words)}
    positive_score = sum(polarity_by_token[word] > 0 for word in words)
    negative_score = sum(polarity_by_token[word] < 0 for word in words)

    return {
        "POSITIVE_SCORE": positive_score,
        "NEGATIVE_SCORE": negative_score,
        "POLARITY_SCORE": polarity_score,
        "SUBJECTIVITY_SCORE": subjectivity_score,
        "AVG_SENTENCE_LENGTH": avg_sentence_length,
        "PERCENTAGE_OF_COMPLEX_WORDS": percentage_complex_words,
        "FOG_INDEX": fog_index,
        "AVG_NUMBER_OF_WORDS_PER_SENTENCE": avg_sentence_length,
        "COMPLEX_WORD_COUNT": complex_word_count,
        "WORD_COUNT": word_count,
        "SYLLABLE_PER_WORD": syllables_per_word,
        "PERSONAL_PRONOUNS": personal_pronouns,
        "AVG_WORD_LENGTH": avg_word_length
    }





In [None]:
## Reload the scraped articles from the CSV file
analysis_data = pd.read_csv("C:/Users/Ankit Rai/OneDrive/Desktop.csv")

## Score every article: each analyze_text call yields one dict of metrics,
## and the list of dicts is expanded into one row per article
analysis_results = analysis_data['Extracted_Text'].apply(analyze_text)
analysis_results_df = pd.DataFrame(analysis_results.tolist())

# Final results: URL_ID followed by the metric columns, aligned on row index
final_results = analysis_data[['URL_ID']].join(analysis_results_df)

In [26]:
# Write the final metrics table. index=False keeps the row index from being
# written as an extra unnamed first column ahead of URL_ID.
final_results.to_csv("C:/Users/Ankit Rai/OneDrive/Final_Result.csv", index=False)
