In [1]:
pip install openpyxl


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.1.1
[notice] To update, run: C:\Users\Admin\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


## Step 1: Extract data from the URLs

### This step extracts article titles and contents from the URLs listed in an Excel file.
### It handles HTTP requests, HTML parsing, file I/O operations, and error handling to ensure robustness in extracting and saving content.

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

# Function to extract article title and text from a given URL
def extract_article_text(url, timeout=30):
    """Download a page and pull out its article title and body text.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive server would block the whole run.

    Returns:
        A ``(title, article_text)`` tuple. ``title`` falls back to
        ``"No Title Found"`` and ``article_text`` to ``"No Content Found"``
        when the expected HTML elements are missing. ``(None, None)`` is
        returned when the response status is not 200 or any exception occurs
        (including a timeout); a message is printed in both cases.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to retrieve {url}. Status code: {response.status_code}")
            return None, None

        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract article title
        title_element = soup.find('h1', class_='title')
        title = title_element.text.strip() if title_element else "No Title Found"

        # Extract article content: the text of every <p> inside the post container
        article_content = soup.find('div', class_='td-post-content tagdiv-type')
        if article_content:
            paragraphs = article_content.find_all('p')
            article_text = '\n'.join(p.text.strip() for p in paragraphs)
        else:
            article_text = "No Content Found"

        return title, article_text
    except Exception as e:
        # Best-effort scraping: report the problem and let the caller skip this URL.
        print(f"Exception occurred while processing {url}: {str(e)}")
        return None, None

# Load the Excel file that lists the articles to download
excel_file = 'input.xlsx'
try:
    df = pd.read_excel(excel_file, engine='openpyxl')
except Exception as e:
    print(f"Error reading {excel_file}: {str(e)}")
    # Re-raise rather than calling exit(): exit() is intended for the interactive
    # prompt and, inside a notebook, takes the kernel down with it. Re-raising
    # stops this cell (and "Run All") while keeping the kernel alive.
    raise

# Directory that receives one text file per article; created once, up front,
# instead of being re-checked on every iteration.
output_dir = 'extracted_articles'
os.makedirs(output_dir, exist_ok=True)

# Walk through every (URL_ID, URL) pair listed in the input sheet
for url_id, url in zip(df['URL_ID'], df['URL']):
    # Extract article content
    title, article_text = extract_article_text(url)

    # extract_article_text returns (None, None) on HTTP or network failure
    if not (title and article_text):
        print(f"Skipping {url} due to extraction issues")
        continue

    # Write extracted content to a text file named after the URL_ID
    filename = f"{output_dir}/{url_id}.txt"
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(f"{title}\n\n{article_text}")

    print(f"Extracted and saved article from {url} to {filename}")


Extracted and saved article from https://insights.blackcoffer.com/rising-it-cities-and-its-impact-on-the-economy-environment-infrastructure-and-city-life-by-the-year-2040-2/ to extracted_articles/blackassign0001.txt
Extracted and saved article from https://insights.blackcoffer.com/rising-it-cities-and-their-impact-on-the-economy-environment-infrastructure-and-city-life-in-future/ to extracted_articles/blackassign0002.txt
Extracted and saved article from https://insights.blackcoffer.com/internet-demands-evolution-communication-impact-and-2035s-alternative-pathways/ to extracted_articles/blackassign0003.txt
Extracted and saved article from https://insights.blackcoffer.com/rise-of-cybercrime-and-its-effect-in-upcoming-future/ to extracted_articles/blackassign0004.txt
Extracted and saved article from https://insights.blackcoffer.com/ott-platform-and-its-impact-on-the-entertainment-industry-in-future/ to extracted_articles/blackassign0005.txt
Extracted and saved article from https://insight

# Note:
## Jupyter/Excel S.No - blackassign0035.txt 
### Failed to retrieve https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/. Status code: 404
### Skipping https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/ due to extraction issues

## Jupyter/Excel S.No - blackassign0049.txt 
### Failed to retrieve https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/. Status code: 404
### Skipping https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/ due to extraction issues

## Apart from the 2 links above, the article title and article text were extracted from 98 of the 100 links.

## Step 2: Detect the encoding of the negative-words file

In [3]:
pip install chardet


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.1.1
[notice] To update, run: C:\Users\Admin\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [4]:
import chardet

def detect_encoding(filename):
    """Return chardet's encoding guess for the file at ``filename``.

    The file is opened in binary mode and its entire contents are handed to
    ``chardet.detect``; the ``'encoding'`` entry of the result is returned.
    """
    with open(filename, 'rb') as handle:
        detection = chardet.detect(handle.read())
    return detection['encoding']

# Example usage: detect and report the encoding of the negative-word list.
# NOTE(review): the path is an absolute, machine-specific location; adjust it
# when running the notebook elsewhere.
filename = 'F:/Python/Task/BlackCoffer/MasterDictionary/negative-words.txt'  # file path
encoding = detect_encoding(filename)
print(f"The encoding of {filename} is: {encoding}")


The encoding of F:/Python/Task/BlackCoffer/MasterDictionary/negative-words.txt is: ISO-8859-1


### The code snippet below demonstrates a straightforward approach to reading and processing words from the specified text files (positive-words.txt and negative-words.txt). It uses a function, read_words_from_file, to read each file, strip whitespace from every line, and convert the resulting list into a set of unique words. Finally, it prints the loaded positive and negative word sets. This functionality is commonly used in natural language processing tasks such as sentiment analysis, where positive and negative word lists are essential for determining the sentiment of text data.

In [5]:
def read_words_from_file(filename, encoding='ISO-8859-1'):
    """Load a one-word-per-line lexicon file into a set.

    Args:
        filename: Path of the word-list file.
        encoding: Text encoding used to decode the file. Defaults to
            ISO-8859-1, the encoding previously hard-coded here.

    Returns:
        The set of distinct lines with surrounding whitespace stripped.
        Blank lines are skipped, so the empty string is never a member.
    """
    with open(filename, 'r', encoding=encoding) as f:
        # Iterate the file lazily; the walrus-free two-step keeps it readable.
        stripped_lines = (line.strip() for line in f)
        return {word for word in stripped_lines if word}

# Load positive and negative words
# NOTE(review): both paths are absolute, machine-specific locations.
positive_words = read_words_from_file("F:/Python/Task/BlackCoffer/MasterDictionary/positive-words.txt")
negative_words = read_words_from_file("F:/Python/Task/BlackCoffer/MasterDictionary/negative-words.txt")

# Each print emits the complete set, so the cell output is very long;
# set iteration order is arbitrary, so the order shown can differ between runs.
print("Positive Words:", positive_words)
print("Negative Words:", negative_words)

Positive Words: {'outdo', 'adaptive', 'fantastically', 'evocative', 'sparkling', 'non-violent', 'wisdom', 'wowed', 'well-connected', 'unconditional', 'worthy', 'rapture', 'refreshed', 'fanfare', 'hallmark', 'applaud', 'swankier', 'gratify', 'ecstatically', 'contentment', 'elegance', 'gratifyingly', 'impartially', 'remarkably', 'happiness', 'rejoice', 'wealthy', 'steadfastly', 'enhance', 'instructive', 'inventive', 'keenly', 'gaiety', 'respite', 'exuberance', 'supple', 'low-price', 'accolade', 'faster', 'unmatched', 'adored', 'admirable', 'brilliantly', 'intrigue', 'greatest', 'champ', 'triumphantly', 'leverage', 'self-satisfaction', 'headway', 'gifted', 'resounding', 'gratification', 'sincerely', 'cozy', 'reaffirmation', 'examplary', 'hardier', 'delighted', 'ingenuous', 'brand-new', 'hearten', 'maturity', 'stabilize', 'bravery', 'fondness', 'solace', 'gleefully', 'grin', 'positives', 'illuminate', 'magnanimous', 'patriotic', 'affirmative', 'undamaged', 'carefree', 'chic', 'charismatic'

In [16]:
import os
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import string
import pandas as pd
import re
# Create directory if it doesn't exist
# NOTE(review): absolute, machine-specific path.
directory = r'F:\Python\Task\BlackCoffer\extracted_articles'
os.makedirs(directory, exist_ok=True)

# stopwords.words() raises LookupError when the corpus has never been
# downloaded, which breaks a run on a fresh environment. Fetch it on demand.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Load stopwords and punctuation
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)

# Function to clean and tokenize text
def clean_and_tokenize(text):
    """Tokenize ``text`` and keep only its content words.

    The text is lower-cased and split with ``word_tokenize``. A token is kept
    only when it is purely alphabetic and appears in neither ``stop_words``
    nor ``punctuation``. Tokens are returned in their original order.
    """
    kept_tokens = []
    for token in word_tokenize(text.lower()):
        if not token.isalpha():
            continue
        if token in stop_words or token in punctuation:
            continue
        kept_tokens.append(token)
    return kept_tokens

def read_words_from_file(filename, encoding='ISO-8859-1'):
    """Load a one-word-per-line lexicon file into a set.

    Args:
        filename: Path of the word-list file.
        encoding: Text encoding used to decode the file. Defaults to
            ISO-8859-1, the encoding previously hard-coded here.

    Returns:
        The set of distinct lines with surrounding whitespace stripped.
        Blank lines are skipped, so the empty string is never a member.
    """
    with open(filename, 'r', encoding=encoding) as f:
        stripped_lines = (line.strip() for line in f)
        return {word for word in stripped_lines if word}

# Load positive and negative words (these sets drive the sentiment scores below)
# NOTE(review): both paths are absolute, machine-specific locations.
positive_words = read_words_from_file("F:/Python/Task/BlackCoffer/MasterDictionary/positive-words.txt")
negative_words = read_words_from_file("F:/Python/Task/BlackCoffer/MasterDictionary/negative-words.txt")

# Function to count syllables in a word
def count_syllables(word):
    """Estimate the number of syllables in ``word`` with a vowel-group heuristic.

    Each run of consecutive vowels (``a e i o u y``) counts as one syllable.
    A trailing "es" or "ed" removes one syllable; a trailing "es" preceded by
    a consonant adds it back (e.g. "boxes" stays at 2).

    The word is lower-cased first so the suffix rules apply regardless of
    case. Any word that contains a vowel is reported as at least one
    syllable: previously "bed" or "goes" came out as 0.

    Args:
        word: The word to analyse.

    Returns:
        The estimated syllable count; 0 only when the word has no vowels
        (which includes the empty string).
    """
    vowels = 'aeiouy'
    word = word.lower()

    # Count vowel groups: a new group starts at a vowel that follows a non-vowel.
    syllables = 0
    previous_was_vowel = False
    for char in word:
        is_vowel = char in vowels
        if is_vowel and not previous_was_vowel:
            syllables += 1
        previous_was_vowel = is_vowel

    if syllables == 0:
        return 0

    if word.endswith(('es', 'ed')):
        syllables -= 1
    if len(word) > 2 and word.endswith('es') and word[-3] not in vowels:
        syllables += 1

    # The suffix adjustment must never erase a word's only syllable.
    return max(syllables, 1)

# Folder that holds the extracted article text files (one "<URL_ID>.txt" per article)
folder_path = r'F:\Python\Task\BlackCoffer\extracted_articles'

# URL_ID -> source URL for every article to analyse.
# blackassign0036 and blackassign0049 have no entry.
file_urls = {
    'blackassign0001': 'https://insights.blackcoffer.com/rising-it-cities-and-its-impact-on-the-economy-environment-infrastructure-and-city-life-by-the-year-2040-2/',
    'blackassign0002': 'https://insights.blackcoffer.com/rising-it-cities-and-their-impact-on-the-economy-environment-infrastructure-and-city-life-in-future/',
    'blackassign0003': 'https://insights.blackcoffer.com/internet-demands-evolution-communication-impact-and-2035s-alternative-pathways/',
    'blackassign0004': 'https://insights.blackcoffer.com/rise-of-cybercrime-and-its-effect-in-upcoming-future/',
    'blackassign0005': 'https://insights.blackcoffer.com/ott-platform-and-its-impact-on-the-entertainment-industry-in-future/',
    'blackassign0006': 'https://insights.blackcoffer.com/the-rise-of-the-ott-platform-and-its-impact-on-the-entertainment-industry-by-2040/',
    'blackassign0007': 'https://insights.blackcoffer.com/rise-of-cyber-crime-and-its-effects/',
    'blackassign0008': 'https://insights.blackcoffer.com/rise-of-internet-demand-and-its-impact-on-communications-and-alternatives-by-the-year-2035-2/',
    'blackassign0009': 'https://insights.blackcoffer.com/rise-of-cybercrime-and-its-effect-by-the-year-2040-2/',
    'blackassign0010': 'https://insights.blackcoffer.com/rise-of-cybercrime-and-its-effect-by-the-year-2040/',
    'blackassign0011': 'https://insights.blackcoffer.com/rise-of-internet-demand-and-its-impact-on-communications-and-alternatives-by-the-year-2035/',
    'blackassign0012': 'https://insights.blackcoffer.com/rise-of-telemedicine-and-its-impact-on-livelihood-by-2040-3-2/',
    'blackassign0013': 'https://insights.blackcoffer.com/rise-of-e-health-and-its-impact-on-humans-by-the-year-2030/',
    'blackassign0014': 'https://insights.blackcoffer.com/rise-of-e-health-and-its-imapct-on-humans-by-the-year-2030-2/',
    'blackassign0015': 'https://insights.blackcoffer.com/rise-of-telemedicine-and-its-impact-on-livelihood-by-2040-2/',
    'blackassign0016': 'https://insights.blackcoffer.com/rise-of-telemedicine-and-its-impact-on-livelihood-by-2040-2-2/',
    'blackassign0017': 'https://insights.blackcoffer.com/rise-of-chatbots-and-its-impact-on-customer-support-by-the-year-2040/',
    'blackassign0018': 'https://insights.blackcoffer.com/rise-of-e-health-and-its-imapct-on-humans-by-the-year-2030/',
    'blackassign0019': 'https://insights.blackcoffer.com/how-does-marketing-influence-businesses-and-consumers/',
    'blackassign0020': 'https://insights.blackcoffer.com/how-advertisement-increase-your-market-value/',
    'blackassign0021': 'https://insights.blackcoffer.com/negative-effects-of-marketing-on-society/',
    'blackassign0022': 'https://insights.blackcoffer.com/how-advertisement-marketing-affects-business/',
    'blackassign0023': 'https://insights.blackcoffer.com/rising-it-cities-will-impact-the-economy-environment-infrastructure-and-city-life-by-the-year-2035/',
    'blackassign0024': 'https://insights.blackcoffer.com/rise-of-ott-platform-and-its-impact-on-entertainment-industry-by-the-year-2030/',
    'blackassign0025': 'https://insights.blackcoffer.com/rise-of-electric-vehicles-and-its-impact-on-livelihood-by-2040/',
    'blackassign0026': 'https://insights.blackcoffer.com/rise-of-electric-vehicle-and-its-impact-on-livelihood-by-the-year-2040/',
    'blackassign0027': 'https://insights.blackcoffer.com/oil-prices-by-the-year-2040-and-how-it-will-impact-the-world-economy/',
    'blackassign0028': 'https://insights.blackcoffer.com/an-outlook-of-healthcare-by-the-year-2040-and-how-it-will-impact-human-lives/',
    'blackassign0029': 'https://insights.blackcoffer.com/ai-in-healthcare-to-improve-patient-outcomes/',
    'blackassign0030': 'https://insights.blackcoffer.com/what-if-the-creation-is-taking-over-the-creator/',
    'blackassign0031': 'https://insights.blackcoffer.com/what-jobs-will-robots-take-from-humans-in-the-future/',
    'blackassign0032': 'https://insights.blackcoffer.com/will-machine-replace-the-human-in-the-future-of-work/',
    'blackassign0033': 'https://insights.blackcoffer.com/will-ai-replace-us-or-work-with-us/',
    'blackassign0034': 'https://insights.blackcoffer.com/man-and-machines-together-machines-are-more-diligent-than-humans-blackcoffe/',
    'blackassign0035': 'https://insights.blackcoffer.com/in-future-or-in-upcoming-years-humans-and-machines-are-going-to-work-together-in-every-field-of-work/',

    'blackassign0037': 'https://insights.blackcoffer.com/how-machine-learning-will-affect-your-business/',
    'blackassign0038': 'https://insights.blackcoffer.com/deep-learning-impact-on-areas-of-e-learning/',
    'blackassign0039': 'https://insights.blackcoffer.com/how-to-protect-future-data-and-its-privacy-blackcoffer/',
    'blackassign0040': 'https://insights.blackcoffer.com/how-machines-ai-automations-and-robo-human-are-effective-in-finance-and-banking/',
    'blackassign0041': 'https://insights.blackcoffer.com/ai-human-robotics-machine-future-planet-blackcoffer-thinking-jobs-workplace/',
    'blackassign0042': 'https://insights.blackcoffer.com/how-ai-will-change-the-world-blackcoffer/',
    'blackassign0043': 'https://insights.blackcoffer.com/future-of-work-how-ai-has-entered-the-workplace/',
    'blackassign0044': 'https://insights.blackcoffer.com/ai-tool-alexa-google-assistant-finance-banking-tool-future/',
    'blackassign0045': 'https://insights.blackcoffer.com/ai-healthcare-revolution-ml-technology-algorithm-google-analytics-industrialrevolution/',
    'blackassign0046': 'https://insights.blackcoffer.com/all-you-need-to-know-about-online-marketing/',
    'blackassign0047': 'https://insights.blackcoffer.com/evolution-of-advertising-industry/',
    'blackassign0048': 'https://insights.blackcoffer.com/how-data-analytics-can-help-your-business-respond-to-the-impact-of-covid-19/',

    'blackassign0050': 'https://insights.blackcoffer.com/environmental-impact-of-the-covid-19-pandemic-lesson-for-the-future/',
    'blackassign0051': 'https://insights.blackcoffer.com/how-data-analytics-and-ai-are-used-to-halt-the-covid-19-pandemic/',
    'blackassign0052': 'https://insights.blackcoffer.com/difference-between-artificial-intelligence-machine-learning-statistics-and-data-mining/',
    'blackassign0053': 'https://insights.blackcoffer.com/how-python-became-the-first-choice-for-data-science/',
    'blackassign0054': 'https://insights.blackcoffer.com/how-google-fit-measure-heart-and-respiratory-rates-using-a-phone/',
    'blackassign0055': 'https://insights.blackcoffer.com/what-is-the-future-of-mobile-apps/',
    'blackassign0056': 'https://insights.blackcoffer.com/impact-of-ai-in-health-and-medicine/',
    'blackassign0057': 'https://insights.blackcoffer.com/telemedicine-what-patients-like-and-dislike-about-it/',
    'blackassign0058': 'https://insights.blackcoffer.com/how-we-forecast-future-technologies/',
    'blackassign0059': 'https://insights.blackcoffer.com/can-robots-tackle-late-life-loneliness/',
    'blackassign0060': 'https://insights.blackcoffer.com/embedding-care-robots-into-society-socio-technical-considerations/',
    'blackassign0061': 'https://insights.blackcoffer.com/management-challenges-for-future-digitalization-of-healthcare-services/',
    'blackassign0062': 'https://insights.blackcoffer.com/are-we-any-closer-to-preventing-a-nuclear-holocaust/',
    'blackassign0063': 'https://insights.blackcoffer.com/will-technology-eliminate-the-need-for-animal-testing-in-drug-development/',
    'blackassign0064': 'https://insights.blackcoffer.com/will-we-ever-understand-the-nature-of-consciousness/',
    'blackassign0065': 'https://insights.blackcoffer.com/will-we-ever-colonize-outer-space/',
    'blackassign0066': 'https://insights.blackcoffer.com/what-is-the-chance-homo-sapiens-will-survive-for-the-next-500-years/',
    'blackassign0067': 'https://insights.blackcoffer.com/why-does-your-business-need-a-chatbot/',
    'blackassign0068': 'https://insights.blackcoffer.com/how-you-lead-a-project-or-a-team-without-any-technical-expertise/',
    'blackassign0069': 'https://insights.blackcoffer.com/can-you-be-great-leader-without-technical-expertise/',
    'blackassign0070': 'https://insights.blackcoffer.com/how-does-artificial-intelligence-affect-the-environment/',
    'blackassign0071': 'https://insights.blackcoffer.com/how-to-overcome-your-fear-of-making-mistakes-2/',
    'blackassign0072': 'https://insights.blackcoffer.com/is-perfection-the-greatest-enemy-of-productivity/',
    'blackassign0073': 'https://insights.blackcoffer.com/global-financial-crisis-2008-causes-effects-and-its-solution/',
    'blackassign0074': 'https://insights.blackcoffer.com/gender-diversity-and-equality-in-the-tech-industry/',
    'blackassign0075': 'https://insights.blackcoffer.com/how-to-overcome-your-fear-of-making-mistakes/',
    'blackassign0076': 'https://insights.blackcoffer.com/how-small-business-can-survive-the-coronavirus-crisis/',
    'blackassign0077': 'https://insights.blackcoffer.com/impacts-of-covid-19-on-vegetable-vendors-and-food-stalls/',
    'blackassign0078': 'https://insights.blackcoffer.com/impacts-of-covid-19-on-vegetable-vendors/',
    'blackassign0079': 'https://insights.blackcoffer.com/impact-of-covid-19-pandemic-on-tourism-aviation-industries/',
    'blackassign0080': 'https://insights.blackcoffer.com/impact-of-covid-19-pandemic-on-sports-events-around-the-world/',
    'blackassign0081': 'https://insights.blackcoffer.com/changing-landscape-and-emerging-trends-in-the-indian-it-ites-industry/',
    'blackassign0082': 'https://insights.blackcoffer.com/online-gaming-adolescent-online-gaming-effects-demotivated-depression-musculoskeletal-and-psychosomatic-symptoms/',
    'blackassign0083': 'https://insights.blackcoffer.com/human-rights-outlook/',
    'blackassign0084': 'https://insights.blackcoffer.com/how-voice-search-makes-your-business-a-successful-business/',
    'blackassign0085': 'https://insights.blackcoffer.com/how-the-covid-19-crisis-is-redefining-jobs-and-services/',
    'blackassign0086': 'https://insights.blackcoffer.com/how-to-increase-social-media-engagement-for-marketers/',
    'blackassign0087': 'https://insights.blackcoffer.com/impacts-of-covid-19-on-streets-sides-food-stalls/',
    'blackassign0088': 'https://insights.blackcoffer.com/coronavirus-impact-on-energy-markets-2/',
    'blackassign0089': 'https://insights.blackcoffer.com/coronavirus-impact-on-the-hospitality-industry-5/',
    'blackassign0090': 'https://insights.blackcoffer.com/lessons-from-the-past-some-key-learnings-relevant-to-the-coronavirus-crisis-4/',
    'blackassign0091': 'https://insights.blackcoffer.com/estimating-the-impact-of-covid-19-on-the-world-of-work-2/',
    'blackassign0092': 'https://insights.blackcoffer.com/estimating-the-impact-of-covid-19-on-the-world-of-work-3/',
    'blackassign0093': 'https://insights.blackcoffer.com/travel-and-tourism-outlook/',
    'blackassign0094': 'https://insights.blackcoffer.com/gaming-disorder-and-effects-of-gaming-on-health/',
    'blackassign0095': 'https://insights.blackcoffer.com/what-is-the-repercussion-of-the-environment-due-to-the-covid-19-pandemic-situation/',
    'blackassign0096': 'https://insights.blackcoffer.com/what-is-the-repercussion-of-the-environment-due-to-the-covid-19-pandemic-situation-2/',
    'blackassign0097': 'https://insights.blackcoffer.com/impact-of-covid-19-pandemic-on-office-space-and-co-working-industries/',
    'blackassign0098': 'https://insights.blackcoffer.com/contribution-of-handicrafts-visual-arts-literature-in-the-indian-economy/',
    'blackassign0099': 'https://insights.blackcoffer.com/how-covid-19-is-impacting-payment-preferences/',
    'blackassign0100': 'https://insights.blackcoffer.com/how-will-covid-19-affect-the-world-of-work-2/',
}

# Function to count personal pronouns
def count_personal_pronouns(text):
    """Count occurrences of the personal pronouns I, we, my, ours and us.

    Matching is case-insensitive and restricted to whole words. The exact
    upper-case token "US" is treated as the country name and is not counted;
    this exclusion was intended before but the exclusion pattern was never
    applied, so every "US" was counted as the pronoun "us".

    Args:
        text: The raw article text.

    Returns:
        The number of pronoun occurrences found.
    """
    pattern = re.compile(r'\b(?:I|we|my|ours|us)\b', flags=re.IGNORECASE)
    # Case-insensitive matching also finds "US"; drop exactly that spelling.
    return sum(1 for match in pattern.finditer(text) if match.group() != 'US')

def _analyze_article(text):
    """Compute the sentiment and readability metrics for one article.

    Args:
        text: Full text of the article as read from its file.

    Returns:
        A dict keyed by output column name, or ``None`` when the text has no
        usable words or sentences (which would otherwise divide by zero).
    """
    tokens = clean_and_tokenize(text)
    total_words = len(tokens)
    total_sentences = len(sent_tokenize(text))
    if total_words == 0 or total_sentences == 0:
        return None

    # Sentiment scores. Both are non-negative counts; a negated negative score
    # would distort the polarity and subjectivity formulas below.
    positive_score = sum(1 for word in tokens if word in positive_words)
    negative_score = sum(1 for word in tokens if word in negative_words)
    # The small constant keeps the divisions defined when a count is zero.
    polarity_score = (positive_score - negative_score) / (positive_score + negative_score + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (total_words + 0.000001)

    # Readability metrics; a "complex" word has more than two syllables.
    syllable_counts = [count_syllables(word) for word in tokens]
    complex_word_count = sum(1 for count in syllable_counts if count > 2)
    average_sentence_length = total_words / total_sentences
    percentage_complex_words = complex_word_count / total_words * 100
    fog_index = 0.4 * (average_sentence_length + percentage_complex_words)

    return {
        'POSITIVE SCORE': positive_score,
        'NEGATIVE SCORE': negative_score,
        'POLARITY SCORE': polarity_score,
        'SUBJECTIVITY SCORE': subjectivity_score,
        'AVG SENTENCE LENGTH': average_sentence_length,
        'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
        'FOG INDEX': fog_index,
        'AVG NUMBER OF WORDS PER SENTENCE': total_words / total_sentences,
        'COMPLEX WORD COUNT': complex_word_count,
        'WORD COUNT': total_words,
        # One number per article (mean syllables per word) instead of a
        # Python list dumped into a single spreadsheet cell.
        'SYLLABLE PER WORD': sum(syllable_counts) / total_words,
        'PERSONAL PRONOUNS': count_personal_pronouns(text),
        'AVG WORD LENGTH': sum(len(word) for word in tokens) / total_words,
    }


# Process each text file in the folder
results = []

for filename, url in file_urls.items():
    file_path = os.path.join(folder_path, filename + '.txt')
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
    except FileNotFoundError:
        print(f"File '{filename}' not found. Skipping...")
        continue

    metrics = _analyze_article(text)
    if metrics is None:
        print(f"File '{filename}' contains no analysable text. Skipping...")
        continue

    # Identification columns first, then the metrics in their fixed order
    results.append({'URL_ID': filename, 'URL': url, **metrics})

# Convert results to DataFrame
df = pd.DataFrame(results)
# Export to Excel
output_excel_path = 'analysis_results.xlsx'  # Adjust path as needed
df.to_excel(output_excel_path, index=False)

print(f"Analysis results saved to '{output_excel_path}'")


Analysis results saved to 'analysis_results.xlsx'
