In [1]:

#Importing libraries
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import warnings
warnings.filterwarnings("ignore")

In [2]:
#Uploading data from drive
input_urls= pd.read_excel('/content/drive/MyDrive/Black Coffer Text Analysis/Input.xlsx')

In [3]:
input_urls.head()

Unnamed: 0,URL_ID,URL
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...


In [4]:
len(input_urls['URL'])

100

# **Web Scrapping using Beautiful Soup**

In [5]:

pages = len(input_urls['URL']) #total number of urls
articles = []  # List to store articles
titles = []    # List to store titles
unsuccessful_urls = []  # List to track URLs that didn't yield both an article and a title

# List to store records for the final DataFrame
records = []

for i in range(pages):
    selected_url = input_urls['URL'][i]
    url_id = input_urls['URL_ID'][i]
    print(f'Scraping URL : {i + 1}/{pages} - {selected_url}')

    try:
        # Send an HTTP request to the selected URL
        response = requests.get(selected_url)
        response.raise_for_status()  # Raise an HTTPError for bad responses (e.g., 404, 500)

        # Parse the content of the page with BeautifulSoup
        parsed_content = BeautifulSoup(response.content, 'html.parser')
        article_found = False
        title_found = False

        # Find article content
        article_content_div = parsed_content.find('div', class_='td-post-content tagdiv-type')

        if article_content_div:
            # Extract text from the selected div element, excluding 'wp-block-preformatted'
            article_text = ' '.join(element.get_text() for element in article_content_div.find_all(True) if 'wp-block-preformatted' not in (element.get('class') or []))
            if article_text.strip():  # Ensure the article_text is not empty
                article_found = True

        # Find title content
        title_content = parsed_content.find_all('h1', class_='entry-title')
        if title_content:
            # Join all the text content found in the selected h1 elements
            title_text = ' '.join(title.get_text() for title in title_content)
            if title_text.strip():  # Ensure the title_text is not empty
                title_found = True

        # If either article or title is not found, try alternate classes and log the URL if still unsuccessful
        extracted_text_list = []
        if not article_found or not title_found:
            print(f'Tags didn\'t match, this URL has a picture: {selected_url}. Trying alternative tags.')

            article_content_pic = parsed_content.find_all('div', class_='tdb-block-inner td-fix-index')

            for a in article_content_pic:
                tags = a.find_all(['p', 'li', 'h2'])
                for tag in tags:
                    if 'wp-block-preformatted' not in (tag.get('class') or []):
                        extracted_text_list.append(tag.get_text())

            if extracted_text_list:
                article_text = ' '.join(extracted_text_list)
                if article_text.strip():  # Ensure the article_text is not empty
                    article_found = True

            title_content_pic = parsed_content.find_all('h1', class_='tdb-title-text')
            if title_content_pic:
                title_text = ' '.join(title.get_text() for title in title_content_pic)
                if title_text.strip():  # Ensure the title_text is not empty
                    title_found = True

        if article_found or title_found:
            # Add the record to the list if either article or title was found
            records.append({'URL_ID': url_id, 'URL': selected_url, 'Title': title_text if title_found else None, 'Article': article_text if article_found else None})
        else:
            unsuccessful_urls.append(selected_url)
            print(f'Failed to scrape URL: {selected_url}')
            records.append({'URL_ID': url_id, 'URL': selected_url, 'Title': None, 'Article': None})

    except requests.exceptions.RequestException as e:
        # Catch any request-related exceptions and log the URL as unsuccessful
        unsuccessful_urls.append(selected_url)
        print(f'Error scraping URL {selected_url}: {e}')
        records.append({'URL_ID': url_id, 'URL': selected_url, 'Title': None, 'Article': None})

# Convert the records list to a DataFrame
df = pd.DataFrame(records)

# Print the results
print(f'Scraping Complete for {len(articles)} articles out of {pages} pages.')

if unsuccessful_urls:
    print('The following URLs had Error code 404:')
    for url in unsuccessful_urls:
        print(url)
else:
    print('All URLs were successfully scraped.')


Scraping URL : 1/100 - https://insights.blackcoffer.com/rising-it-cities-and-its-impact-on-the-economy-environment-infrastructure-and-city-life-by-the-year-2040-2/
Scraping URL : 2/100 - https://insights.blackcoffer.com/rising-it-cities-and-their-impact-on-the-economy-environment-infrastructure-and-city-life-in-future/
Scraping URL : 3/100 - https://insights.blackcoffer.com/internet-demands-evolution-communication-impact-and-2035s-alternative-pathways/
Scraping URL : 4/100 - https://insights.blackcoffer.com/rise-of-cybercrime-and-its-effect-in-upcoming-future/
Scraping URL : 5/100 - https://insights.blackcoffer.com/ott-platform-and-its-impact-on-the-entertainment-industry-in-future/
Scraping URL : 6/100 - https://insights.blackcoffer.com/the-rise-of-the-ott-platform-and-its-impact-on-the-entertainment-industry-by-2040/
Scraping URL : 7/100 - https://insights.blackcoffer.com/rise-of-cyber-crime-and-its-effects/
Scraping URL : 8/100 - https://insights.blackcoffer.com/rise-of-internet-dem

In [6]:
#URLs with 404 error, also manually verified
unsuccessful_urls

['https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/',
 'https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/']

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   URL_ID   100 non-null    object
 1   URL      100 non-null    object
 2   Title    98 non-null     object
 3   Article  98 non-null     object
dtypes: object(4)
memory usage: 3.2+ KB


In [8]:
df.head()

Unnamed: 0,URL_ID,URL,Title,Article
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,Rising IT cities and its impact on the economy...,We have seen a huge development and dependence...
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,Rising IT Cities and Their Impact on the Econo...,"Throughout history, from the industrial revolu..."
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,"Internet Demand’s Evolution, Communication Imp...",Introduction Introduction In the span of just ...
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,Rise of Cybercrime and its Effect in upcoming ...,"The way we live, work, and communicate has unq..."
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,OTT platform and its impact on the entertainme...,The year 2040 is poised to witness a continued...


In [9]:
#Randomly checking
df['Title'][49]

'Environmental impact of the COVID-19 pandemic – Lesson for the Future'

In [10]:
#Randomly checking
df['Article'][49]

'The Covid- 19 pandemics forced factories to shut down, flights getting canceled and a massive decrease in the global economy, with a significant decrease in Green House Gases (GHG) in many developed and developing countries. The SARS- CoV2 came into the spotlight in December 2019 and has impacted most of the countries till then. Nearly 131 million peoples were infected worldwide and resulting in deaths of around 2.9 Million according to World Health Organisation (WHO). Most of the countries dealt with the new virus by imposing strict lockdowns and social distancing to control the spread of the virus. These policies caused adverse effects worldwide. One of the most important impacts of the Covid-19 Pandemic is on the environment. There have been few positive impacts on the environment due to lockdown like, air pollution has decreased dramatically, as people were asked to stay in their houses due to the lockdowns. There has also been a sharp decline in environmental noise. Environmental

In [13]:
#Saving only articles in .txt file

# Convert the 'Article' column to a list
df['Article'].fillna('', inplace=True)
article_list = df['Article'].tolist()

# Write the list to a text file
with open('URL_ID.txt', 'w') as file:
    for article in article_list:
        file.write(article + '\n')

# **Sentiment Analysis**

In [14]:
import nltk

In [15]:
#Extracting stopwords from drive which is of .txt format

def read_text_file(file_path,empty_list):

    with open(file_path, 'r') as file:
      lines = file.readlines()

    for line in lines:
      empty_list.append(line.strip())

    return empty_list

In [16]:
stopwords_dates_numbers=[]
stopwords_currencies=[]
stopwords_geographic=[]
stopwords_generic_long=[]
stopwords_generic=[]
stopwords_names=[]
stopwords_auditor=[]

In [17]:
stopwords_auditor= read_text_file('/content/drive/MyDrive/Black Coffer Text Analysis/StopWords/StopWords_Auditor.txt',stopwords_auditor)
stopwords_dates_numbers= read_text_file('/content/drive/MyDrive/Black Coffer Text Analysis/StopWords/StopWords_DatesandNumbers.txt',stopwords_dates_numbers)
stopwords_currencies= read_text_file('/content/drive/MyDrive/Black Coffer Text Analysis/StopWords/StopWords_Currencies.txt',stopwords_currencies)
stopwords_generic= read_text_file('/content/drive/MyDrive/Black Coffer Text Analysis/StopWords/StopWords_Generic.txt',stopwords_generic)
stopwords_generic_long= read_text_file('/content/drive/MyDrive/Black Coffer Text Analysis/StopWords/StopWords_GenericLong.txt',stopwords_generic_long)
stopwords_names= read_text_file('/content/drive/MyDrive/Black Coffer Text Analysis/StopWords/StopWords_Names.txt',stopwords_names)
stopwords_geographic= read_text_file('/content/drive/MyDrive/Black Coffer Text Analysis/StopWords/StopWords_Geographic.txt',stopwords_geographic)

In [18]:
#Converting stopwords to respective formats
stopwords_currency=[]
stopwords_countries=[]
for word in stopwords_currencies:

  currency,countries=word.split(' | ')
  stopwords_currency.append(currency.strip())
  stopwords_countries.append(countries.strip())

In [19]:
#Making one list of all stopwords
total_stopwords= stopwords_auditor + stopwords_dates_numbers +stopwords_currencies +stopwords_generic +stopwords_generic_long +stopwords_names +stopwords_geographic + stopwords_countries

In [20]:
data=pd.DataFrame(df['Article'])

In [21]:
data.head()

Unnamed: 0,Article
0,We have seen a huge development and dependence...
1,"Throughout history, from the industrial revolu..."
2,Introduction Introduction In the span of just ...
3,"The way we live, work, and communicate has unq..."
4,The year 2040 is poised to witness a continued...


In [22]:
#Converting to lowercase and filling null value with empty string for easy computation
stopwords=[]

for word in total_stopwords:

  stopwords.append(word.lower())

data['Article'] = data['Article'].fillna('')
data['Article']=data['Article'].str.lower()

In [23]:
#Checking length before removing stopwords
len(data['Article'][10])

10476

In [24]:
#Removing stopwords

def removing_stopwords(text,stopwords):

  words= text.split()
  filtered_words=[word for word in words if word not in stopwords]
  filtered_text=' '.join(filtered_words)
  return filtered_text

In [25]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Article  100 non-null    object
dtypes: object(1)
memory usage: 928.0+ bytes


In [26]:
#Applying remove stopwords to all articles in dataframe
data['Article']=data['Article'].apply(lambda x: removing_stopwords(x,stopwords))

In [27]:
#Checking length after removing stopwords
len(data['Article'][10])

7678

In [28]:
#Extracting positive and negative words from master dictionary(drive)
positive_words=[]
negative_words=[]
positive_words = read_text_file('/content/drive/MyDrive/Black Coffer Text Analysis/MasterDictionary/positive-words.txt',positive_words)

file_path= '/content/drive/MyDrive/Black Coffer Text Analysis/MasterDictionary/negative-words.txt'
with open(file_path,'r',encoding='utf-8',errors='ignore') as file:
    lines = file.readlines()
    for line in lines:
        negative_words.append(line.strip())


In [29]:
data.head()

Unnamed: 0,Article
0,huge development dependence people technology ...
1,"history, industrial revolution 18th century de..."
2,"introduction introduction span decades, intern..."
3,"live, work, communicate unquestionably changed..."
4,2040 poised witness continued revolution world...


In [30]:
pip install textstat

Collecting textstat
  Downloading textstat-0.7.3-py3-none-any.whl (105 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyphen (from textstat)
  Downloading pyphen-0.15.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.15.0 textstat-0.7.3


In [32]:
import nltk #python library for text analysis
from nltk.tokenize import word_tokenize, sent_tokenize #for tokenizing
from nltk.corpus import stopwords #nltk stopwords
import string
import re #regular expression
import textstat #Readability measures

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

def calculate_scores(tokens, positive_words, negative_words):

    #Calculating positive and negative scores
    positive_score = sum(1 for word in tokens if word in positive_words)
    negative_score = sum(1 for word in tokens if word in negative_words)

    # Calculate Polarity Score
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)

    # Calculate Subjectivity Score
    subjectivity_score = (positive_score + negative_score) / (len(tokens) + 0.000001)

    return positive_score, negative_score, polarity_score, subjectivity_score

#Calculating count of complex words
def count_complex_words(words):
    complex_word_count = 0
    for word in words:
        syllables = textstat.syllable_count(word)
        if syllables >= 2:
            complex_word_count += 1
    return complex_word_count

#calculating avg sentence length, percentage of complex words and fog index

def readability_metrics(text):
    # Tokenize and clean words
    words = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    cleaned_words = [word for word in words if word.isalnum() and word not in stop_words]

    # Calculate readability metrics
    average_sentence_length = textstat.avg_sentence_length(text)
    fog_index = textstat.gunning_fog(text)

    total_words = len(cleaned_words)
    complex_word_count = count_complex_words(cleaned_words)
    percentage_complex_words = (complex_word_count / total_words) * 100 if total_words > 0 else 0

    return average_sentence_length, percentage_complex_words, fog_index

#Word count after removing nltk stopwords and punctuations

def clean_and_count_words(text):
    # Tokenize and clean words
    words = word_tokenize(text.lower())

    # Remove stopwords and punctuation
    stop_words = set(stopwords.words('english'))
    cleaned_words = [word for word in words if word.isalnum() and word not in stop_words and word not in string.punctuation]

    # Count cleaned words
    word_count = len(cleaned_words)

    return word_count

#Calculating words present per sentence
def calculate_words_present_per_sentence(text):
    # Tokenize sentences and words
    sentences = sent_tokenize(text)
    words = word_tokenize(text.lower())

    # Calculate average number of words per sentence
    total_words = len(words)
    total_sentences = len(sentences)
    average_words_per_sentence = total_words / total_sentences if total_sentences > 0 else 0

    return average_words_per_sentence


#Number of syllables
def count_syllables_in_word(word):
    word = word.lower()
    vowels = "aeiou"
    count = 0
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith("es") or word.endswith("ed"):
        count -= 1
    return max(1, count)  # Ensure at least one syllable
#Iterarting syllable count for multiple words
def calculate_syllable_count(text):
    words = word_tokenize(text.lower())
    syllable_count = sum(count_syllables_in_word(word) for word in words if word.isalnum())
    return syllable_count

#Counting personal pronouns
def count_personal_pronouns(text):
    # Define list of personal pronouns
    personal_pronouns = ["i", "we", "my", "ours", "us"]

    # Use regex to find counts of personal pronouns
    regex_pattern = r'\b(?:{})\b'.format('|'.join(personal_pronouns))
    matches = re.findall(regex_pattern, text.lower())
    personal_pronoun_count = len(matches)

    return personal_pronoun_count

#Calculating average word lenth
def calculate_average_word_length(text):
    words = word_tokenize(text.lower())
    total_characters = sum(len(word) for word in words if word.isalnum())
    total_words = len(words)
    average_word_length = total_characters / total_words if total_words > 0 else 0
    return average_word_length

#Defining a function which performs all text analysis and returns a dictionary
def text_analysis(articles):
    analysis_results = []

    for text in articles:
        # Calculate sentiment scores
        tokens = word_tokenize(text.lower())
        positive_score, negative_score, polarity_score, subjectivity_score = calculate_scores(tokens, positive_words, negative_words)

        # Calculate complex word count
        complex_word_count = count_complex_words(tokens)

        # Calculate readability metrics
        average_sentence_length, percentage_complex_words, fog_index = readability_metrics(text)

        # Count personal pronouns
        personal_pronoun_count = count_personal_pronouns(text)

        # Calculate word count
        word_count = clean_and_count_words(text)

        # Calculate average sentence length
        avg_no_of_words_per_sentence = calculate_words_present_per_sentence(text)

        # Calculate syllable count
        syllable_count = calculate_syllable_count(text)

        # Calculate average word length
        avg_word_length = calculate_average_word_length(text)

        # Compile analysis results for the article
        article_analysis = {
            "Positive Score": positive_score,
            "Negative Score": negative_score,
            "Polarity Score": polarity_score,
            "Subjectivity Score": subjectivity_score,
            "Average Sentence Length": average_sentence_length,
            "Average no of words per sentence": avg_no_of_words_per_sentence,
            "Percentage of Complex Words": percentage_complex_words,
            "Fog Index": fog_index,
            "Complex Word Count": complex_word_count,
            "Word Count": word_count,
            "Syllables per Word": syllable_count,
            "Personal Pronouns": personal_pronoun_count,
            "Average Word Length": avg_word_length
        }

        analysis_results.append(article_analysis)

    return analysis_results



# Convert DataFrame column to a list (Series) of articles
articles_series = data['Article']

# Ensure each article in articles_series is a string
assert all(isinstance(article, str) for article in articles_series)

# Perform text analysis
results = text_analysis(articles_series)




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [33]:
answer= pd.DataFrame(results)

In [34]:
answer.head()

Unnamed: 0,Positive Score,Negative Score,Polarity Score,Subjectivity Score,Average Sentence Length,Average no of words per sentence,Percentage of Complex Words,Fog Index,Complex Word Count,Word Count,Syllables per Word,Personal Pronouns,Average Word Length
0,50,13,0.587302,0.050971,7.6,11.660377,71.540179,5.51,643,896,1938,1,4.980583
1,57,32,0.280899,0.086914,9.4,12.641975,71.54047,13.06,563,766,1885,2,5.658203
2,36,23,0.220339,0.075544,11.2,13.946429,84.56486,13.84,529,609,1680,0,6.50064
3,38,79,-0.350427,0.14717,12.3,17.282609,83.387622,15.77,519,614,1615,0,6.334591
4,19,8,0.407407,0.064904,8.9,10.947368,78.193146,14.01,265,321,765,0,5.963942


In [35]:
answer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 13 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Positive Score                    100 non-null    int64  
 1   Negative Score                    100 non-null    int64  
 2   Polarity Score                    100 non-null    float64
 3   Subjectivity Score                100 non-null    float64
 4   Average Sentence Length           100 non-null    float64
 5   Average no of words per sentence  100 non-null    float64
 6   Percentage of Complex Words       100 non-null    float64
 7   Fog Index                         100 non-null    float64
 8   Complex Word Count                100 non-null    int64  
 9   Word Count                        100 non-null    int64  
 10  Syllables per Word                100 non-null    int64  
 11  Personal Pronouns                 100 non-null    int64  
 12  Average W

In [36]:
#Creating a final output dataframe and excel file
final_df= pd.concat([input_urls,answer], axis=1)


In [37]:
final_df.head()

Unnamed: 0,URL_ID,URL,Positive Score,Negative Score,Polarity Score,Subjectivity Score,Average Sentence Length,Average no of words per sentence,Percentage of Complex Words,Fog Index,Complex Word Count,Word Count,Syllables per Word,Personal Pronouns,Average Word Length
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,50,13,0.587302,0.050971,7.6,11.660377,71.540179,5.51,643,896,1938,1,4.980583
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,57,32,0.280899,0.086914,9.4,12.641975,71.54047,13.06,563,766,1885,2,5.658203
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,36,23,0.220339,0.075544,11.2,13.946429,84.56486,13.84,529,609,1680,0,6.50064
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,38,79,-0.350427,0.14717,12.3,17.282609,83.387622,15.77,519,614,1615,0,6.334591
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,19,8,0.407407,0.064904,8.9,10.947368,78.193146,14.01,265,321,765,0,5.963942


In [39]:
pip install pandas openpyxl




In [42]:
output= final_df.to_excel('output.xlsx',index=False)