In [21]:
import os
import pandas as pd
from bs4 import BeautifulSoup
import requests
import nltk
from nltk.corpus import cmudict
nltk.download('cmudict')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package cmudict to
[nltk_data]     C:\Users\DarpanShah\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\DarpanShah\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [37]:
# Function to extract article title and text from URL
def extract_article(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Extracting article title
        title = soup.find('h1').get_text() if soup.find('h1') else ''
        # Extracting article text
        text = '\n'.join([p.get_text() for p in soup.find_all('p')])
        return title, text
    except Exception as e:
        print(f"Error extracting article from {url}: {e}")
        return None, None

# Read URLs from Excel file
input_file = r"C:\Users\DarpanShah\OneDrive - Connected Analytics\Desktop\Darpan\Shruti\Input.xlsx"
df = pd.read_excel(input_file)

# Create empty columns for title and text
df['Article_Title'] = ''
df['Article_Text'] = ''

# Extract article title and text for each URL and save as a text file
for index, row in df.iterrows():
    url = row['URL']
    title, text = extract_article(url)
    if title and text:
        # Save article title and text in DataFrame
        df.at[index, 'Article_Title'] = title
        df.at[index, 'Article_Text'] = text
        print(f"Article extracted from {url} added to DataFrame.")
    else:
        print(f"Failed to extract article from {url}")

# Save updated DataFrame to a new Excel file
output_file = "output_with_article_info.xlsx"
df.to_excel(output_file, index=False)
print(f"Updated DataFrame saved to {output_file}.")

Article extracted from https://insights.blackcoffer.com/rising-it-cities-and-its-impact-on-the-economy-environment-infrastructure-and-city-life-by-the-year-2040-2/ added to DataFrame.
Article extracted from https://insights.blackcoffer.com/rising-it-cities-and-their-impact-on-the-economy-environment-infrastructure-and-city-life-in-future/ added to DataFrame.
Article extracted from https://insights.blackcoffer.com/internet-demands-evolution-communication-impact-and-2035s-alternative-pathways/ added to DataFrame.
Article extracted from https://insights.blackcoffer.com/rise-of-cybercrime-and-its-effect-in-upcoming-future/ added to DataFrame.
Article extracted from https://insights.blackcoffer.com/ott-platform-and-its-impact-on-the-entertainment-industry-in-future/ added to DataFrame.
Article extracted from https://insights.blackcoffer.com/the-rise-of-the-ott-platform-and-its-impact-on-the-entertainment-industry-by-2040/ added to DataFrame.
Article extracted from https://insights.blackcoff

Article extracted from https://insights.blackcoffer.com/management-challenges-for-future-digitalization-of-healthcare-services/ added to DataFrame.
Article extracted from https://insights.blackcoffer.com/are-we-any-closer-to-preventing-a-nuclear-holocaust/ added to DataFrame.
Article extracted from https://insights.blackcoffer.com/will-technology-eliminate-the-need-for-animal-testing-in-drug-development/ added to DataFrame.
Article extracted from https://insights.blackcoffer.com/will-we-ever-understand-the-nature-of-consciousness/ added to DataFrame.
Article extracted from https://insights.blackcoffer.com/will-we-ever-colonize-outer-space/ added to DataFrame.
Article extracted from https://insights.blackcoffer.com/what-is-the-chance-homo-sapiens-will-survive-for-the-next-500-years/ added to DataFrame.
Article extracted from https://insights.blackcoffer.com/why-does-your-business-need-a-chatbot/ added to DataFrame.
Article extracted from https://insights.blackcoffer.com/how-you-lead-a-p

In [38]:
# Read all the stopwords files and combine them into one list
def read_stopwords(base_path, file_paths):
    stopwords = []
    for file_path in file_paths:
        with open(base_path + file_path, 'r') as file:
            stopwords.extend([line.strip() for line in file])
    return stopwords

# Assuming df is your DataFrame and 'text_column' is the name of the column containing the text
column_data = df['Article_Text'].tolist() 

# Define base file path and stopwords text file
base_path = r'C:\Users\DarpanShah\OneDrive - Connected Analytics\Desktop\Darpan\Shruti\StopWords\\'
stopwords_text_files = ['StopWords_Auditor.txt', 'StopWords_Currencies.txt', 'StopWords_DatesandNumbers.txt',
                        'StopWords_Generic.txt', 'StopWords_GenericLong.txt', 'StopWords_Geographic.txt', 'StopWords_Names.txt']

# Read stopwords
stopwords = read_stopwords(base_path, stopwords_text_files)

# Remove stopwords from the column
def remove_stopwords(text, stopwords):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stopwords]
    return ' '.join(filtered_words)

# Create a new column with the filtered text data
df['Filtered_Text'] = [remove_stopwords(sentence, stopwords) for sentence in column_data]

In [4]:
# Step 1: Read positive and negative word lists
def read_word_list(base_path, file_paths):
    words = []
    for file_path in file_paths:
        with open(base_path + file_path, 'r') as file:
            words.extend([line.strip() for line in file])
    return set(words)  # Convert to set for faster lookup

base_path = r'C:\Users\DarpanShah\OneDrive - Connected Analytics\Desktop\Darpan\Shruti\MasterDictionary\\'
positive_words = read_word_list(base_path, ['positive-words.txt'])
negative_words = read_word_list(base_path, ['negative-words.txt'])

# Step 2: Tokenize text data
def tokenize_text(text):
    return text.split()

# Step 3: Calculate positive and negative scores
def calculate_scores(tokens, positive_words, negative_words):
    positive_score = sum(1 for word in tokens if word.lower() in positive_words)
    negative_score = sum(1 for word in tokens if word.lower() in negative_words)
    return positive_score, negative_score

# Assuming df is your DataFrame and 'text_column' is the name of the column containing the text
column_data = df['Filtered_Text'].tolist()

# Function to compute sentiment score for each row and add it as new columns
def add_sentiment_columns(row):
    tokens = tokenize_text(row['Filtered_Text'])
    positive_score, negative_score = calculate_scores(tokens, positive_words, negative_words)
    return pd.Series({'Positive_Score': positive_score, 'negative_score': negative_score})

# Add new columns 'positive_score' and 'negative_score' to the DataFrame
df[['Positive_Score', 'Negative_Score']] = df.apply(add_sentiment_columns, axis=1)

In [5]:
df['Polarity_Score'] = df['Positive_Score'] - df['Negative_Score']
df['Subjective_Score'] = df['Polarity_Score'].abs()

In [6]:
# Function to calculate the average sentence length
def avg_sentence_length(text):
    sentences = nltk.sent_tokenize(text)
    total_length = sum(len(nltk.word_tokenize(sentence)) for sentence in sentences)
    num_sentences = len(sentences)
    if num_sentences > 0:
        return total_length / num_sentences
    else:
        return 0  # Return 0 if there are no sentences (to avoid division by zero)

# Assuming df is your DataFrame and 'text_column' is the name of the column containing the text
df['Avg_Sentence_Length'] = df['Filtered_Text'].apply(avg_sentence_length)

In [7]:
# Load the CMU Pronouncing Dictionary
d = cmudict.dict()

# Function to count the number of syllables in a word
def syllable_count(word):
    if word.lower() in d:
        return [len(list(y for y in x if y[-1].isdigit())) for x in d[word.lower()]][0]
    else:
        return 0  # Return 0 for words not found in the dictionary

# Function to determine if a word is complex based on syllable count
def is_complex(word):
    syllables = syllable_count(word)
    return syllables >= 3  # Define a threshold for the number of syllables to consider a word complex

# Function to calculate the percentage of complex words in a sentence
def percentage_complex_words(sentence):
    words = nltk.word_tokenize(sentence)
    complex_words = sum(1 for word in words if is_complex(word))
    total_words = len(words)
    if total_words > 0:
        return (complex_words / total_words) * 100
    else:
        return 0  # Return 0 if there are no words in the sentence

# Assuming df is your DataFrame and 'text_column' is the name of the column containing the text
df['Percentage_Complex_Words'] = df['Filtered_Text'].apply(percentage_complex_words)


In [8]:
# Load the CMU Pronouncing Dictionary
d = cmudict.dict()

# Function to count the number of syllables in a word
def syllable_count(word):
    if word.lower() in d:
        return [len(list(y for y in x if y[-1].isdigit())) for x in d[word.lower()]][0]
    else:
        return 0  # Return 0 for words not found in the dictionary

# Function to determine if a word is complex based on syllable count
def is_complex(word):
    syllables = syllable_count(word)
    return syllables >= 3  # Define a threshold for the number of syllables to consider a word complex

# Function to calculate the Fog Index
def fog_index(text):
    words = nltk.word_tokenize(text)
    num_words = len(words)
    num_sentences = len(nltk.sent_tokenize(text))
    
    if num_sentences > 0:  # Ensure num_sentences is greater than zero to avoid division by zero
        # Calculate average sentence length
        avg_sentence_length = num_words / num_sentences
        
        # Calculate percentage of complex words
        complex_words = sum(1 for word in words if is_complex(word))
        percentage_complex_words = (complex_words / num_words) * 100
        
        # Calculate Fog Index
        fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
        return fog_index
    else:
        return 0  # Return 0 if there are no sentences

# Assuming df is your DataFrame and 'text_column' is the name of the column containing the text
df['Fog_Index'] = df['Filtered_Text'].apply(fog_index)

In [9]:
def avg_words_per_sentence(text):
    sentences = nltk.sent_tokenize(text)
    num_sentences = len(sentences)
    if num_sentences > 0:
        total_words = sum(len(nltk.word_tokenize(sentence)) for sentence in sentences)
        return total_words / num_sentences
    else:
        return 0  # Return 0 if there are no sentences (to avoid division by zero)

# Assuming df is your DataFrame and 'text_column' is the name of the column containing the text
df['Avg_Words_Per_Sentence'] = df['Filtered_Text'].apply(avg_words_per_sentence)

In [10]:
# Function to count complex words in a text
def count_complex_words(text):
    words = nltk.word_tokenize(text)
    complex_word_count = sum(1 for word in words if is_complex(word))
    return complex_word_count

# Assuming df is your DataFrame and 'text_column' is the name of the column containing the text
df['Complex_Word_Count'] = df['Filtered_Text'].apply(count_complex_words)

In [11]:
def word_count(text):
    words = nltk.word_tokenize(text)
    return len(words)

# Assuming df is your DataFrame and 'text_column' is the name of the column containing the text
df['Word_Count'] = df['Filtered_Text'].apply(word_count)

In [12]:
# Function to calculate the average number of syllables per word
def syllables_per_word(text):
    words = nltk.word_tokenize(text)
    num_words = len(words)
    total_syllables = sum(syllable_count(word) for word in words)
    if num_words > 0:
        return total_syllables / num_words
    else:
        return 0  # Return 0 if there are no words (to avoid division by zero)

# Assuming df is your DataFrame and 'text_column' is the name of the column containing the text
df['Syllables_Per_word'] = df['Filtered_Text'].apply(syllables_per_word)

In [13]:
# Function to count personal pronouns in a text
def count_personal_pronouns(text):
    tokens = nltk.word_tokenize(text)
    tagged_tokens = nltk.pos_tag(tokens)
    personal_pronouns = ['PRP', 'PRP$', 'WP', 'WP$']  # POS tags for personal pronouns
    count = sum(1 for word, pos in tagged_tokens if pos in personal_pronouns)
    return count

# Assuming df is your DataFrame and 'Filtered_Text' is the name of the column containing the text
df['Personal_Pronoun'] = df['Filtered_Text'].apply(count_personal_pronouns)

In [14]:
def avg_word_length(text):
    words = nltk.word_tokenize(text)
    total_length = sum(len(word) for word in words)
    num_words = len(words)
    if num_words > 0:
        return total_length / num_words
    else:
        return 0  # Return 0 if there are no words (to avoid division by zero)

# Assuming df is your DataFrame and 'Filtered_Text' is the name of the column containing the text
df['Avg_Word_Length'] = df['Filtered_Text'].apply(avg_word_length)

In [19]:
df_new = df.drop(['Article_Title', 'Article_Text','Filtered_Text'], axis=1)
df_new.head()

Unnamed: 0,URL_ID,URL,Positive_Score,Negative_Score,Polarity_Score,Subjective_Score,Avg_Sentence_Length,Percentage_Complex_Words,Fog_Index,Avg_Words_Per_Sentence,Complex_Word_Count,Word_Count,Syllables_Per_word,Personal_Pronoun,Avg_Word_Length
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,13,2,11,11,14.074074,26.842105,16.366472,14.074074,102,380,1.694737,1,5.844737
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,54,24,30,30,14.308642,32.700604,18.803698,14.308642,379,1159,1.863676,2,6.077653
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,41,21,20,20,15.883333,40.188877,22.428884,15.883333,383,953,2.053515,2,6.764953
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,37,63,-26,26,17.5,34.708995,20.883598,17.5,328,945,1.859259,1,6.557672
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,24,8,16,16,14.904762,30.830671,18.294173,14.904762,193,626,1.827476,1,6.353035


In [20]:
df_new.to_excel(r'C:\Users\DarpanShah\OneDrive - Connected Analytics\Desktop\Darpan\Shruti\Output_Data_Structure_final.xlsx', index=False)