##### MATERIALS - https://drive.google.com/drive/folders/1ltdsXAS_zaZ3hI-q9eze_QCzHciyYAJY

### DATA EXTRACTION

In [7]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Read URLs from Excel file
input_file = r"C:\Users\User\Downloads\Input.xlsx"
df = pd.read_excel(input_file)

# Function to extract article text from URL
def extract_article_text(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Extract article text
        article_text = ""
        for paragraph in soup.find_all('p'):
            article_text += paragraph.get_text() + "\n"
        return article_text
    except Exception as e:
        print(f"Error extracting article from {url}: {e}")
        return None

# Create a new column to store extracted text
df['Extracted_Text'] = ""

# Iterate over each URL in the DataFrame
for index, row in df.iterrows():
    url = row['URL']

    # Extract article text
    extracted_text = extract_article_text(url)

    if extracted_text:
        # Update the corresponding row in the DataFrame with the extracted text
        df.at[index, 'Extracted_Text'] = extracted_text
        print(f"Extracted text for URL_ID {row['URL_ID']}")
    else:
        print(f"Skipping URL_ID {row['URL_ID']}. Could not extract article.")

# Save the modified DataFrame to Excel
output_file = "Output.xlsx"
df.to_excel(output_file, index=False)

print(f"Extraction completed. Output saved to {output_file}")


Extracted text for URL_ID blackassign0001
Extracted text for URL_ID blackassign0002
Extracted text for URL_ID blackassign0003
Extracted text for URL_ID blackassign0004
Extracted text for URL_ID blackassign0005
Extracted text for URL_ID blackassign0006
Extracted text for URL_ID blackassign0007
Extracted text for URL_ID blackassign0008
Extracted text for URL_ID blackassign0009
Extracted text for URL_ID blackassign0010
Extracted text for URL_ID blackassign0011
Extracted text for URL_ID blackassign0012
Extracted text for URL_ID blackassign0013
Extracted text for URL_ID blackassign0014
Extracted text for URL_ID blackassign0015
Extracted text for URL_ID blackassign0016
Extracted text for URL_ID blackassign0017
Extracted text for URL_ID blackassign0018
Extracted text for URL_ID blackassign0019
Extracted text for URL_ID blackassign0020
Extracted text for URL_ID blackassign0021
Extracted text for URL_ID blackassign0022
Extracted text for URL_ID blackassign0023
Extracted text for URL_ID blackass

### DATA PREPROCESSING

In [16]:
import pandas as pd
import numpy as np
data=pd.read_excel(r"C:\Users\User\Downloads\Output.xlsx")
data['Extracted_Text'][0]

'Grafana Dashboard to visualize and analyze sensors’ data\nMVP for a software that analyses content from audio (Pharma-based)\nData Engineering and Management tool (Airbyte) with custom data connectors to manage CRM database\nText Summarizing Tool to scrape and summarize pubmed medical papers\xa0\nMethodology for ETL Discovery Tool using LLMA, OpenAI, Langchain\nMethodology for database discovery tool using openai, LLMA, Langchain\nChatbot using VoiceFlow\nHow To Secure (SSL) Nginx with Let’s Encrypt on Ubuntu (Cloud VM, GCP, AWS, Azure, Linode) and Add Domain\nRising IT cities and its impact on the economy, environment, infrastructure, and city life by the year 2040.\nRising IT Cities and Their Impact on the Economy, Environment, Infrastructure, and City Life in Future\nInternet Demand’s Evolution, Communication Impact, and 2035’s Alternative Pathways\nRise of Cybercrime and its Effect in upcoming Future\nAI/ML and Predictive Modeling\nSolution for Contact Centre Problems\nHow to Setu

In [17]:
data.head()

Unnamed: 0,URL_ID,URL,Extracted_Text
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,Grafana Dashboard to visualize and analyze sen...
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,Grafana Dashboard to visualize and analyze sen...
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,Grafana Dashboard to visualize and analyze sen...
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,Grafana Dashboard to visualize and analyze sen...
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,Grafana Dashboard to visualize and analyze sen...


In [20]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string

# Download NLTK resources (stop words)
nltk.download('stopwords')

# Load your data (replace "input_file.xlsx" with your actual file path)
data = pd.read_excel(r"C:\Users\User\Downloads\Output.xlsx")

# Function to clean text
def clean_text(text):
    # Remove special characters and symbols
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove digits
    text = text.translate(str.maketrans('', '', string.digits))
    # Convert text to lowercase
    text = text.lower()
    # Split text into words
    words = text.split()
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word.lower() not in stop_words]
    # Remove duplicate words
    words = list(set(words))
    # Join words back into text
    cleaned_text = ' '.join(words)
    return cleaned_text

# Apply text cleaning function to the 'Extracted_Text' column
data['Extracted_Text'] = data['Extracted_Text'].apply(clean_text)

# Display the cleaned text
data.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,URL_ID,URL,Extracted_Text
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,hub content brand modeling ie aiml specialist ...
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,taken impossible aiml skill andor favorable in...
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,aiml robust openai scrape vulnerabilities capa...
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,elections reduce aiml generalised behaviour fo...
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,players decadesold aiml remain enrich robust m...


In [21]:
data['Extracted_Text'].head()

0    hub content brand modeling ie aiml specialist ...
1    taken impossible aiml skill andor favorable in...
2    aiml robust openai scrape vulnerabilities capa...
3    elections reduce aiml generalised behaviour fo...
4    players decadesold aiml remain enrich robust m...
Name: Extracted_Text, dtype: object

## DATA ANALYSIS

In [33]:
import os
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

# Download NLTK resources (CMU Pronouncing Dictionary)
nltk.download('punkt')
nltk.download('stopwords')

# Function to load positive and negative words from Master Dictionary
def load_master_dictionary(directory):
    positive_words = set()
    negative_words = set()

    # Load positive words
    with open(os.path.join(directory, "positive-words.txt"), "r") as file:
        for line in file:
            positive_words.add(line.strip())

    # Load negative words
    with open(os.path.join(directory, "negative-words.txt"), "r") as file:
        for line in file:
            negative_words.add(line.strip())

    return positive_words, negative_words

# Function to calculate positive and negative scores for a given text
def calculate_scores(text, positive_words, negative_words):
    # Tokenize text into words
    words = word_tokenize(text.lower())

    # Filter out stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Calculate positive and negative scores
    positive_score = sum(1 for word in words if word in positive_words)
    negative_score = sum(1 for word in words if word in negative_words) * -1

    return positive_score, negative_score

# Function to calculate Polarity Score
def calculate_polarity_score(pos_score, neg_score):
    return (pos_score - neg_score) / ((pos_score + neg_score) + 0.000001)

# Function to calculate Subjectivity Score
def calculate_subjectivity_score(pos_score, neg_score, total_words):
    return (pos_score + neg_score) / (total_words + 0.000001)

# Function to analyze readability and calculate additional derived variables
def analyze_readability(text):
    # Tokenize text into sentences
    sentences = sent_tokenize(text)
    num_sentences = len(sentences)

    # Tokenize text into words
    words = word_tokenize(text.lower())

    # Filter out stop words and punctuation
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words and word.isalnum()]

    # Calculate derived variables
    word_count = len(words)
    avg_sentence_length = word_count / num_sentences
    complex_word_count = sum(1 for word in words if count_syllables(word) > 2)
    percentage_complex_words = (complex_word_count / word_count) * 100
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    avg_words_per_sentence = word_count / num_sentences
    syllables_per_word = sum(count_syllables(word) for word in words) / word_count
    personal_pronouns = sum(1 for word in words if word.lower() in ['i', 'me', 'my', 'mine', 'myself', 'we', 'us', 'our', 'ours', 'ourselves'])
    avg_word_length = sum(len(word) for word in words) / word_count

    return avg_sentence_length, percentage_complex_words, fog_index, avg_words_per_sentence, syllables_per_word, personal_pronouns, avg_word_length

# Function to count syllables in a word
def count_syllables(word):
    # Implementation of count_syllables function (using CMU Pronouncing Dictionary or other method)
    # This function should return the number of syllables in the word
    # For demonstration purposes, let's assume a simple implementation using vowel counting
    vowels = 'aeiouAEIOU'
    syllable_count = 0
    previous_char_was_vowel = False
    
    for char in word:
        if char in vowels and not previous_char_was_vowel:
            syllable_count += 1
            previous_char_was_vowel = True
        elif char not in vowels:
            previous_char_was_vowel = False
            
    # Handle exceptions like words ending with "es", "ed"
    if word.endswith(('es', 'ed')):
        syllable_count -= 1
    
    # Ensure syllable count is at least 1
    return max(syllable_count, 1)


# Load positive and negative words from Master Dictionary
    master_dictionary_dir = "C:\\Users\\User\\Downloads\\MasterDictionary"  # Replace with the correct directory path
    positive_words, negative_words = load_master_dictionary(master_dictionary_dir)

# Load the extracted texts (assuming you have a dataframe named 'data')
extracted_texts = data['Extracted_Text']  # Assuming 'data' is loaded with your text data

# Calculate positive and negative scores, and additional derived variables for each extracted text
positive_scores = []
negative_scores = []
avg_sentence_lengths = []
percentage_complex_words_list = []
fog_indices = []
avg_words_per_sentence_list = []
syllables_per_word_list = []
personal_pronouns_list = []
avg_word_lengths = []

for text in extracted_texts:
    pos_score, neg_score = calculate_scores(text, positive_words, negative_words)
    positive_scores.append(pos_score)
    negative_scores.append(neg_score)
    avg_sentence_length, percentage_complex_words, fog_index, avg_words_per_sentence, syllables_per_word, personal_pronouns, avg_word_length = analyze_readability(text)
    avg_sentence_lengths.append(avg_sentence_length)
    percentage_complex_words_list.append(percentage_complex_words)
    fog_indices.append(fog_index)
    avg_words_per_sentence_list.append(avg_words_per_sentence)
    syllables_per_word_list.append(syllables_per_word)
    personal_pronouns_list.append(personal_pronouns)
    avg_word_lengths.append(avg_word_length)

# Add calculated derived variables to the dataframe
data['Positive_Score'] = positive_scores
data['Negative_Score'] = negative_scores
data['Polarity_Score'] = [calculate_polarity_score(pos, neg) for pos, neg in zip(positive_scores, negative_scores)]
data['Subjectivity_Score'] = [calculate_subjectivity_score(pos, neg, len(word_tokenize(text.lower()))) for pos, neg, text in zip(positive_scores, negative_scores, extracted_texts)]
data['Avg_Sentence_Length'] = avg_sentence_lengths
data['Percentage_of_Complex_Words'] = percentage_complex_words_list
data['FOG_Index'] = fog_indices
data['Avg_Words_Per_Sentence'] = avg_words_per_sentence_list
data['Syllables_Per_Word'] = syllables_per_word_list
data['Personal_Pronouns'] = personal_pronouns_list
data['Avg_Word_Length'] = avg_word_lengths

data.head()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,URL_ID,URL,Extracted_Text,Positive_Score,Negative_Score,Polarity_Score,Subjectivity_Score,Avg_Sentence_Length,Percentage_of_Complex_Words,FOG_Index,Avg_Words_Per_Sentence,Syllables_Per_Word,Personal_Pronouns,Avg_Word_Length
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,hub content brand modeling ie aiml specialist ...,11,-3,1.75,0.037037,208.0,34.615385,97.046154,208.0,2.245192,1,6.701923
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,taken impossible aiml skill andor favorable in...,46,-25,3.380952,0.032659,621.0,40.740741,264.696296,621.0,2.400966,1,7.450886
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,aiml robust openai scrape vulnerabilities capa...,33,-19,3.714285,0.029979,452.0,48.672566,200.269027,452.0,2.581858,1,7.809735
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,elections reduce aiml generalised behaviour fo...,33,-64,-3.129032,-0.058491,514.0,43.579767,223.031907,514.0,2.503891,1,7.776265
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,players decadesold aiml remain enrich robust m...,25,-10,2.333333,0.037879,381.0,42.257218,169.302887,381.0,2.433071,1,7.388451


In [36]:
data.drop(columns=['Extracted_Text'], inplace=True)

In [37]:
data.head()

Unnamed: 0,URL_ID,URL,Positive_Score,Negative_Score,Polarity_Score,Subjectivity_Score,Avg_Sentence_Length,Percentage_of_Complex_Words,FOG_Index,Avg_Words_Per_Sentence,Syllables_Per_Word,Personal_Pronouns,Avg_Word_Length
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,11,-3,1.75,0.037037,208.0,34.615385,97.046154,208.0,2.245192,1,6.701923
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,46,-25,3.380952,0.032659,621.0,40.740741,264.696296,621.0,2.400966,1,7.450886
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,33,-19,3.714285,0.029979,452.0,48.672566,200.269027,452.0,2.581858,1,7.809735
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,33,-64,-3.129032,-0.058491,514.0,43.579767,223.031907,514.0,2.503891,1,7.776265
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,25,-10,2.333333,0.037879,381.0,42.257218,169.302887,381.0,2.433071,1,7.388451


In [39]:
output_file = "Output Data Structure.xlsx"
data.to_excel(output_file, index=False)