In [28]:
# Data Exctraction.....

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Read input Excel file
input_file = "Input.xlsx"
df_input = pd.read_excel(input_file)

# Function to extract article text from URL
def extract_article_text(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        
        # Find article title and text
        title = soup.find("tittle").text.strip()
        article_text = ""
        for paragraph in soup.find_all("p"):
            article_text += paragraph.text.strip() + "\n"

        return title, article_text
    except Exception as e:
        print(f"Error extracting article from {url}: {e}")
        return None, None

# Iterate over each row in the DataFrame
for index, row in df_input.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    
    # Extract article text
    title, article_text = extract_article_text(url)
    
    if title and article_text:
        # Save extracted text to a text file
        with open(f"{url_id}.txt", "w", encoding="utf-8") as file:
            file.write(f"Title: {title}\n\n")
            file.write(article_text)
        print(f"Article extracted and saved for URL_ID: {url_id}")
    else:
        print(f"Skipping URL_ID: {url_id}")

print("Extraction complete.")


Error extracting article from https://insights.blackcoffer.com/rising-it-cities-and-its-impact-on-the-economy-environment-infrastructure-and-city-life-by-the-year-2040-2/: 'NoneType' object has no attribute 'text'
Skipping URL_ID: blackassign0001
Error extracting article from https://insights.blackcoffer.com/rising-it-cities-and-their-impact-on-the-economy-environment-infrastructure-and-city-life-in-future/: 'NoneType' object has no attribute 'text'
Skipping URL_ID: blackassign0002
Error extracting article from https://insights.blackcoffer.com/internet-demands-evolution-communication-impact-and-2035s-alternative-pathways/: 'NoneType' object has no attribute 'text'
Skipping URL_ID: blackassign0003
Error extracting article from https://insights.blackcoffer.com/rise-of-cybercrime-and-its-effect-in-upcoming-future/: 'NoneType' object has no attribute 'text'
Skipping URL_ID: blackassign0004
Error extracting article from https://insights.blackcoffer.com/ott-platform-and-its-impact-on-the-ent

In [29]:
# Data Analysis....
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
nltk.download('punkt')
import nltk
nltk.download('stopwords')
import nltk
nltk.download('vader_lexicon')

# Read output structure Excel file
output_file = "Output Data Structure.xlsx"
df_output = pd.read_excel(output_file)

# Function to compute variables from article text
def compute_variables(article_text):
    # Tokenize the article text
    tokens = word_tokenize(article_text.lower())
    
    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    filtered_words = [word for word in tokens if word not in stop_words and word.isalpha()]
    
    # Compute word frequency
    freq_dist = FreqDist(filtered_words)
    top_words = freq_dist.most_common(10)
    
    # Compute sentiment
    sia = SentimentIntensityAnalyzer()
    sentiment_score = sia.polarity_scores(article_text)
    sentiment_label = "Positive" if sentiment_score["compound"] > 0 else "Negative" if sentiment_score["compound"] < 0 else "Neutral"
    
    # Return computed variables
    return len(filtered_words), sentiment_label

# Iterate over each row in the DataFrame
for index, row in df_output.iterrows():
    url_id = row['URL_ID']
    
    # Read article text from saved text file
    try:
        with open(f"{url_id}.txt", "r", encoding="utf-8") as file:
            article_text = file.read()
        
        # Compute variables
        word_count, sentiment = compute_variables(article_text)
        
        # Update DataFrame with computed variables
        df_output.at[index, 'Word_Count'] = word_count
        df_output.at[index, 'Sentiment'] = sentiment
        
        print(f"Variables computed for URL_ID: {url_id}")
    except FileNotFoundError:
        print(f"Text file not found for URL_ID: {url_id}")

# Save computed variables to a new Excel file
output_variables_file = "input.xlsx"
df_output.to_excel(output_variables_file, index=False)

print("DONE...")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abhis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abhis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\abhis\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
  df_output.at[index, 'Sentiment'] = sentiment


Variables computed for URL_ID: blackassign0001
Variables computed for URL_ID: blackassign0002
Variables computed for URL_ID: blackassign0003
Variables computed for URL_ID: blackassign0004
Variables computed for URL_ID: blackassign0005
Variables computed for URL_ID: blackassign0006
Variables computed for URL_ID: blackassign0007
Variables computed for URL_ID: blackassign0008
Variables computed for URL_ID: blackassign0009
Variables computed for URL_ID: blackassign0010
Variables computed for URL_ID: blackassign0011
Variables computed for URL_ID: blackassign0012
Variables computed for URL_ID: blackassign0013
Variables computed for URL_ID: blackassign0014
Variables computed for URL_ID: blackassign0015
Variables computed for URL_ID: blackassign0016
Variables computed for URL_ID: blackassign0017
Variables computed for URL_ID: blackassign0018
Variables computed for URL_ID: blackassign0019
Variables computed for URL_ID: blackassign0020
Variables computed for URL_ID: blackassign0021
Variables com

In [30]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abhis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abhis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\abhis\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [31]:
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.sentiment import SentimentIntensityAnalyzer
from textstat import flesch_reading_ease, gunning_fog, syllable_count

# Read output structure Excel file
output_file = "Output Data Structure.xlsx"
df_output = pd.read_excel(output_file)

# Function to compute variables from article text
def compute_variables(article_text):
    # Tokenize the article text
    tokens = word_tokenize(article_text.lower())
    
    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    filtered_words = [word for word in tokens if word not in stop_words and word.isalpha()]
    
    # Compute word count
    word_count = len(filtered_words)
    
    # Compute average word length
    avg_word_length = sum(len(word) for word in filtered_words) / word_count
    
    # Compute sentiment
    sia = SentimentIntensityAnalyzer()
    sentiment_score = sia.polarity_scores(article_text)
    sentiment_label = "Positive" if sentiment_score["compound"] > 0 else "Negative" if sentiment_score["compound"] < 0 else "Neutral"
    
    # Compute POSITIVE SCORE, NEGATIVE SCORE, POLARITY SCORE, SUBJECTIVITY SCORE
    positive_score = sentiment_score["pos"]
    negative_score = sentiment_score["neg"]
    polarity_score = sentiment_score["compound"]
    subjectivity_score = sentiment_score["compound"] + sentiment_score["neu"]
    
    # Compute average sentence length
    sentences = sent_tokenize(article_text)
    avg_sentence_length = sum(len(word_tokenize(sent)) for sent in sentences) / len(sentences)
    
    # Compute percentage of complex words
    complex_words = [word for word in filtered_words if syllable_count(word) >= 3]
    percentage_complex_words = (len(complex_words) / len(filtered_words)) * 100
    
    # Compute FOG INDEX
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    
    # Compute average number of words per sentence
    avg_words_per_sentence = len(filtered_words) / len(sentences)
    
    # Compute complex word count
    complex_word_count = len(complex_words)
    
    # Compute syllables per word
    syllables_per_word = sum(syllable_count(word) for word in filtered_words) / len(filtered_words)
    
    # Count personal pronouns
    personal_pronouns = ["I", "me", "my", "mine", "myself", "we", "us", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves"]
    personal_pronoun_count = sum(1 for word in filtered_words if word in personal_pronouns)
    
    # Return computed variables as a dictionary
    return {
        'Word_Count': word_count,
        'Avg_Word_Length': avg_word_length,
        'Positive_Score': positive_score,
        'Negative_Score': negative_score,
        'Polarity_Score': polarity_score,
        'Subjectivity_Score': subjectivity_score,
        'Avg_Sentence_Length': avg_sentence_length,
        'Percentage_of_Complex_Words': percentage_complex_words,
        'FOG_Index': fog_index,
        'Avg_Number_of_Words_Per_Sentence': avg_words_per_sentence,
        'Complex_Word_Count': complex_word_count,
        'Syllable_Per_Word': syllables_per_word,
        'Personal_Pronouns': personal_pronoun_count
    }

# Iterate over each row in the DataFrame
for index, row in df_output.iterrows():
    url_id = row['URL_ID']
    
    # Read article text from saved text file
    try:
        with open(f"{url_id}.txt", "r", encoding="utf-8") as file:
            article_text = file.read()
        
        # Compute variables
        variables = compute_variables(article_text)
        
        # Update DataFrame with computed variables
        for key, value in variables.items():
            df_output.at[index, key] = value
        
        print(f"Variables computed for URL_ID: {url_id}")
    except FileNotFoundError:
        print(f"Text file not found for URL_ID: {url_id}")

# Save computed variables to a new Excel file
output_variables_file = "input.xlsx"
df_output.to_excel(output_variables_file, index=False)

print("Data analysis complete.")


Variables computed for URL_ID: blackassign0001
Variables computed for URL_ID: blackassign0002
Variables computed for URL_ID: blackassign0003
Variables computed for URL_ID: blackassign0004
Variables computed for URL_ID: blackassign0005
Variables computed for URL_ID: blackassign0006
Variables computed for URL_ID: blackassign0007
Variables computed for URL_ID: blackassign0008
Variables computed for URL_ID: blackassign0009
Variables computed for URL_ID: blackassign0010
Variables computed for URL_ID: blackassign0011
Variables computed for URL_ID: blackassign0012
Variables computed for URL_ID: blackassign0013
Variables computed for URL_ID: blackassign0014
Variables computed for URL_ID: blackassign0015
Variables computed for URL_ID: blackassign0016
Variables computed for URL_ID: blackassign0017
Variables computed for URL_ID: blackassign0018
Variables computed for URL_ID: blackassign0019
Variables computed for URL_ID: blackassign0020
Variables computed for URL_ID: blackassign0021
Variables com

In [32]:
# complete assignment