In [1]:
# Mount Google Drive at /content/drive; the input workbook is read from this
# mount and the results workbook is written back to it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
import re

# Function to extract article title and text
def extract_article_content(url, timeout=30):
    """Download a web page and return its title and paragraph text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block forever on an unresponsive host.

    Returns:
        A ``(title, article_text)`` tuple. ``title`` is the text of the
        page's ``<title>`` tag, or ``'No Title Found'`` when the tag is
        absent. ``article_text`` is the text of every ``<p>`` element joined
        by blank lines. If anything goes wrong (network failure, HTTP error
        status, parsing problem) the error is printed and ``(None, None)`` is
        returned instead of raising.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Treat 4xx/5xx responses as failures so that "page not found" style
        # error pages are not stored and analysed as if they were articles.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Look the <title> tag up once and reuse the result.
        title_tag = soup.find('title')
        title = title_tag.get_text(strip=True) if title_tag is not None else 'No Title Found'

        # Collect the text of every paragraph and separate them by blank lines.
        paragraphs = [paragraph.get_text(strip=True) for paragraph in soup.find_all('p')]
        article_text = '\n\n'.join(paragraphs)

        return title, article_text

    except Exception as e:
        # Best effort by design: report the problem and let the caller skip this URL.
        print(f"Error extracting content from {url}: {e}")
        return None, None

# Read the list of URLs from the input workbook
excel_file = '/content/drive/MyDrive/Input.xlsx'  # Replace with your actual file path
df = pd.read_excel(excel_file)

# Make sure the folder that receives the article text files exists
output_dir = 'extracted_articles'
os.makedirs(output_dir, exist_ok=True)

# Fetch every listed article and store it as <URL_ID>.txt
for _, record in df.iterrows():
    url_id, url = record['URL_ID'], record['URL']
    try:
        title, content = extract_article_content(url)
        if not content:
            print(f"Skipping {url_id}: No content extracted.")
            continue

        # Keep only word characters, whitespace and hyphens in the file name,
        # then swap the remaining spaces for underscores.
        safe_name = re.sub(r'[^\w\s-]', '', str(url_id).strip())
        filename = f"{safe_name.replace(' ', '_')}.txt"
        target_path = os.path.join(output_dir, filename)

        # Write the title line followed by the article body.
        with open(target_path, 'w', encoding='utf-8') as out_file:
            out_file.write(f"Title: {title}\n\n{content}")

        print(f"Article {url_id} saved successfully.")
    except Exception as e:
        print(f"Failed to extract or save article {url_id}: {e}")


Article blackassign0001 saved successfully.
Article blackassign0002 saved successfully.
Article blackassign0003 saved successfully.
Article blackassign0004 saved successfully.
Article blackassign0005 saved successfully.
Article blackassign0006 saved successfully.
Article blackassign0007 saved successfully.
Article blackassign0008 saved successfully.
Article blackassign0009 saved successfully.
Article blackassign0010 saved successfully.
Article blackassign0011 saved successfully.
Article blackassign0012 saved successfully.
Article blackassign0013 saved successfully.
Article blackassign0014 saved successfully.
Article blackassign0015 saved successfully.
Article blackassign0016 saved successfully.
Article blackassign0017 saved successfully.
Article blackassign0018 saved successfully.
Article blackassign0019 saved successfully.
Article blackassign0020 saved successfully.
Article blackassign0021 saved successfully.
Article blackassign0022 saved successfully.
Article blackassign0023 saved su

In [8]:
# Pinned to the release this notebook was run with; %pip installs into the
# environment of the running kernel.
%pip install textstat==0.7.3

Collecting textstat
  Downloading textstat-0.7.3-py3-none-any.whl (105 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyphen (from textstat)
  Downloading pyphen-0.15.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.15.0 textstat-0.7.3


In [13]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from textblob import TextBlob
from nltk.tag import pos_tag
from textstat import flesch_reading_ease, syllable_count, lexicon_count
# Function to clean text using stop words list
def clean_text(text):
    """Lower-case ``text`` and drop stop words and non-alphanumeric tokens.

    The lower-cased text is split with NLTK's ``word_tokenize``. A token is
    kept only if it is purely alphanumeric and is not in NLTK's English
    stop-word list. The surviving tokens are returned joined by single spaces.
    """
    english_stops = set(stopwords.words('english'))
    kept_tokens = []
    for token in word_tokenize(text.lower()):
        if not token.isalnum():
            continue  # punctuation or mixed tokens such as "n't"
        if token in english_stops:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)
# Function to calculate derived variables
def calculate_derived_variables(text):
    """Compute sentiment and readability metrics for ``text``.

    Args:
        text: The text to analyse.

    Returns:
        A 10-tuple, in this order:

        * positive_score - number of TextBlob sentences with polarity > 0
        * negative_score - number of TextBlob sentences with polarity < 0
        * polarity_score - TextBlob polarity of the whole text
        * subjectivity_score - TextBlob subjectivity of the whole text
        * avg_sentence_length - mean number of NLTK tokens per NLTK sentence
        * complex_word_count - number of tokens with more than two syllables
        * word_count - ``textstat.lexicon_count(text)``
        * syllable_count_per_word - total token syllables / word_count
        * personal_pronouns - number of tokens tagged ``'PRP'``
        * avg_word_length - total token characters / word_count

        The three ratio metrics are ``0.0`` when the text has no sentences
        or no words, instead of raising ``ZeroDivisionError``.
    """
    blob = TextBlob(text)
    sentence_polarities = [sentence.sentiment.polarity for sentence in blob.sentences]
    positive_score = sum(1 for polarity in sentence_polarities if polarity > 0)
    negative_score = sum(1 for polarity in sentence_polarities if polarity < 0)
    overall_sentiment = blob.sentiment
    polarity_score = overall_sentiment.polarity
    subjectivity_score = overall_sentiment.subjectivity

    # Tokenise once and reuse; the original re-tokenised the text for every metric.
    sentences = sent_tokenize(text)
    tokens = word_tokenize(text)
    word_count = lexicon_count(text)

    if sentences:
        avg_sentence_length = sum(len(word_tokenize(sentence)) for sentence in sentences) / len(sentences)
    else:
        avg_sentence_length = 0.0

    # A complex word has more than two syllables. The previous check used
    # lexicon_count(word), which counts words and therefore never exceeds 1
    # for a single token, so the count was always 0.
    syllables_per_token = [syllable_count(token) for token in tokens]
    complex_word_count = sum(1 for syllables in syllables_per_token if syllables > 2)

    personal_pronouns = sum(1 for _, pos in pos_tag(tokens) if pos == 'PRP')

    # NOTE(review): the numerators below run over every token (punctuation
    # tokens included when present) while word_count comes from
    # lexicon_count - confirm this is the intended definition.
    if word_count:
        syllable_count_per_word = sum(syllables_per_token) / word_count
        avg_word_length = sum(len(token) for token in tokens) / word_count
    else:
        syllable_count_per_word = 0.0
        avg_word_length = 0.0

    return (positive_score, negative_score, polarity_score, subjectivity_score,
            avg_sentence_length, complex_word_count, word_count,
            syllable_count_per_word, personal_pronouns, avg_word_length)

# Function to compute Fog Index
def fog_index(avg_sentence_length, percentage_complex_words):
    """Return the Fog Index: 0.4 * (average sentence length + percentage of complex words)."""
    combined = avg_sentence_length + percentage_complex_words
    return 0.4 * combined

In [17]:
import nltk

# Fetch the NLTK resources the analysis helpers rely on: 'punkt' for word and
# sentence tokenisation, the perceptron tagger for part-of-speech tags, and
# 'stopwords' for clean_text. The stop-word corpus was not downloaded before,
# so on a fresh runtime every article failed inside clean_text.
for nltk_resource in ('punkt', 'averaged_perceptron_tagger', 'stopwords'):
    nltk.download(nltk_resource)

# Create a directory to save the articles
output_dir = 'extracted_articles'
os.makedirs(output_dir, exist_ok=True)

# Define output columns
output_columns = [
    'URL_ID', 'URL', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE', 'SUBJECTIVITY SCORE',
    'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX', 'AVG NUMBER OF WORDS PER SENTENCE',
    'COMPLEX WORD COUNT', 'WORD COUNT', 'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH'
]

# One dict per successfully analysed article; turned into a DataFrame at the end.
output_data = []

# Loop through each URL, extract its content and compute the metrics
for index, row in df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    try:
        title, content = extract_article_content(url)
        if content:
            cleaned_content = clean_text(content)

            # Word-level metrics are taken from the cleaned text
            # (lower-cased, stop words and punctuation removed).
            (_, _, polarity_score, subjectivity_score,
             _, complex_word_count, word_count,
             syllable_per_word, _, avg_word_length) = calculate_derived_variables(cleaned_content)

            # Metrics that depend on sentence boundaries or on pronouns must
            # come from the original text: cleaning strips all punctuation
            # (so the whole article becomes a single "sentence") and removes
            # most personal pronouns, which are stop words.
            (positive_score, negative_score, _, _,
             avg_sentence_length, _, _,
             _, personal_pronouns, _) = calculate_derived_variables(content)

            # Guard against an article whose cleaned text contains no words.
            percentage_complex_words = complex_word_count / word_count * 100 if word_count else 0.0
            fog_index_value = fog_index(avg_sentence_length, percentage_complex_words)

            # Save analysis results to output data
            output_data.append({
                'URL_ID': url_id,
                'URL': url,
                'POSITIVE SCORE': positive_score,
                'NEGATIVE SCORE': negative_score,
                'POLARITY SCORE': polarity_score,
                'SUBJECTIVITY SCORE': subjectivity_score,
                'AVG SENTENCE LENGTH': avg_sentence_length,
                'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
                'FOG INDEX': fog_index_value,
                # Same value as AVG SENTENCE LENGTH: both are tokens per sentence.
                'AVG NUMBER OF WORDS PER SENTENCE': avg_sentence_length,
                'COMPLEX WORD COUNT': complex_word_count,
                'WORD COUNT': word_count,
                'SYLLABLE PER WORD': syllable_per_word,
                'PERSONAL PRONOUNS': personal_pronouns,
                'AVG WORD LENGTH': avg_word_length
            })

            print(f"Analysis completed for {url_id}.")

        else:
            print(f"Skipping {url_id}: No content extracted.")
    except Exception as e:
        # Best effort: report the failure and continue with the next article.
        print(f"Failed to process article {url_id}: {e}")

# Save output data to Excel
output_excel_file = '/content/drive/MyDrive/Output Data Structure.xlsx'  # Replace with desired output file path
output_df = pd.DataFrame(output_data, columns=output_columns)
output_df.to_excel(output_excel_file, index=False)

print("Analysis results saved successfully.")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Analysis completed for blackassign0001.
Analysis completed for blackassign0002.
Analysis completed for blackassign0003.
Analysis completed for blackassign0004.
Analysis completed for blackassign0005.
Analysis completed for blackassign0006.
Analysis completed for blackassign0007.
Analysis completed for blackassign0008.
Analysis completed for blackassign0009.
Analysis completed for blackassign0010.
Analysis completed for blackassign0011.
Analysis completed for blackassign0012.
Analysis completed for blackassign0013.
Analysis completed for blackassign0014.
Analysis completed for blackassign0015.
Analysis completed for blackassign0016.
Analysis completed for blackassign0017.
Analysis completed for blackassign0018.
Analysis completed for blackassign0019.
Analysis completed for blackassign0020.
Analysis completed for blackassign0021.
Analysis completed for blackassign0022.
Analysis completed for blackassign0023.
Analysis completed for blackassign0024.
Analysis completed for blackassign0025.
