In [1]:
# `!cd` runs in a temporary subshell and leaves the kernel's working directory
# unchanged; the %cd magic changes it for the rest of the notebook session.
%cd /content/drive/MyDrive/20211030_Test_Assignment

In [9]:
# %pip installs into the environment of the running kernel (a plain `!pip` may
# resolve to a different interpreter). Pinned to the version this notebook was
# last run with so a re-run reproduces the same syllable estimates.
%pip install syllables==1.0.7

Collecting syllables
  Downloading syllables-1.0.7-py3-none-any.whl (15 kB)
Collecting cmudict<2.0.0,>=1.0.11 (from syllables)
  Downloading cmudict-1.0.13-py3-none-any.whl (939 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 939.3/939.3 kB 7.9 MB/s eta 0:00:00
Collecting importlib-metadata<6.0.0,>=5.1.0 (from syllables)
  Downloading importlib_metadata-5.2.0-py3-none-any.whl (21 kB)
Collecting importlib-resources<6.0.0,>=5.10.1 (from cmudict<2.0.0,>=1.0.11->syllables)
  Downloading importlib_resources-5.13.0-py3-none-any.whl (32 kB)
Installing collected packages: importlib-resources, importlib-metadata, cmudict, syllables
  Attempting uninstall: importlib-resources
    Found existing installation: importlib-resources 6.0.0
    Uninstalling importlib-resources-6.0.0:
      Successfully uninstalled importlib-resources-6.0.0
  Attempting uninstall: importlib-metadata
    Found existing installation: importlib-metadata 4.6.4
    Uninstalling 

In [14]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import nltk
from textstat import text_standard
from nltk.tokenize import word_tokenize
from textblob import TextBlob
import syllables

# Fetch NLTK's 'punkt' tokenizer data, which the sent_tokenize / word_tokenize
# calls in compute_text_metrics rely on. When the package is already present
# nltk.download() just reports it as up-to-date, so re-running is harmless.
nltk.download('punkt')

def extract_article_text(url, timeout=30):
    """Download a web page and pull out its headline and paragraph text.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server, passed to ``requests.get``.
            Without a timeout a single stalled server would hang the whole
            batch indefinitely.

    Returns:
        A ``(title, text)`` tuple where ``title`` is the stripped text of the
        first ``<h1>`` element and ``text`` is the stripped text of every
        ``<p>`` element joined with newlines.
        ``(None, None)`` when the page cannot be fetched (including HTTP
        error statuses such as 404), has no ``<h1>``, or cannot be parsed.
        In that case the failure is printed instead of raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Reject 4xx/5xx responses: otherwise the site's error page would be
        # scraped and scored as if it were the article.
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        heading = soup.find('h1')
        if heading is None:
            # Give a readable reason instead of the AttributeError that
            # `None.text` would produce.
            raise ValueError("page has no <h1> element")

        article_title = heading.text.strip()
        article_text = '\n'.join(p.text.strip() for p in soup.find_all('p'))
        return article_title, article_text
    except Exception as e:
        # Deliberately broad: this is a best-effort scraper and one bad URL
        # must not abort the remaining articles.
        print(f"Error extracting article from URL: {url}")
        print(e)
        return None, None

def syllable_count(word):
    """Return the syllable estimate for ``word`` from ``syllables.estimate``."""
    estimated = syllables.estimate(word)
    return estimated

# Pronouns counted for the 'PERSONAL PRONOUNS' metric (matched case-insensitively).
_PERSONAL_PRONOUNS = frozenset(['i', 'me', 'my', 'mine', 'we', 'us', 'our', 'ours'])


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def compute_text_metrics(article_text):
    """Compute sentiment and readability metrics for one article.

    Args:
        article_text: Plain text of the article.

    Returns:
        dict with the keys 'POSITIVE SCORE', 'NEGATIVE SCORE',
        'POLARITY SCORE', 'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH',
        'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX',
        'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT',
        'WORD COUNT', 'SYLLABLE PER WORD', 'PERSONAL PRONOUNS' and
        'AVG WORD LENGTH'. Text with no words or sentences yields 0 for
        every count and ratio instead of raising ZeroDivisionError.
    """
    sentences = nltk.sent_tokenize(article_text)
    # word_tokenize emits punctuation marks as separate tokens; keep only
    # tokens containing a letter or digit so punctuation is not counted as
    # words (and does not contribute syllables).
    words = [
        token for token in word_tokenize(article_text)
        if any(ch.isalnum() for ch in token)
    ]
    num_sentences = len(sentences)
    num_words = len(words)

    # Estimate syllables once per word and reuse for both totals.
    syllable_counts = [syllable_count(word) for word in words]
    num_syllables = sum(syllable_counts)
    # A "complex" word is one with three or more syllables.
    num_complex_words = sum(1 for count in syllable_counts if count >= 3)

    avg_sentence_length = _safe_ratio(num_words, num_sentences)
    avg_word_length = _safe_ratio(sum(len(word) for word in words), num_words)
    percent_complex_words = _safe_ratio(num_complex_words, num_words) * 100

    # Gunning Fog index: 0.4 * (average sentence length + % complex words).
    # (textstat.text_standard is a consensus grade level across several
    # formulas, not the Fog index, so it is not used for this column.)
    fog_index = 0.4 * (avg_sentence_length + percent_complex_words)

    # The all-caps token "US" is treated as the country abbreviation rather
    # than the pronoun "us", so it is excluded before lower-casing.
    personal_pronouns = sum(
        1 for word in words
        if word != 'US' and word.lower() in _PERSONAL_PRONOUNS
    )

    # Sentiment analysis with TextBlob: polarity and subjectivity of the text.
    sentiment = TextBlob(article_text).sentiment
    sentiment_score = sentiment.polarity
    subjectivity_score = sentiment.subjectivity

    return {
        # Positive and negative scores are the two halves of the polarity:
        # one of them is always 0.
        'POSITIVE SCORE': sentiment_score if sentiment_score > 0 else 0,
        'NEGATIVE SCORE': abs(sentiment_score) if sentiment_score < 0 else 0,
        'POLARITY SCORE': sentiment_score,
        'SUBJECTIVITY SCORE': subjectivity_score,
        'AVG SENTENCE LENGTH': avg_sentence_length,
        'PERCENTAGE OF COMPLEX WORDS': percent_complex_words,
        'FOG INDEX': fog_index,
        'AVG NUMBER OF WORDS PER SENTENCE': avg_sentence_length,
        'COMPLEX WORD COUNT': num_complex_words,
        'WORD COUNT': num_words,
        'SYLLABLE PER WORD': _safe_ratio(num_syllables, num_words),
        'PERSONAL PRONOUNS': personal_pronouns,
        'AVG WORD LENGTH': avg_word_length
    }

def process_articles(input_file, output_file):
    """Score every article listed in an Excel sheet and save the results.

    Reads ``input_file`` with pandas, and for each row takes its 'URL_ID' and
    'URL' values, scrapes the URL with ``extract_article_text`` and computes
    metrics with ``compute_text_metrics``. Rows for which no title or no text
    was obtained are left out. The collected rows are written to
    ``output_file`` as an Excel sheet without the index column.
    """
    articles = pd.read_excel(input_file)
    collected = []

    for _, entry in articles.iterrows():
        url_id, url = entry['URL_ID'], entry['URL']
        title, text = extract_article_text(url)

        # Skip anything that failed to download or came back empty.
        if not (title and text):
            continue

        collected.append({'URL_ID': url_id, 'URL': url, **compute_text_metrics(text)})
        print(f"Article {url_id} processed successfully.")

    pd.DataFrame(collected).to_excel(output_file, index=False)
    print(f"Output saved to {output_file}.")

if __name__ == "__main__":
    # Absolute input/output locations (presumably a Colab-mounted Google
    # Drive; adjust when running elsewhere).
    input_file = "/content/drive/MyDrive/20211030_Test_Assignment/Input.xlsx"
    output_file = "/content/drive/MyDrive/Copy_of_Output_Data_Structure.xlsx"

    process_articles(input_file, output_file)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Article 123.0 processed successfully.
Article 321.0 processed successfully.
Article 2345.0 processed successfully.
Article 4321.0 processed successfully.
Article 432.0 processed successfully.
Article 2893.8 processed successfully.
Article 3355.6 processed successfully.
Article 3817.4 processed successfully.
Article 4279.2 processed successfully.
Article 4741.0 processed successfully.
Article 5202.8 processed successfully.
Article 5664.6 processed successfully.
Article 6126.4 processed successfully.
Article 6588.2 processed successfully.
Article 7050.0 processed successfully.
Article 7511.8 processed successfully.
Article 7973.6 processed successfully.
Article 8435.4 processed successfully.
Article 8897.2 processed successfully.
Article 9359.0 processed successfully.
Article 9820.8 processed successfully.
Article 10282.6 processed successfully.
Article 10744.4 processed successfully.
Article 11206.2 processed successfully.
Error extracting article from URL: https://insights.blackcoffer.