In [3]:
import os
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

# Download NLTK resources
import nltk
nltk.download('punkt')
nltk.download('stopwords')

def extract_text(url, output_folder):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extracting article title and text
    title = soup.title.text.strip()
    paragraphs = soup.find_all('p')
    text = ' '.join([p.text.strip() for p in paragraphs])

    # Saving extracted text to a file
    url_id = re.sub(r'\W+', '', url)  # Removing non-alphanumeric characters from URL
    file_path = os.path.join(output_folder, f'{url_id}.txt')

    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(f'Title: {title}\n\n{text}')

    return file_path, title, text

def calculate_variables(text):
    # Sentiment Analysis
    stop_words = set(stopwords.words('/content/master stopwords.txt'))
    words = [word.lower() for word in word_tokenize(text) if word.isalpha() and word.lower() not in stop_words]

    positive_words = set(open('/content/positive-words.txt').read().split())
    negative_words = set(open('/content/negative words 1.txt').read().split())

    positive_score = sum(1 for word in words if word in positive_words)
    negative_score = sum(1 for word in words if word in negative_words)

    polarity_score = (positive_score - negative_score) / (positive_score + negative_score + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (len(words) + 0.000001)

    # Analysis of Readability
    sentences = sent_tokenize(text)
    avg_sentence_length = len(words) / len(sentences)

    complex_words = [word for word in words if len(word) > 2]
    percentage_of_complex_words = len(complex_words) / len(words)

    fog_index = 0.4 * (avg_sentence_length + percentage_of_complex_words)

    # Average Number of Words Per Sentence
    avg_words_per_sentence = len(words) / len(sentences)

    # Complex Word Count
    complex_word_count = len(complex_words)

    # Word Count
    word_count = len(words)

    # Syllable Count Per Word
    syllable_per_word = sum([count_syllables(word) for word in words]) / len(words)

    # Personal Pronouns
    personal_pronouns = sum(1 for word in words if word.lower() in ['i', 'we', 'my', 'ours', 'us'])

    # Average Word Length
    avg_word_length = sum(len(word) for word in words) / len(words)

    return (positive_score, negative_score, polarity_score, subjectivity_score,
            avg_sentence_length, percentage_of_complex_words, fog_index,
            avg_words_per_sentence, complex_word_count, word_count,
            syllable_per_word, personal_pronouns, avg_word_length)

def count_syllables(word):
    # A simple syllable counting function
    count = 0
    vowels = "aeiouy"
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
            if word.endswith("e"):
                count -= 1
    if count == 0:
        count += 1
    return count

def main():
    # Load input data
    input_df = pd.read_excel('/content/Input (1).xlsx')
    output_df = pd.DataFrame(columns=['URL_ID', 'TITLE', 'TEXT'] + [f'VAR{i}' for i in range(1, 15)])

    # Create output folder
    output_folder = 'extracted_texts'
    os.makedirs(output_folder, exist_ok=True)

    # Extract data and perform analysis
    for index, row in input_df.iterrows():
        url = row['URL']
        file_path, title, text = extract_text(url, output_folder)
        variables = calculate_variables(text)
        output_df = output_df.append({'URL_ID': file_path, 'TITLE': title, 'TEXT': text, **dict(zip([f'VAR{i}' for i in range(1, 15)], variables))}, ignore_index=True)

    # Save output to Excel
    output_df.to_excel('Output.xlsx', index=False)

if __name__ == "__main__":
    main()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  output_df = output_df.append({'URL_ID': file_path, 'TITLE': title, 'TEXT': text, **dict(zip([f'VAR{i}' for i in range(1, 15)], variables))}, ignore_index=True)
  output_df = output_df.append({'URL_ID': file_path, 'TITLE': title, 'TEXT': text, **dict(zip([f'VAR{i}' for i in range(1, 15)], variables))}, ignore_index=True)
  output_df = output_df.append({'URL_ID': file_path, 'TITLE': title, 'TEXT': text, **dict(zip([f'VAR{i}' for i in range(1, 15)], variables))}, ignore_index=True)
  output_df = output_df.append({'URL_ID': file_path, 'TITLE': title, 'TEXT': text, **dict(zip([f'VAR{i}' for i in range(1, 15)], variables))}, ignore_index=True)
  output_df = output_df.append({'URL_ID': file_path, 'TITLE': title, 'TEXT': text, **dict(zip([f'VAR{i}' for i in range(1,