In [1]:
import pandas as pd
import nltk
from textblob import TextBlob
from nltk.tokenize import sent_tokenize, word_tokenize
import requests
from bs4 import BeautifulSoup

# Downloading the NLTK data packages (tokenizer and tagger models) required below

In [8]:
# Fetch the NLTK data packages this notebook relies on. 'punkt' is the
# tokenizer data used by the sent_tokenize / word_tokenize calls further down.
nltk.download('punkt')
# NOTE(review): no visible cell calls a POS tagger directly — presumably this
# package is needed by TextBlob; confirm before removing.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

# Function to count syllables in a word

In [2]:
def count_syllables(word):
    """Estimate the number of syllables in ``word`` with a vowel-group heuristic.

    Each maximal run of consecutive vowels (``a e i o u y``) counts as one
    syllable, a trailing ``e`` is treated as silent and removes one syllable,
    and every non-empty input is reported as having at least one syllable.

    Args:
        word: The token to analyse; matching is case-insensitive.

    Returns:
        The estimated syllable count: ``0`` for an empty string, otherwise an
        integer >= 1.
    """
    vowels = 'aeiouy'
    word = word.lower()

    # Guard against empty tokens instead of failing on word[0].
    if not word:
        return 0

    # Count the starts of vowel runs: a vowel that is not preceded by a vowel.
    count = 0
    previous_is_vowel = False
    for char in word:
        is_vowel = char in vowels
        if is_vowel and not previous_is_vowel:
            count += 1
        previous_is_vowel = is_vowel

    # A final 'e' is assumed to be silent ("make" -> 1 syllable).
    if word.endswith('e'):
        count -= 1

    # Never report zero syllables for a non-empty token ("the", "be", ",").
    return max(count, 1)

In [4]:
# Colab-only setup: mount Google Drive at /content/drive so that the input
# sheet stored under MyDrive can be read by the next cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Reading the input sheet (a CSV export of Input.xlsx) using pandas

In [5]:
# Location of the URL list on the mounted Drive (CSV export of the input sheet).
input_csv_path = '/content/drive/MyDrive/BlackCoffer/Coffer2/Input.xlsx - Sheet1.csv'
input_data = pd.read_csv(filepath_or_buffer=input_csv_path)

In [6]:
# Download every article listed in the input data and store its title and
# paragraph text in "<URL_ID>.txt" in the current working directory.

# Upper bound (seconds) for a single request, so one unresponsive server
# cannot hang the whole run.
REQUEST_TIMEOUT_SECONDS = 30

# Looping through each row in the input data
for index, row in input_data.iterrows():
    url = row['URL']
    url_id = row['URL_ID']

    # Fetching data from the specified URL. A network-level failure
    # (DNS error, refused connection, timeout, ...) is reported and the
    # loop moves on instead of aborting the remaining downloads.
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as error:
        print(f"Failed to fetch data from {url} ({error})")
        continue

    if response.status_code != 200:
        print(f"Failed to fetch data from {url}")
        continue

    # Parsing HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract everything BEFORE opening the output file, so that a parsing
    # problem can never leave an empty or truncated file behind.
    title_tag = soup.find('title')
    # Pages without a <title> element get an empty title instead of crashing.
    title = title_tag.get_text() if title_tag is not None else ""
    article_text = " ".join(p.get_text() for p in soup.find_all('p'))

    # Writing the title and article text to a text file
    with open(f"{url_id}.txt", 'w', encoding='utf-8') as file:
        file.write(f"{title}\n\n{article_text}")

print("Data extraction complete.")

Failed to fetch data from https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/
Failed to fetch data from https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/
Data extraction complete.


In [9]:
import os

# Lower-cased tokens counted for the 'PERSONAL PRONOUNS' metric. Built once as
# a frozenset so the membership test is not repeated against a fresh list for
# every token.
PERSONAL_PRONOUNS = frozenset([
    'i', 'me', 'my', 'mine', 'myself',
    'you', 'your', 'yours', 'yourself',
    'he', 'him', 'his', 'himself',
    'she', 'her', 'hers', 'herself',
    'we', 'us', 'our', 'ours', 'ourselves',
    'they', 'them', 'their', 'theirs', 'themselves',
])

# The Gunning fog index counts words of three or more syllables as "complex".
COMPLEX_WORD_MIN_SYLLABLES = 3


def _compute_text_metrics(content):
    """Compute the sentiment and readability metrics for one article.

    Args:
        content: Full text of the article (title plus body).

    Returns:
        A dict keyed by the output column names (everything except 'URL_ID').
        Every ratio falls back to 0 when the text contains no sentences or no
        words, so an empty file yields a row of zeros instead of raising
        ZeroDivisionError.
    """
    # Tokenizing sentences and words from the text content
    sentences = sent_tokenize(content)
    # word_tokenize also emits pure punctuation tokens (",", ".", "(" ...);
    # keep only tokens with at least one letter or digit so punctuation is not
    # counted as words in the word count, word length or syllable metrics.
    words = [
        token for token in word_tokenize(content)
        if any(char.isalnum() for char in token)
    ]
    num_sentences = len(sentences)
    num_words = len(words)

    # Calculating average sentence length (words per sentence)
    avg_sentence_length = num_words / num_sentences if num_sentences > 0 else 0

    # Analyzing text sentiment using TextBlob
    sentiment = TextBlob(content).sentiment
    polarity = sentiment.polarity
    subjectivity = sentiment.subjectivity

    # Positive/negative scores are linear rescalings of the polarity; the two
    # values always sum to 1.
    positive_score = (polarity + 1) / 2
    negative_score = (1 - polarity) / 2
    # NOTE(review): abs() discards the sign of the polarity, so a negative and
    # a positive text get the same POLARITY SCORE — confirm this is intended.
    overall_polarity = abs(polarity)

    # Syllables are estimated once per word and reused by both metrics below.
    syllable_counts = [count_syllables(word) for word in words]
    complex_word_count = sum(
        syllables >= COMPLEX_WORD_MIN_SYLLABLES for syllables in syllable_counts
    )
    percentage_complex_words = (complex_word_count / num_words) * 100 if num_words > 0 else 0
    syllables_per_word = sum(syllable_counts) / num_words if num_words > 0 else 0

    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    personal_pronouns = sum(1 for word in words if word.lower() in PERSONAL_PRONOUNS)

    avg_word_length = sum(len(word) for word in words) / num_words if num_words > 0 else 0

    return {
        'POSITIVE SCORE': positive_score,
        'NEGATIVE SCORE': negative_score,
        'POLARITY SCORE': overall_polarity,
        'SUBJECTIVITY SCORE': subjectivity,
        'AVG SENTENCE LENGTH': avg_sentence_length,
        'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
        'FOG INDEX': fog_index,
        'AVG NUMBER OF WORDS PER SENTENCE': avg_sentence_length,
        'COMPLEX WORD COUNT': complex_word_count,
        'WORD COUNT': num_words,
        'SYLLABLE PER WORD': syllables_per_word,
        'PERSONAL PRONOUNS': personal_pronouns,
        'AVG WORD LENGTH': avg_word_length,
    }


output_data = []

# Looping through each row in the input data again
for index, row in input_data.iterrows():
    url_id = row['URL_ID']
    filename = f"{url_id}.txt"

    # Articles that could not be downloaded have no text file; report and skip.
    if not os.path.exists(filename):
        print(f"File {filename} does not exist. Skipping...")
        continue

    with open(filename, 'r', encoding='utf-8') as file:
        content = file.read()

    # Appending metrics to the output data list
    output_data.append({'URL_ID': url_id, **_compute_text_metrics(content)})


File blackassign0036.txt does not exist. Skipping...
File blackassign0049.txt does not exist. Skipping...


In [10]:

# Single source of truth for the output location, so the confirmation message
# below always names the file that was actually written.
OUTPUT_PATH = 'output1.xlsx'

# Creating a DataFrame from the output data
output_df = pd.DataFrame(output_data)

# Merging input data with calculated metrics. This is an inner merge (the
# pandas default, made explicit here): input rows without computed metrics,
# e.g. URLs that could not be fetched, are dropped from the output.
final_output = pd.merge(input_data, output_df, on='URL_ID', how='inner')

# Saving the final output to an Excel file
final_output.to_excel(OUTPUT_PATH, index=False)

print(f"Text analysis complete. Output saved in {OUTPUT_PATH}.")

Text analysis complete. Output saved in output.xlsx.
