In [1]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Function to extract article text from a given URL
def extract_article_text(url, url_id):
    try:
        # Fetch the HTML content of the URL
        response = requests.get(url)
        html_content = response.text

        # Parse HTML using BeautifulSoup
        soup = BeautifulSoup(html_content, 'html.parser')

        # Extract article title
        article_title = soup.title.text.strip() if soup.title else "No Title Found"

        # Extract article text
        article_text = ""
        article_body = soup.find('body')  # Assume article is contained within the <body> tag
        if article_body:
            for paragraph in article_body.find_all('p'):  # Extract paragraphs
                article_text += paragraph.text.strip() + "\n"

        # Save the extracted article text into a text file
        with open(f"articles/{url_id}.txt", "w", encoding="utf-8") as file:
            file.write(f"Title: {article_title}\n\n{article_text}")

        print(f"Article text extracted and saved for URL_ID: {url_id}")
    except Exception as e:
        print(f"Error extracting article text for URL_ID: {url_id}. Error: {e}")

# Read the input.xlsx file
input_data = pd.read_excel("/content/Input.xlsx")

# Create a folder to save the extracted article text files
if not os.path.exists("articles"):
    os.makedirs("articles")

# Iterate through each row in the input data
for index, row in input_data.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    extract_article_text(url, url_id)


Article text extracted and saved for URL_ID: blackassign0001
Article text extracted and saved for URL_ID: blackassign0002
Article text extracted and saved for URL_ID: blackassign0003
Article text extracted and saved for URL_ID: blackassign0004
Article text extracted and saved for URL_ID: blackassign0005
Article text extracted and saved for URL_ID: blackassign0006
Article text extracted and saved for URL_ID: blackassign0007
Article text extracted and saved for URL_ID: blackassign0008
Article text extracted and saved for URL_ID: blackassign0009
Article text extracted and saved for URL_ID: blackassign0010
Article text extracted and saved for URL_ID: blackassign0011
Article text extracted and saved for URL_ID: blackassign0012
Article text extracted and saved for URL_ID: blackassign0013
Article text extracted and saved for URL_ID: blackassign0014
Article text extracted and saved for URL_ID: blackassign0015
Article text extracted and saved for URL_ID: blackassign0016
Article text extracted a

In [2]:
import os

# Function to load words from text files in a folder
def load_word_list_from_folder(folder_path, encoding='utf-8'):
    word_list = []
    for filename in os.listdir(folder_path):
        with open(os.path.join(folder_path, filename), 'r', encoding=encoding) as file:
            word_list += file.read().splitlines()
    return word_list


positive_words = load_word_list_from_folder("/content//positive/", encoding='latin-1')
negative_words = load_word_list_from_folder("/content//negative/", encoding='latin-1')
stopwords = load_word_list_from_folder("/content/stopwords/", encoding='latin-1')


print("Positive words:", len(positive_words))
print("Negative words:", len(negative_words))
print("Stopwords:", len(stopwords))


Positive words: 2006
Negative words: 4783
Stopwords: 14107


In [3]:
import os

def write_word_list_to_file(word_list, filename):
    with open(filename, 'w') as file:
        for word in word_list:
            file.write(word + '\n')

# Write positive words to positive.txt
write_word_list_to_file(positive_words, "positive.txt")

# Write negative words to negative.txt
write_word_list_to_file(negative_words, "negative.txt")

# Write stopwords to stopwords.txt
write_word_list_to_file(stopwords, "stopwords.txt")

print("Words saved to files successfully in the current directory.")


Words saved to files successfully in the current directory.


In [5]:
# Create dictionaries of positive and negative words
positive_dict = {word: True for word in positive_words if word not in stopwords}
negative_dict = {word: True for word in negative_words if word not in stopwords}



In [7]:
len(positive_dict)
len(negative_dict)

4779

In [4]:
positive = [word for word in positive_words if word not in stopwords]
negative = [word for word in negative_words if word not in stopwords]

In [12]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.tokenize import word_tokenize, sent_tokenize


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [19]:
# Function to calculate positive score for a given text
def calculate_positive_score(text):
    # Tokenize the text into words
    words = word_tokenize(text)

    # Calculate positive score
    positive_score = sum(1 for word in words if  word.lower() in positive)
    return positive_score

# Directory where article files are stored
article_folder = "articles/"

# Calculate positive score for each file in the article folder
positive_scores = {}
for filename in os.listdir(article_folder):
    with open(os.path.join(article_folder, filename), 'r', encoding='utf-8') as file:
        text = file.read()
    positive_score = calculate_positive_score(text)
    positive_scores[filename] = positive_score

# Print positive scores for each file
for filename, score in sorted(positive_scores.items()):
    print(f"Positive score for {filename}: {score}")

Positive score for blackassign0001.txt: 11
Positive score for blackassign0002.txt: 62
Positive score for blackassign0003.txt: 43
Positive score for blackassign0004.txt: 43
Positive score for blackassign0005.txt: 26
Positive score for blackassign0006.txt: 92
Positive score for blackassign0007.txt: 29
Positive score for blackassign0008.txt: 35
Positive score for blackassign0009.txt: 43
Positive score for blackassign0010.txt: 65
Positive score for blackassign0011.txt: 65
Positive score for blackassign0012.txt: 85
Positive score for blackassign0013.txt: 43
Positive score for blackassign0014.txt: 26
Positive score for blackassign0015.txt: 39
Positive score for blackassign0016.txt: 39
Positive score for blackassign0017.txt: 48
Positive score for blackassign0018.txt: 38
Positive score for blackassign0019.txt: 64
Positive score for blackassign0020.txt: 13
Positive score for blackassign0021.txt: 27
Positive score for blackassign0022.txt: 17
Positive score for blackassign0023.txt: 35
Positive sc

In [20]:
# Function to calculate negative score for a given text
def calculate_negative_score(text):
    # Tokenize the text into words
    words = word_tokenize(text)

    # Calculate positive score
    negative_score = sum(1 for word in words if  word.lower() in negative)
    return negative_score

# Directory where article files are stored
article_folder = "articles/"

# Calculate positive score for each file in the article folder
negative_scores = {}
for filename in os.listdir(article_folder):
    with open(os.path.join(article_folder, filename), 'r', encoding='utf-8') as file:
        text = file.read()
    negative_score = calculate_negative_score(text)
    negative_scores[filename] = negative_score

# Print positive scores for each file
for filename, score in sorted(negative_scores.items()):
    print(f"Positive score for {filename}: {score}")

Positive score for blackassign0001.txt: 5
Positive score for blackassign0002.txt: 35
Positive score for blackassign0003.txt: 28
Positive score for blackassign0004.txt: 79
Positive score for blackassign0005.txt: 12
Positive score for blackassign0006.txt: 31
Positive score for blackassign0007.txt: 48
Positive score for blackassign0008.txt: 13
Positive score for blackassign0009.txt: 54
Positive score for blackassign0010.txt: 72
Positive score for blackassign0011.txt: 23
Positive score for blackassign0012.txt: 28
Positive score for blackassign0013.txt: 17
Positive score for blackassign0014.txt: 31
Positive score for blackassign0015.txt: 31
Positive score for blackassign0016.txt: 31
Positive score for blackassign0017.txt: 16
Positive score for blackassign0018.txt: 14
Positive score for blackassign0019.txt: 10
Positive score for blackassign0020.txt: 4
Positive score for blackassign0021.txt: 51
Positive score for blackassign0022.txt: 14
Positive score for blackassign0023.txt: 19
Positive scor

In [21]:
# Function to calculate polarity score for a given text
def calculate_polarity_score(text):
    # Tokenize the text into words
    words = word_tokenize(text)

    # Calculate positive score
    positive_score = sum(1 for word in words if word.lower() in positive)

    # Calculate negative score
    negative_score = sum(1 for word in words if word.lower() in negative)

    # Calculate polarity score
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)

    return polarity_score

# Directory where article files are stored
article_folder = "articles/"

# Calculate polarity score for each file in the article folder
polarity_scores = {}
for filename in os.listdir(article_folder):
    with open(os.path.join(article_folder, filename), 'r', encoding='utf-8') as file:
        text = file.read()
    polarity_score = calculate_polarity_score(text)
    polarity_scores[filename] = polarity_score

# Print polarity scores for each file
for filename, score in sorted(polarity_scores.items()):
    print(f"Polarity score for {filename}: {score}")


Polarity score for blackassign0001.txt: 0.37499997656250145
Polarity score for blackassign0002.txt: 0.27835051259432464
Polarity score for blackassign0003.txt: 0.2112676026582028
Polarity score for blackassign0004.txt: -0.2950819647944101
Polarity score for blackassign0005.txt: 0.36842104293628836
Polarity score for blackassign0006.txt: 0.495934955317602
Polarity score for blackassign0007.txt: -0.24675324354865918
Polarity score for blackassign0008.txt: 0.45833332378472247
Polarity score for blackassign0009.txt: -0.1134020606865767
Polarity score for blackassign0010.txt: -0.0510948901379935
Polarity score for blackassign0011.txt: 0.47727272184917363
Polarity score for blackassign0012.txt: 0.504424774297126
Polarity score for blackassign0013.txt: 0.43333332611111125
Polarity score for blackassign0014.txt: -0.08771929670667901
Polarity score for blackassign0015.txt: 0.11428571265306126
Polarity score for blackassign0016.txt: 0.11428571265306126
Polarity score for blackassign0017.txt: 0.4

In [22]:
# Function to remove punctuation from text
import re
def remove_punctuation(text):
    return re.sub(r'[^\w\s]', '', text)

# Function to count words after removing stop words and punctuation
def count_cleaned_words(text):
    words = word_tokenize(text)
    cleaned_words = [word for word in words if word.lower() not in stopwords]
    cleaned_words = [remove_punctuation(word) for word in cleaned_words]
    cleaned_words = list(filter(None, cleaned_words))
    return len(cleaned_words)

In [23]:
# Function to calculate subjectivity score for a given text
def calculate_subjectivity_score(text):
    # Tokenize the text into words
    words = word_tokenize(text)

    # Calculate positive score
    positive_score = sum(1 for word in words if word in positive)

    # Calculate negative score
    negative_score = sum(1 for word in words if word in negative)

    # Calculate total words after cleaning
    total_words = count_cleaned_words(text)

    # Calculate subjectivity score
    subjectivity_score = (positive_score + negative_score) / ((total_words) + 0.000001)

    return subjectivity_score

# Directory where article files are stored
article_folder = "articles/"

# Calculate subjectivity score for each file in the article folder
subjectivity_scores = {}
for filename in os.listdir(article_folder):
    with open(os.path.join(article_folder, filename), 'r', encoding='utf-8') as file:
        text = file.read()
    subjectivity_score = calculate_subjectivity_score(text)
    subjectivity_scores[filename] = subjectivity_score

# Print subjectivity scores for each file
for filename, score in sorted(subjectivity_scores.items()):
    print(f"Subjectivity score for {filename}: {score}")


Subjectivity score for blackassign0001.txt: 0.03785488947048931
Subjectivity score for blackassign0002.txt: 0.10022026420680588
Subjectivity score for blackassign0003.txt: 0.0846354165564643
Subjectivity score for blackassign0004.txt: 0.1485411138613513
Subjectivity score for blackassign0005.txt: 0.06627680298971383
Subjectivity score for blackassign0006.txt: 0.09254709247129639
Subjectivity score for blackassign0007.txt: 0.10015898235268843
Subjectivity score for blackassign0008.txt: 0.06518282978508294
Subjectivity score for blackassign0009.txt: 0.121751025825238
Subjectivity score for blackassign0010.txt: 0.1551925319075933
Subjectivity score for blackassign0011.txt: 0.09032258054804024
Subjectivity score for blackassign0012.txt: 0.10489510479031458
Subjectivity score for blackassign0013.txt: 0.1305361302318505
Subjectivity score for blackassign0014.txt: 0.07391304337114052
Subjectivity score for blackassign0015.txt: 0.08207070696708244
Subjectivity score for blackassign0016.txt: 0.

In [13]:
# Function to calculate average sentence length for a given text
def calculate_avg_sentence_length(text):
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)

    # Calculate total number of words
    total_words = sum(len(word_tokenize(sentence)) for sentence in sentences)

    # Calculate total number of sentences
    total_sentences = len(sentences)

    # Calculate average sentence length
    avg_sentence_length = total_words / (total_sentences + 0.000001)

    return avg_sentence_length

# Directory where article files are stored
article_folder = "articles/"

# Calculate average sentence length for each file in the article folder
avg_sentence_lengths = {}
for filename in os.listdir(article_folder):
    with open(os.path.join(article_folder, filename), 'r', encoding='utf-8') as file:
        text = file.read()
    avg_length = calculate_avg_sentence_length(text)
    avg_sentence_lengths[filename] = avg_length

# Print average sentence lengths for each file
for filename, avg_length in sorted(avg_sentence_lengths.items()):
    print(f"Average sentence length for {filename}: {avg_length}")


Average sentence length for blackassign0001.txt: 21.766665941111135
Average sentence length for blackassign0002.txt: 23.0121948413147
Average sentence length for blackassign0003.txt: 24.21311435716206
Average sentence length for blackassign0004.txt: 26.232142388711743
Average sentence length for blackassign0005.txt: 23.227272199380177
Average sentence length for blackassign0006.txt: 25.139784675916296
Average sentence length for blackassign0007.txt: 20.74603141672966
Average sentence length for blackassign0008.txt: 22.03846111464498
Average sentence length for blackassign0009.txt: 21.77419319719043
Average sentence length for blackassign0010.txt: 23.76470560276817
Average sentence length for blackassign0011.txt: 25.222221871913586
Average sentence length for blackassign0012.txt: 25.05952351119615
Average sentence length for blackassign0013.txt: 31.65517132223547
Average sentence length for blackassign0014.txt: 20.123287395571406
Average sentence length for blackassign0015.txt: 25.34374

In [14]:
def syllable_count(word):
    vowels = 'aeiouy'
    word = word.lower()
    count = 0
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith('e') and count > 1:
        count -= 1
    if count == 0:
        count += 1
    return count


In [15]:
# Function to calculate percentage of complex words for a given text
def calculate_percentage_complex_words(text):
    # Tokenize the text into words
    words = word_tokenize(text)

    # Calculate number of complex words
    complex_word_count = sum(1 for word in words if syllable_count(word) > 2)

    # Calculate total number of words
    total_words = len(words)

    # Calculate percentage of complex words
    percentage_complex_words = (complex_word_count / (total_words + 0.000001)) * 100

    return percentage_complex_words

# Directory where article files are stored
article_folder = "articles/"

# Calculate percentage of complex words for each file in the article folder
percentage_complex_words = {}
for filename in os.listdir(article_folder):
    with open(os.path.join(article_folder, filename), 'r', encoding='utf-8') as file:
        text = file.read()
    percentage = calculate_percentage_complex_words(text)
    percentage_complex_words[filename] = percentage

# Print percentage of complex words for each file
for filename, percentage in sorted(percentage_complex_words.items()):
    print(f"Percentage of complex words for {filename}: {percentage}")


Percentage of complex words for blackassign0001.txt: 15.007656944858105
Percentage of complex words for blackassign0002.txt: 20.084790662382197
Percentage of complex words for blackassign0003.txt: 26.404874728229604
Percentage of complex words for blackassign0004.txt: 25.11912864185219
Percentage of complex words for blackassign0005.txt: 19.373776889066754
Percentage of complex words for blackassign0006.txt: 24.6792129920106
Percentage of complex words for blackassign0007.txt: 19.66335117087731
Percentage of complex words for blackassign0008.txt: 27.748691075262922
Percentage of complex words for blackassign0009.txt: 28.148148127297667
Percentage of complex words for blackassign0010.txt: 22.32673266221449
Percentage of complex words for blackassign0011.txt: 24.22907487652584
Percentage of complex words for blackassign0012.txt: 20.14251780515795
Percentage of complex words for blackassign0013.txt: 21.786492350995108
Percentage of complex words for blackassign0014.txt: 19.196732458000863

In [16]:
# Function to calculate Fog Index for a given text
def calculate_fog_index(text):
    # Calculate average sentence length
    avg_sentence_length = calculate_avg_sentence_length(text)

    # Calculate percentage of complex words
    percentage_complex_words = calculate_percentage_complex_words(text)

    # Calculate Fog Index
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    return fog_index

# Directory where article files are stored
article_folder = "articles/"

# Calculate Fog Index for each file in the article folder
fog_indexes = {}
for filename in os.listdir(article_folder):
    with open(os.path.join(article_folder, filename), 'r', encoding='utf-8') as file:
        text = file.read()
    fog_index = calculate_fog_index(text)
    fog_indexes[filename] = fog_index

# Print Fog Index for each file
for filename, index in sorted(fog_indexes.items()):
    print(f"Fog Index for {filename}: {index}")


Fog Index for blackassign0001.txt: 14.709729154387697
Fog Index for blackassign0002.txt: 17.23879420147876
Fog Index for blackassign0003.txt: 20.247195634156668
Fog Index for blackassign0004.txt: 20.540508412225577
Fog Index for blackassign0005.txt: 17.040419635378772
Fog Index for blackassign0006.txt: 19.92759906717076
Fog Index for blackassign0007.txt: 16.163753035042788
Fog Index for blackassign0008.txt: 19.91486087596316
Fog Index for blackassign0009.txt: 19.968936529795243
Fog Index for blackassign0010.txt: 18.436575305993063
Fog Index for blackassign0011.txt: 19.780518699375772
Fog Index for blackassign0012.txt: 18.08081652654164
Fog Index for blackassign0013.txt: 21.376665469292234
Fog Index for blackassign0014.txt: 15.728007941428908
Fog Index for blackassign0015.txt: 18.472888245833754
Fog Index for blackassign0016.txt: 18.472888245833754
Fog Index for blackassign0017.txt: 17.380222493786174
Fog Index for blackassign0018.txt: 20.32154787854379
Fog Index for blackassign0019.txt

In [17]:
# Function to calculate average number of words per sentence for a given text
def calculate_avg_words_per_sentence(text):
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)

    # Calculate total number of words
    total_words = sum(len(word_tokenize(sentence)) for sentence in sentences)

    # Calculate total number of sentences
    total_sentences = len(sentences)

    # Calculate average number of words per sentence
    avg_words_per_sentence = total_words / (total_sentences + 0.000001)

    return avg_words_per_sentence

# Directory where article files are stored
article_folder = "articles/"

# Calculate average number of words per sentence for each file in the article folder
avg_words_per_sentences = {}
for filename in os.listdir(article_folder):
    with open(os.path.join(article_folder, filename), 'r', encoding='utf-8') as file:
        text = file.read()
    avg_words_per_sentence = calculate_avg_words_per_sentence(text)
    avg_words_per_sentences[filename] = avg_words_per_sentence

# Print average number of words per sentence for each file
for filename, avg_words in sorted(avg_words_per_sentences.items()):
    print(f"Average number of words per sentence for {filename}: {avg_words}")


Average number of words per sentence for blackassign0001.txt: 21.766665941111135
Average number of words per sentence for blackassign0002.txt: 23.0121948413147
Average number of words per sentence for blackassign0003.txt: 24.21311435716206
Average number of words per sentence for blackassign0004.txt: 26.232142388711743
Average number of words per sentence for blackassign0005.txt: 23.227272199380177
Average number of words per sentence for blackassign0006.txt: 25.139784675916296
Average number of words per sentence for blackassign0007.txt: 20.74603141672966
Average number of words per sentence for blackassign0008.txt: 22.03846111464498
Average number of words per sentence for blackassign0009.txt: 21.77419319719043
Average number of words per sentence for blackassign0010.txt: 23.76470560276817
Average number of words per sentence for blackassign0011.txt: 25.222221871913586
Average number of words per sentence for blackassign0012.txt: 25.05952351119615
Average number of words per sentence

In [18]:
# Function to calculate complex word count for a given text
def calculate_complex_word_count(text):
    # Tokenize the text into words
    words = word_tokenize(text)

    # Calculate complex word count
    complex_word_count = sum(1 for word in words if syllable_count(word) > 2)

    return complex_word_count

# Directory where article files are stored
article_folder = "articles/"

# Calculate complex word count for each file in the article folder
complex_word_counts = {}
for filename in os.listdir(article_folder):
    with open(os.path.join(article_folder, filename), 'r', encoding='utf-8') as file:
        text = file.read()
    complex_count = calculate_complex_word_count(text)
    complex_word_counts[filename] = complex_count

# Print complex word count for each file
for filename, count in sorted(complex_word_counts.items()):
    print(f"Complex word count for {filename}: {count}")


Complex word count for blackassign0001.txt: 98
Complex word count for blackassign0002.txt: 379
Complex word count for blackassign0003.txt: 390
Complex word count for blackassign0004.txt: 369
Complex word count for blackassign0005.txt: 198
Complex word count for blackassign0006.txt: 577
Complex word count for blackassign0007.txt: 257
Complex word count for blackassign0008.txt: 318
Complex word count for blackassign0009.txt: 380
Complex word count for blackassign0010.txt: 451
Complex word count for blackassign0011.txt: 440
Complex word count for blackassign0012.txt: 424
Complex word count for blackassign0013.txt: 200
Complex word count for blackassign0014.txt: 282
Complex word count for blackassign0015.txt: 338
Complex word count for blackassign0016.txt: 338
Complex word count for blackassign0017.txt: 316
Complex word count for blackassign0018.txt: 306
Complex word count for blackassign0019.txt: 455
Complex word count for blackassign0020.txt: 125
Complex word count for blackassign0021.tx

In [25]:
from nltk.corpus import stopwords
nltk.download('stopwords')
import string

# Function to calculate word count for a given text
def calculate_word_count(text):
    # Tokenize the text into words
    words = word_tokenize(text)

    # Remove punctuations
    words = [word for word in words if word not in string.punctuation]

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word.lower() not in stop_words]

    # Calculate word count
    word_count = len(words)

    return word_count

# Directory where article files are stored
article_folder = "articles/"

# Calculate word count for each file in the article folder
word_counts = {}
for filename in os.listdir(article_folder):
    with open(os.path.join(article_folder, filename), 'r', encoding='utf-8') as file:
        text = file.read()
    count = calculate_word_count(text)
    word_counts[filename] = count

# Print word count for each file
for filename, count in sorted(word_counts.items()):
    print(f"Word count for {filename}: {count}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Word count for blackassign0001.txt: 347
Word count for blackassign0002.txt: 1021
Word count for blackassign0003.txt: 833
Word count for blackassign0004.txt: 832
Word count for blackassign0005.txt: 573
Word count for blackassign0006.txt: 1322
Word count for blackassign0007.txt: 715
Word count for blackassign0008.txt: 665
Word count for blackassign0009.txt: 800
Word count for blackassign0010.txt: 958
Word count for blackassign0011.txt: 1008
Word count for blackassign0012.txt: 1161
Word count for blackassign0013.txt: 490
Word count for blackassign0014.txt: 793
Word count for blackassign0015.txt: 921
Word count for blackassign0016.txt: 921
Word count for blackassign0017.txt: 827
Word count for blackassign0018.txt: 813
Word count for blackassign0019.txt: 1116
Word count for blackassign0020.txt: 432
Word count for blackassign0021.txt: 780
Word count for blackassign0022.txt: 368
Word count for blackassign0023.txt: 933
Word count for blackassign0024.txt: 493
Word count for blackassign0025.txt:

In [26]:
# Function to calculate syllable count per word for a given text
def calculate_syllable_per_word(text):
    # Tokenize the text into words
    words = word_tokenize(text)

    # Initialize syllable count
    syllable_count_per_word = {}

    # Calculate syllable count for each word
    for word in words:
        syllable_count_per_word[word] = syllable_count(word)

    return syllable_count_per_word

# Directory where article files are stored
article_folder = "articles/"

# Calculate syllable count per word for each file in the article folder
syllable_per_words = {}
for filename in os.listdir(article_folder):
    with open(os.path.join(article_folder, filename), 'r', encoding='utf-8') as file:
        text = file.read()
    syllable_per_word = calculate_syllable_per_word(text)
    syllable_per_words[filename] = syllable_per_word

# Print syllable count per word for each file
for filename, syllable_per_word in sorted(syllable_per_words.items()):
    print(f"Syllable count per word for {filename}: {syllable_per_word}")


Syllable count per word for blackassign0001.txt: {'Title': 1, ':': 1, 'Rising': 2, 'IT': 1, 'cities': 2, 'and': 1, 'its': 1, 'impact': 2, 'on': 1, 'the': 1, 'economy': 4, ',': 1, 'environment': 4, 'infrastructure': 4, 'city': 2, 'life': 1, 'by': 1, 'year': 1, '2040': 1, '.': 1, '-': 1, 'Blackcoffer': 3, 'Insights': 2, 'Database': 3, 'Discovery': 4, 'Tool': 1, 'using': 2, 'OpenAI': 3, 'ML': 1, 'AI-based': 3, 'insurance': 3, 'premium': 2, 'model': 2, 'to': 1, 'predict': 2, 'be': 1, 'charged': 2, 'company': 3, 'Automate': 3, 'Data': 2, 'Management': 4, 'Process': 2, 'Realtime': 2, 'Kibana': 3, 'Dashboard': 2, 'for': 1, 'a': 1, 'financial': 3, 'tech': 1, 'firm': 1, 'How': 1, 'To': 1, 'Secure': 2, '(': 1, 'SSL': 1, ')': 1, 'Nginx': 1, 'with': 1, 'Let': 1, '’': 1, 's': 1, 'Encrypt': 2, 'Ubuntu': 3, 'Cloud': 1, 'VM': 1, 'GCP': 1, 'AWS': 1, 'Azure': 2, 'Linode': 2, 'Add': 1, 'Domain': 2, 'Deploy': 2, 'view': 1, 'React': 1, 'app': 1, 'Nextjs': 1, 'cloud': 1, 'such': 1, 'as': 1, 'Nodejs': 2, 'Gr

In [27]:
import re

# Function to calculate count of personal pronouns in a given text
def calculate_personal_pronouns(text):
    # Define the list of personal pronouns
    pronouns = ["I", "we", "my", "ours", "us"]

    # Compile a regex pattern to match the personal pronouns
    pattern = re.compile(r'\b(?:' + '|'.join(pronouns) + r')\b', re.IGNORECASE)

    # Find all matches of personal pronouns in the text
    matches = re.findall(pattern, text)

    # Count the occurrences of each personal pronoun
    pronoun_counts = {}
    for pronoun in pronouns:
        pronoun_counts[pronoun] = matches.count(pronoun)

    return pronoun_counts

# Directory where article files are stored
article_folder = "articles/"

# Calculate count of personal pronouns for each file in the article folder
personal_pronouns_counts = {}
for filename in os.listdir(article_folder):
    with open(os.path.join(article_folder, filename), 'r', encoding='utf-8') as file:
        text = file.read()
    pronouns_count = calculate_personal_pronouns(text)
    personal_pronouns_counts[filename] = pronouns_count

# Print count of personal pronouns for each file
for filename, pronouns_count in sorted(personal_pronouns_counts.items()):
    print(f"Personal pronouns count for {filename}: {pronouns_count}")


Personal pronouns count for blackassign0001.txt: {'I': 0, 'we': 1, 'my': 0, 'ours': 0, 'us': 1}
Personal pronouns count for blackassign0002.txt: {'I': 0, 'we': 2, 'my': 0, 'ours': 0, 'us': 2}
Personal pronouns count for blackassign0003.txt: {'I': 0, 'we': 12, 'my': 0, 'ours': 0, 'us': 2}
Personal pronouns count for blackassign0004.txt: {'I': 0, 'we': 4, 'my': 0, 'ours': 0, 'us': 1}
Personal pronouns count for blackassign0005.txt: {'I': 2, 'we': 4, 'my': 0, 'ours': 0, 'us': 1}
Personal pronouns count for blackassign0006.txt: {'I': 0, 'we': 6, 'my': 0, 'ours': 0, 'us': 1}
Personal pronouns count for blackassign0007.txt: {'I': 0, 'we': 0, 'my': 0, 'ours': 0, 'us': 2}
Personal pronouns count for blackassign0008.txt: {'I': 0, 'we': 3, 'my': 0, 'ours': 0, 'us': 1}
Personal pronouns count for blackassign0009.txt: {'I': 0, 'we': 2, 'my': 0, 'ours': 0, 'us': 1}
Personal pronouns count for blackassign0010.txt: {'I': 0, 'we': 8, 'my': 0, 'ours': 0, 'us': 1}
Personal pronouns count for blackassign

In [28]:
# Function to calculate average word length for a given text
def calculate_avg_word_length(text):
    # Tokenize the text into words
    words = word_tokenize(text)

    # Calculate total number of characters in all words
    total_characters = sum(len(word) for word in words)

    # Calculate total number of words
    total_words = len(words)

    # Calculate average word length
    avg_word_length = total_characters / (total_words + 0.000001)

    return avg_word_length

# Directory where article files are stored
article_folder = "articles/"

# Calculate average word length for each file in the article folder
avg_word_lengths = {}
for filename in os.listdir(article_folder):
    with open(os.path.join(article_folder, filename), 'r', encoding='utf-8') as file:
        text = file.read()
    avg_length = calculate_avg_word_length(text)
    avg_word_lengths[filename] = avg_length

# Print average word length for each file
for filename, avg_length in sorted(avg_word_lengths.items()):
    print(f"Average word length for {filename}: {avg_length}")


Average word length for blackassign0001.txt: 4.405819288811915
Average word length for blackassign0002.txt: 4.8595654452254555
Average word length for blackassign0003.txt: 5.356127281410882
Average word length for blackassign0004.txt: 5.208985701014986
Average word length for blackassign0005.txt: 4.983365944243282
Average word length for blackassign0006.txt: 5.308383231262454
Average word length for blackassign0007.txt: 4.75899004991661
Average word length for blackassign0008.txt: 5.395287953407252
Average word length for blackassign0009.txt: 5.383703699715775
Average word length for blackassign0010.txt: 4.821287126326095
Average word length for blackassign0011.txt: 5.20429515131922
Average word length for blackassign0012.txt: 5.086935864566777
Average word length for blackassign0013.txt: 4.998910669935827
Average word length for blackassign0014.txt: 4.803948260855038
Average word length for blackassign0015.txt: 5.0055487022160605
Average word length for blackassign0016.txt: 5.00554870

In [32]:
import os
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

# Load stop words
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Function to calculate positive score
def calculate_positive_score(text):
    return sum(1 for word in word_tokenize(text) if word.lower() in positive_words)

# Function to calculate negative score
def calculate_negative_score(text):
    return sum(1 for word in word_tokenize(text) if word.lower() in negative_words)

# Function to calculate polarity score
def calculate_polarity_score(positive_score, negative_score):
    return (positive_score - negative_score) / (positive_score + negative_score + 0.000001)

# Function to calculate subjectivity score
def calculate_subjectivity_score(positive_score, negative_score, total_words):
    return (positive_score + negative_score) / (total_words + 0.000001)

# Function to calculate average sentence length
def calculate_avg_sentence_length(text):
    sentences = sent_tokenize(text)
    total_words = sum(len(word_tokenize(sentence)) for sentence in sentences)
    total_sentences = len(sentences)
    return total_words / (total_sentences + 0.000001)

# Function to calculate percentage of complex words
def calculate_percentage_complex_words(text):
    words = word_tokenize(text)
    complex_words = sum(1 for word in words if syllable_count(word) > 2)
    return (complex_words / (len(words) + 0.000001)) * 100

# Function to calculate fog index
def calculate_fog_index(avg_sentence_length, percentage_complex_words):
    return 0.4 * (avg_sentence_length + percentage_complex_words)

# Function to calculate average number of words per sentence
def calculate_avg_words_per_sentence(text):
    words = word_tokenize(text)
    total_words = len(words)
    total_sentences = len(sent_tokenize(text))
    return total_words / (total_sentences + 0.000001)

# Function to calculate complex word count
def calculate_complex_word_count(text):
    words = word_tokenize(text)
    return sum(1 for word in words if syllable_count(word) > 2)

# Function to calculate syllable count per word
def syllable_count(word):
    vowels = 'aeiouy'
    word = word.lower()
    count = 0
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith('e'):
        count -= 1
    if count == 0:
        count += 1
    return count

# Function to calculate personal pronouns count
def calculate_personal_pronouns(text):
    pronouns = ['I', 'we', 'my', 'ours', 'us']
    return sum(1 for word in word_tokenize(text) if word.lower() in pronouns)

# Function to calculate average word length
def calculate_avg_word_length(text):
    words = word_tokenize(text)
    total_characters = sum(len(word) for word in words)
    total_words = len(words)
    return total_characters / (total_words + 0.000001)

# Function to remove punctuation from text
def remove_punctuation(text):
    return re.sub(r'[^\w\s]', '', text)

# Function to count words after removing stop words and punctuation
def count_cleaned_words(text):
    words = word_tokenize(text)
    cleaned_words = [word for word in words if word.lower() not in stop_words]
    cleaned_words = [remove_punctuation(word) for word in cleaned_words]
    cleaned_words = list(filter(None, cleaned_words))
    return len(cleaned_words)

# Directory where article files are stored
article_folder = "articles/"

# Dictionary to store results
results = {}

# Calculate variables for each file in the article folder
for filename in os.listdir(article_folder):
    with open(os.path.join(article_folder, filename), 'r', encoding='utf-8') as file:
        text = file.read()
    cleaned_text = remove_punctuation(text)
    total_words = count_cleaned_words(cleaned_text)
    positive_score = calculate_positive_score(cleaned_text)
    negative_score = calculate_negative_score(cleaned_text)
    polarity_score = calculate_polarity_score(positive_score, negative_score)
    subjectivity_score = calculate_subjectivity_score(positive_score, negative_score, total_words)
    avg_sentence_length = calculate_avg_sentence_length(cleaned_text)
    percentage_complex_words = calculate_percentage_complex_words(cleaned_text)
    fog_index = calculate_fog_index(avg_sentence_length, percentage_complex_words)
    avg_words_per_sentence = calculate_avg_words_per_sentence(cleaned_text)
    complex_word_count = calculate_complex_word_count(cleaned_text)
    personal_pronouns_count = calculate_personal_pronouns(cleaned_text)
    avg_word_length = calculate_avg_word_length(cleaned_text)

    # Store results for each file in the dictionary
    results[filename] = {
        "POSITIVE SCORE": positive_score,
        "NEGATIVE SCORE": negative_score,
        "POLARITY SCORE": polarity_score,
        "SUBJECTIVITY SCORE": subjectivity_score,
        "AVG SENTENCE LENGTH": avg_sentence_length,
        "PERCENTAGE OF COMPLEX WORDS": percentage_complex_words,
        "FOG INDEX": fog_index,
        "AVG NUMBER OF WORDS PER SENTENCE": avg_words_per_sentence,
        "COMPLEX WORD COUNT": complex_word_count,
        "WORD COUNT": total_words,
        "SYLLABLE PER WORD": 0,  # You need to implement syllable counting
        "PERSONAL PRONOUNS": personal_pronouns_count,
        "AVG WORD LENGTH": avg_word_length
    }

# Create a DataFrame from the dictionary
df = pd.DataFrame.from_dict(results, orient='index')

# Save the DataFrame to an Excel file
df.to_excel("output.xlsx")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [1]:
2*8

16