In [160]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize import RegexpTokenizer


In [161]:
# Load the stop words list
stop_words = set(stopwords.words('english'))

# Load the positive and negative words dictionaries (one entry per line).
# Context managers close each file handle as soon as it has been read.
# NOTE(review): the positive list is read with the platform default encoding
# while the negative list is decoded explicitly as ISO-8859-1 — confirm that
# this difference is intended.
with open("MasterDictionary/positive-words.txt", "r") as positive_file:
    positive_words = set(positive_file.read().splitlines())
with open("MasterDictionary/negative-words.txt", "r", encoding="ISO-8859-1") as negative_file:
    negative_words = set(negative_file.read().splitlines())


In [162]:
# Function to clean and preprocess the text
def preprocess_text(text):
    """Return ``text`` lowercased, reduced to word tokens, with stop words removed.

    The text is lowercased and split with the ``\\w+`` pattern, so punctuation
    is discarded. Tokens found in the module-level ``stop_words`` set are
    dropped and the remaining tokens are joined with single spaces.
    """
    word_splitter = RegexpTokenizer(r'\w+')
    kept_tokens = []
    for token in word_splitter.tokenize(text.lower()):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

# Function to count syllables in a word
def count_syllables(word):
    """Estimate the number of syllables in ``word`` with a vowel-group heuristic.

    The word is lowercased and stripped of leading/trailing ``.:;?!``. Each run
    of consecutive vowels (``aeiouy``) counts as one syllable. A trailing ``e``
    subtracts one, a trailing ``le`` adds one back, and any non-empty word is
    reported as having at least one syllable.

    Args:
        word: The token to analyse.

    Returns:
        The estimated syllable count; ``0`` when nothing is left after
        stripping punctuation (e.g. ``""`` or ``"..."``).
    """
    vowels = 'aeiouy'
    word = word.lower().strip(".:;?!")
    if not word:
        # Guard: indexing into an empty string would raise IndexError.
        return 0

    count = 0
    previous_is_vowel = False
    for char in word:
        is_vowel = char in vowels
        # Only the first vowel of each vowel run starts a new syllable.
        if is_vowel and not previous_is_vowel:
            count += 1
        previous_is_vowel = is_vowel

    if word.endswith('e'):
        count -= 1
    if word.endswith('le'):
        count += 1
    if count == 0:
        count += 1
    return count

In [144]:
# Function to compute variables for each article
def compute_variables(text):
    """Compute the sentiment and readability metrics for one article.

    Word-level metrics are computed on the output of ``preprocess_text``
    (lowercased, punctuation removed, stop words dropped). Sentences and
    personal pronouns are taken from the raw ``text``, because the cleaned
    text no longer contains sentence-ending punctuation and stop-word removal
    discards pronoun tokens before they could be counted.

    Args:
        text: The raw article text.

    Returns:
        A 13-tuple, in this order: positive score, negative score, polarity
        score, subjectivity score, average sentence length, percentage of
        complex words, fog index, average words per sentence, complex word
        count, word count, syllables per word, personal pronoun count,
        average word length. Ratios whose denominator is zero are reported
        as ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    def ratio(numerator, denominator):
        # Empty articles (or ones made only of stop words) have zero words.
        return numerator / denominator if denominator else 0.0

    cleaned_text = preprocess_text(text)
    words = word_tokenize(cleaned_text)
    # Split sentences on the raw text: preprocess_text keeps only \w+ tokens,
    # so the cleaned text has no punctuation left to mark sentence boundaries.
    sentences = sent_tokenize(text)

    word_count = len(words)
    sentence_count = len(sentences)

    # Positive Score
    positive_score = sum(1 for word in words if word in positive_words)

    # Negative Score
    negative_score = sum(1 for word in words if word in negative_words)

    # Polarity Score (small constant avoids division by zero)
    polarity_score = (positive_score - negative_score) / (positive_score + negative_score + 0.000001)

    # Subjectivity Score (small constant avoids division by zero)
    subjectivity_score = (positive_score + negative_score) / (word_count + 0.000001)

    # Average Sentence Length (cleaned words per raw-text sentence)
    avg_sentence_length = ratio(word_count, sentence_count)

    # Syllable counts are needed twice below, so compute them once.
    syllable_counts = [count_syllables(word) for word in words]

    # Complex Word Count: words with more than two syllables
    complex_word_count = sum(1 for syllables in syllable_counts if syllables > 2)

    # Percentage of Complex Words (expressed as a fraction of all words)
    percentage_complex_words = ratio(complex_word_count, word_count)

    # Fog Index
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    # Average Number of Words Per Sentence
    avg_words_per_sentence = ratio(word_count, sentence_count)

    # Syllables Per Word
    syllables_per_word = ratio(sum(syllable_counts), word_count)

    # Personal Pronouns: counted on raw tokens, since stop-word removal would
    # strip pronouns before they can be matched. The all-caps token "US" is
    # skipped so the country abbreviation is not mistaken for the pronoun "us".
    personal_pronouns = {'i', 'we', 'my', 'ours', 'us'}
    personal_pronoun_count = sum(
        1
        for token in word_tokenize(text)
        if token != 'US' and token.lower() in personal_pronouns
    )

    # Average Word Length
    avg_word_length = ratio(sum(len(word) for word in words), word_count)

    return (
        positive_score,
        negative_score,
        polarity_score,
        subjectivity_score,
        avg_sentence_length,
        percentage_complex_words,
        fog_index,
        avg_words_per_sentence,
        complex_word_count,
        word_count,
        syllables_per_word,
        personal_pronoun_count,
        avg_word_length,
    )


In [163]:
# Load the list of articles (one row per URL) from the input workbook
input_data = pd.read_excel('input.xlsx')

# Start with an empty output frame whose columns are the article identifiers
# followed by one column per computed metric
output_data = pd.DataFrame(
    columns=[
        'URL_ID',
        'URL',
        'POSITIVE SCORE',
        'NEGATIVE SCORE',
        'POLARITY SCORE',
        'SUBJECTIVITY SCORE',
        'AVG SENTENCE LENGTH',
        'PERCENTAGE OF COMPLEX WORDS',
        'FOG INDEX',
        'AVG NUMBER OF WORDS PER SENTENCE',
        'COMPLEX WORD COUNT',
        'WORD COUNT',
        'SYLLABLE PER WORD',
        'PERSONAL PRONOUNS',
        'AVG WORD LENGTH',
    ]
)

In [164]:
# Process each article: read "<URL_ID>.txt", compute its metrics, and collect
# one output row per input row.
output_rows = []
for _, row in input_data.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    # Context manager closes the article file as soon as it has been read.
    with open(f'{url_id}.txt', 'r') as article_file:
        article_text = article_file.read()

    # Compute variables for the article
    variables = compute_variables(article_text)

    output_rows.append([url_id, url] + list(variables))

# Build the output frame once instead of growing it row by row; reuse the
# column layout of the existing (empty) output_data frame and keep the input
# frame's index so rows stay aligned with input_data.
output_data = pd.DataFrame(output_rows, columns=output_data.columns, index=input_data.index)

# Save the output to the Excel file
output_data.to_excel('Output Data Structure.xlsx', index=False)

