In [18]:
pip install python-docx

Collecting python-docxNote: you may need to restart the kernel to use updated packages.

  Using cached python-docx-0.8.11.tar.gz (5.6 MB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: python-docx
  Building wheel for python-docx (setup.py): started
  Building wheel for python-docx (setup.py): finished with status 'done'
  Created wheel for python-docx: filename=python_docx-0.8.11-py3-none-any.whl size=184519 sha256=537e4fefbd8a4cde5070579346df9d85af9a7dff1abc8114a86b6069a7f4c05a
  Stored in directory: c:\users\aadhavan\appdata\local\pip\cache\wheels\65\e1\9b\0c38fe6cfe02a9fe31cb6b4efd90985f17354d7f77872f2def
Successfully built python-docx
Installing collected packages: python-docx
Successfully installed python-docx-0.8.11


In [2]:
import nltk
import os
import re
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist

# Download required NLTK resources
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Aadhavan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aadhavan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
from docx import Document

# Path to the .docx file
pos_file_path = "MasterDictionary\positive-words.docx"
neg_file_path = "MasterDictionary\_negative-words.docx"

# Open the document
pos_doc = Document(pos_file_path)
neg_doc = Document(neg_file_path)

# Initialize a variable to store extracted text
positive_words = ""
negative_words = ""

# Extract text from paragraphs
for paragraph in pos_doc.paragraphs:
    positive_words += paragraph.text + "\n"
    
for paragraph in neg_doc.paragraphs:
    negative_words += paragraph.text + "\n"
    


In [5]:
# Define functions for calculations
def calculate_sentiment(text):

    # Clean text using stopwords
    stop_words = set(stopwords.words("english"))
    words = word_tokenize(text)
    cleaned_words = [word.lower() for word in words if word.lower() not in stop_words and word.isalnum()]

    # Calculate positive, negative, polarity, and subjectivity scores
    positive_score = sum(1 for word in cleaned_words if word in positive_words)
    negative_score = sum(1 for word in cleaned_words if word in negative_words)
    total_words = len(cleaned_words)
    polarity_score = (positive_score - negative_score) / (total_words + 0.000001) # to avoid diviso by zero.
    subjectivity_score = (positive_score + negative_score) / (total_words + 0.000001)
    
    return positive_score, negative_score, polarity_score, subjectivity_score 

def calculate_readability(text):
    sentences = sent_tokenize(text)
    total_words = sum(len(word_tokenize(sentence)) for sentence in sentences)
    total_sentences = len(sentences)
    average_sentence_length = total_words / total_sentences
    
    complex_word_count = sum(1 for sentence in sentences for word in word_tokenize(sentence) if syllable_count(word) > 2)
    percentage_complex_words = complex_word_count / total_words
    
    fog_index = 0.4 * (average_sentence_length + percentage_complex_words)
    
    return fog_index

def syllable_count(word):
    vowels = "aeiouAEIOU"
    syllables = 0
    prev_char_is_vowel = False
    
    for char in word:
        if char in vowels:
            if not prev_char_is_vowel:
                syllables += 1
            prev_char_is_vowel = True
        else:
            prev_char_is_vowel = False
    
    if word.endswith(("es", "ed")):
        syllables -= 1
    
    return syllables

def calculate_word_count(text):
    words = word_tokenize(text)
    cleaned_words = [word for word in words if word.isalnum()]
    return len(cleaned_words)

def calculate_average_word_length(text):
    words = word_tokenize(text)
    total_characters = sum(len(word) for word in words)
    return total_characters / len(words) 

def calculate_complex_word_percentage(text):
    words = word_tokenize(text)
    cleaned_words = [word for word in words if word.isalnum()]
    
    complex_word_count = sum(1 for word in cleaned_words if syllable_count(word) > 2)
    total_word_count = len(cleaned_words)
    
    complex_word_percentage = (complex_word_count / total_word_count) * 100
    
    return complex_word_percentage, complex_word_count


In [6]:
# Define folder path
folder_path = "extracted_articles"

# Loop through each text file
results = []

In [7]:
for text_file in os.listdir(folder_path):
    if text_file.endswith(".txt"):
        file_path = os.path.join(folder_path, text_file)
        
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

            # Calculate sentiment analysis scores
            positive_score, negative_score, polarity_score, subjectivity_score = calculate_sentiment(text)

            # calculate syllables
            syllable_per_word =[{word:syllable_count(word)} for word in words]

            # Calculate other metrics
            fog_index = calculate_readability(text)
            word_count = calculate_word_count(text)
            average_word_length = calculate_average_word_length(text)
            complex_word_percentage, complex_word_count = calculate_complex_word_percentage(text)


            # Append results to list
            results.append({
                "File Name": text_file,
                "POSITIVE SCORE":positive_score,
                "NEGATIVE SCORE":negative_score,
                "POLARITY SCORE":polarity_score,
                "SUBJECTIVITY SCORE":subjectivity_score,
                "FOG INDEX": fog_index,
                "WORD COOUNT": word_count,
                "AVERAGE WORD PER LENGTH": average_word_length,
                "SYLLABLE PER WORD":syllable_per_word,
                "COMPLEX WORD COUNT":complex_word_count,
                "PERCENTAGE OF COMPLEX WORDS":complex_word_percentage
            })

# Display or save the results
for result in results:
    print(result)

# Optionally, save results to a CSV file
import pandas as pd

results_df = pd.DataFrame(results)
results_csv_path = "results.csv"
results_df.to_csv(results_csv_path, index=False)

{'File Name': '100.txt', 'POSITIVE SCORE': 196, 'NEGATIVE SCORE': 247, 'POLARITY SCORE': -0.05120481922569797, 'SUBJECTIVITY SCORE': 0.4447791160192981, 'FOG INDEX': 9.171781765010511, 'WORD COOUNT': 1713, 'AVERAGE WORD PER LENGTH': 4.907301916105645, 'SYLLABLE PER WORD': [{'This': 1}, {'is': 1}, {'an': 1}, {'example': 3}, {'sentence': 3}, {'walked': 1}, {'.': 0}], 'COMPLEX WORD COUNT': 403, 'PERCENTAGE OF COMPLEX WORDS': 23.525977816695857}
{'File Name': '101.txt', 'POSITIVE SCORE': 194, 'NEGATIVE SCORE': 246, 'POLARITY SCORE': -0.052261306480139394, 'SUBJECTIVITY SCORE': 0.44221105483194867, 'FOG INDEX': 9.143602750190986, 'WORD COOUNT': 1708, 'AVERAGE WORD PER LENGTH': 4.9111688311688315, 'SYLLABLE PER WORD': [{'This': 1}, {'is': 1}, {'an': 1}, {'example': 3}, {'sentence': 3}, {'walked': 1}, {'.': 0}], 'COMPLEX WORD COUNT': 402, 'PERCENTAGE OF COMPLEX WORDS': 23.536299765807964}
{'File Name': '102.txt', 'POSITIVE SCORE': 195, 'NEGATIVE SCORE': 248, 'POLARITY SCORE': -0.0531594783819