# In [1]:
import os
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from openpyxl import Workbook
from pdfminer.high_level import extract_text
import pyphen

# Download the NLTK resources used below: tokenizer models and the stop-word list.
# 'punkt_tab' is the tokenizer data that newer NLTK releases load in place of the
# pickled 'punkt' models; fetching both keeps word_tokenize/sent_tokenize working
# on old and new installations alike.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# URL of the Loughran-McDonald dictionary CSV file
url = 'https://drive.google.com/u/0/uc?id=17CmUZM9hGUdGYjCXcjQLyybjTrcjrhik&export=download'

# Load the Loughran-McDonald dictionary into a DataFrame (performs a network request)
loughran_mcdonald_dict = pd.read_csv(url)

# Column holding the dictionary words, and the columns tallied per document.
# NOTE(review): 'Syllables' is listed alongside the sentiment categories, so
# sentiment_analysis treats it as one more "value > 0" column -- confirm this
# is intended.
word_column = 'Word'
category_columns = ['Negative', 'Positive', 'Uncertainty', 'Litigious', 'Constraining', 'Strong_Modal', 'Weak_Modal', 'Syllables']

# Pyphen hyphenation dictionary used by count_syllables
dic = pyphen.Pyphen(lang='en')

def sentiment_analysis(text, dictionary):
    """Count how many tokens of ``text`` fall into each dictionary category.

    The text is tokenized with ``word_tokenize`` and English stop words are
    dropped.  Each remaining token is upper-cased and looked up in the
    ``word_column`` column of ``dictionary``; it counts towards a column of
    ``category_columns`` when the dictionary value in that column is greater
    than zero.  If a word appears in several dictionary rows, the first row
    is used.

    Args:
        text: Raw document text.
        dictionary: DataFrame containing ``word_column`` and every column
            named in ``category_columns``.

    Returns:
        dict mapping each name in ``category_columns`` to its token count.
    """
    # Load the stop-word list once; a set gives constant-time membership tests
    # instead of re-reading and scanning the list for every token.
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in word_tokenize(text) if word.lower() not in stop_words]

    # Build a word -> [matching categories] table in a single pass over the
    # dictionary so each token costs one hash lookup rather than a full scan.
    # setdefault keeps the first row seen for a repeated word.
    categories_by_word = {}
    lookup_columns = [word_column] + category_columns
    for row in dictionary[lookup_columns].itertuples(index=False, name=None):
        categories_by_word.setdefault(
            row[0],
            [category for category, value in zip(category_columns, row[1:]) if value > 0],
        )

    # Tally every token occurrence under each category its word belongs to.
    results = {category: 0 for category in category_columns}
    for word in tokens:
        for category in categories_by_word.get(word.upper(), ()):
            results[category] += 1

    return results

def count_syllables(words):
    """Return the total syllable count over ``words``.

    Each word is hyphenated with the module-level Pyphen instance ``dic``;
    the number of hyphen-separated pieces is taken as its syllable count.
    """
    total = 0
    for token in words:
        hyphenated = dic.inserted(token)
        # N hyphens split a string into N + 1 pieces.
        total += hyphenated.count('-') + 1
    return total

def count_words_sents_syllables(text):
    """Return ``(token count, sentence count, syllable count)`` for ``text``.

    Tokens come from ``word_tokenize`` and sentences from ``sent_tokenize``;
    syllables are counted over the same token list via ``count_syllables``.
    """
    sentence_total = len(sent_tokenize(text))
    word_tokens = word_tokenize(text)
    syllable_total = count_syllables(word_tokens)
    return len(word_tokens), sentence_total, syllable_total

# Input folder with the PDF files and output folder for the Excel workbook
pdf_directory = r'/Users/flaviomotta/Desktop/fed_minutes'
excel_directory = r'/Users/flaviomotta/Desktop/fed_minutes_excel'

# Make sure the output folder exists up front, so a missing directory fails
# (or is created) before the PDFs are processed rather than at save time.
os.makedirs(excel_directory, exist_ok=True)

# Create a workbook and use its active worksheet
wb = Workbook()
ws = wb.active

# Write the header row in the worksheet
ws.append(['Filename'] + category_columns + ['Word Count', 'Sentence Count', 'Syllable Count', 'LMSI'])

# os.listdir returns entries in arbitrary order; sort for reproducible row order.
for filename in sorted(os.listdir(pdf_directory)):
    if not filename.endswith('.pdf'):
        continue

    # Use pdfminer to extract text
    text = extract_text(os.path.join(pdf_directory, filename))

    # Perform sentiment analysis
    results = sentiment_analysis(text, loughran_mcdonald_dict)

    # Count words, sentences, and syllables
    word_count, sent_count, syllable_count = count_words_sents_syllables(text)

    # LMSI = Positive / (Positive + Negative). A document with no positive or
    # negative hits has no defined ratio; leave the cell empty instead of
    # raising ZeroDivisionError.
    polar_total = results['Positive'] + results['Negative']
    lmsi = results['Positive'] / polar_total if polar_total else None

    # Write the results to the worksheet
    ws.append(
        [filename]
        + [results[category] for category in category_columns]
        + [word_count, sent_count, syllable_count, lmsi]
    )

    print(filename)
    print(results)
    print(f'Word Count: {word_count}, Sentence Count: {sent_count}, Syllable Count: {syllable_count}')

# Save the workbook as an Excel file
output_file = os.path.join(excel_directory, 'combined_results.xlsx')
wb.save(output_file)
print(f"Results saved to: {output_file}")


# Notebook output: ModuleNotFoundError: No module named 'nltk'