In [None]:
import os
import re

# Filepath configuration
filepath2 = ""
filepath3 = ""

# Dictionary files (paths are kept under their own names so NEGATIVE / POSITIVE
# only ever refer to the loaded word collections)
NEGATIVE_PATH = os.path.join(filepath2, "NEGATIVE.txt")
POSITIVE_PATH = os.path.join(filepath2, "POSITIVE.txt")
SD = os.path.join(filepath3, "SampleData.txt")  # Output file for sentiment analysis results


def _load_word_set(path):
    """Read one word per line from ``path`` and return the words as a frozenset.

    Each line is stripped and lower-cased; lines that are empty after
    stripping are dropped. A set is used because the word collections are only
    consulted with ``in``, and set membership avoids rescanning the whole
    collection for every token of the analysed text.
    """
    with open(path, 'r') as f:
        stripped = (line.strip().lower() for line in f)
        return frozenset(word for word in stripped if word)


# Load the sentiment word lists
NEGATIVE = _load_word_set(NEGATIVE_PATH)
POSITIVE = _load_word_set(POSITIVE_PATH)

# Acquisition-related words
acq = ["acquisition", "merger", "buyout"]

# Helper function to clean and tokenize text
def clean_text(text):
    """Normalise ``text`` and split it into lower-case tokens.

    Digits are deleted outright (so ``"abc123def"`` becomes ``"abcdef"``),
    the punctuation characters listed in the second pattern are each turned
    into a space, and the remainder is lower-cased and split on whitespace.
    Characters not listed in the pattern (for example ``!``) stay attached to
    their token.

    Returns:
        list[str]: the tokens, in order of appearance; empty for empty input.
    """
    without_digits = re.sub(r'\d', '', text)
    spaced_out = re.sub(r'[,.:?$%()"\'&\-_\[\];/`]', ' ', without_digits)
    tokens = spaced_out.lower().split()
    return tokens

# Function to perform sentiment analysis on the provided MD&A text
def analyze_mda_text(mda_text):
    """Count sentiment and acquisition words in an MD&A text.

    The text is tokenised with ``clean_text``; each token is then checked
    against the module-level ``POSITIVE``, ``NEGATIVE`` and ``acq``
    collections. The counts are printed and also returned.

    Args:
        mda_text: the raw text to analyse.

    Returns:
        dict: ``total_words``, ``positive_words``, ``negative_words`` and
        ``acquisition_mentions``, each an integer count.
    """
    tokens = clean_text(mda_text)

    def occurrences_in(lexicon):
        # Number of tokens (repeats included) that appear in the given collection
        return sum(1 for token in tokens if token in lexicon)

    total_words = len(tokens)
    positive_count = occurrences_in(POSITIVE)
    negative_count = occurrences_in(NEGATIVE)
    acquisition_count = occurrences_in(acq)

    # Report to stdout before handing the same numbers back to the caller
    print("Sentiment Analysis Results:")
    print(f"Total words: {total_words}")
    print(f"Positive words: {positive_count}")
    print(f"Negative words: {negative_count}")
    print(f"Acquisition mentions: {acquisition_count}")

    return {
        "total_words": total_words,
        "positive_words": positive_count,
        "negative_words": negative_count,
        "acquisition_mentions": acquisition_count,
    }

# Example MD&A text (replace this with your actual extracted MD&A section).
# Paste the text between the triple quotes; the placeholder is left empty.
mda_text = """
"""

# Perform sentiment analysis
analysis_result = analyze_mda_text(mda_text)

# Save the results to file. The file is opened in append mode, so every run
# adds another four lines to whatever SD already contains.
with open(SD, 'a') as f:
    f.write(f"Total words: {analysis_result['total_words']}\n")
    f.write(f"Positive words: {analysis_result['positive_words']}\n")
    f.write(f"Negative words: {analysis_result['negative_words']}\n")
    f.write(f"Acquisition mentions: {analysis_result['acquisition_mentions']}\n")


Sentiment Analysis Results:
Total words: 4441
Positive words: 56
Negative words: 62
Acquisition mentions: 15


In [None]:
#! pip install textstat
import torch
import textstat
from transformers import BertForSequenceClassification, BertTokenizer

# The tokenizer and the classifier are loaded from the same pretrained checkpoint
FINBERT_CHECKPOINT = 'ProsusAI/finbert'
tokenizer = BertTokenizer.from_pretrained(FINBERT_CHECKPOINT)
model = BertForSequenceClassification.from_pretrained(FINBERT_CHECKPOINT)




In [None]:
import pandas as pd
import numpy as np
# Read the compiled dataset and expose its "MDA Data" column as a NumPy array
mdata = pd.read_csv("compiled_dataset.csv")
mda_text_series = mdata["MDA Data"]
mda_column = np.asarray(mda_text_series)

In [None]:
import torch

def get_input_ids_and_attention_mask_chunk(tokens, chunksize=512,
                                           cls_token_id=101, sep_token_id=102, pad_token_id=0):
    """Split one encoded sequence into fixed-length, specially-delimited chunks.

    The first row of ``tokens['input_ids']`` and ``tokens['attention_mask']``
    is cut into pieces of at most ``chunksize - 2`` elements. Every piece is
    wrapped in a leading ``cls_token_id`` and a trailing ``sep_token_id``
    (attention value 1 for both) and then right-padded with ``pad_token_id``
    (attention value 0) up to exactly ``chunksize`` elements.

    Args:
        tokens: mapping with 2-D tensors under ``'input_ids'`` and
            ``'attention_mask'``; only row 0 is used.
        chunksize: length of every returned chunk, special tokens included.
        cls_token_id: id placed at the start of each chunk (default 101).
        sep_token_id: id placed at the end of each chunk's content (default 102).
        pad_token_id: id used to fill short chunks (default 0).

    Returns:
        tuple[list[Tensor], list[Tensor]]: input-id chunks and the matching
        attention-mask chunks, each tensor of length ``chunksize``.

    Raises:
        ValueError: if ``chunksize`` leaves no room for content between the
            two special tokens.
    """
    if chunksize <= 2:
        raise ValueError("chunksize must be greater than 2 to leave room for the two special tokens")

    content_size = chunksize - 2
    ids = tokens['input_ids'][0]
    mask = tokens['attention_mask'][0]

    # Build the helper tensors from the inputs themselves so they share the
    # inputs' dtype and device; torch.cat rejects tensors on different devices.
    cls_token = ids.new_tensor([cls_token_id])
    sep_token = ids.new_tensor([sep_token_id])
    attended = mask.new_tensor([1])

    input_id_chunks = []
    attention_mask_chunks = []
    for id_piece, mask_piece in zip(ids.split(content_size), mask.split(content_size)):
        id_chunk = torch.cat([cls_token, id_piece, sep_token])
        mask_chunk = torch.cat([attended, mask_piece, attended])

        pad_length = chunksize - id_chunk.shape[0]
        if pad_length > 0:
            # Only the final piece can be short; padded positions are masked out with 0
            id_chunk = torch.cat([id_chunk, ids.new_full((pad_length,), pad_token_id)])
            mask_chunk = torch.cat([mask_chunk, mask.new_zeros(pad_length)])

        input_id_chunks.append(id_chunk)
        attention_mask_chunks.append(mask_chunk)

    return input_id_chunks, attention_mask_chunks

def get_sentiment_analysis(i,chunksize=512):
    """Classify document ``i`` of ``mda_column`` with the module-level model.

    The document is tokenised without special tokens, cut into
    ``chunksize``-long chunks by ``get_input_ids_and_attention_mask_chunk``,
    and all chunks are sent through ``model`` as one batch. The per-chunk
    softmax probabilities are averaged over the chunks.

    Args:
        i: position of the document inside ``mda_column``.
        chunksize: chunk length passed on to the chunking helper.

    Returns:
        int: index of the class with the highest mean probability. How that
        index maps to a label is decided by the loaded model and is not
        visible here.
    """
    # Inference only, so gradient tracking is switched off for the whole pass
    with torch.no_grad():
        encoded = tokenizer.encode_plus(mda_column[i], add_special_tokens=False, return_tensors='pt')

        id_chunks, mask_chunks = get_input_ids_and_attention_mask_chunk(encoded, chunksize)

        # One batch row per chunk
        outputs = model(
            input_ids=torch.stack(id_chunks).long(),
            attention_mask=torch.stack(mask_chunks).int(),
        )

        # outputs[0] is softmaxed over its last dimension, then averaged over the chunk dimension
        per_chunk_probabilities = torch.nn.functional.softmax(outputs[0], dim=-1)
        document_probabilities = per_chunk_probabilities.mean(dim=0)

        return torch.argmax(document_probabilities).item()


In [None]:

# Quick check: classify the first document and print the predicted class index
first_document_class = get_sentiment_analysis(0)
print(first_document_class)

2


In [None]:
import pandas as pd

In [None]:
import csv
import re
import string

def utf8len(s):
    """Return the number of bytes ``s`` takes up once encoded as UTF-8."""
    encoded = s.encode("utf-8")
    return len(encoded)

# Load your master dictionary file
master_dictionary_file = "Loughran-McDonald_MasterDictionary_1993-2021.csv"
co = 0  # Counter of scored documents; its value is written to the "ID" column

# Sentiment categories to tally, each mapped to the amount added per matching token
SENTIMENT_OUTPUT_FIELDS = dict.fromkeys(
    (
        "Negative",
        "Positive",
        "Uncertainty",
        "Litigious",
        "Strong_Modal",
        "Weak_Modal",
        "Constraining",
    ),
    1,
)

# Index every row of the master dictionary by the value of its "Word" column
with open(master_dictionary_file) as dictionary_csv:
    master_dictionary = {
        entry["Word"]: entry
        for entry in csv.DictReader(dictionary_csv, delimiter=",")
    }
print(f"Master dictionary has {len(master_dictionary)} words.")

# Output header: three identifying columns, one column per sentiment category,
# and a final readability column
FIXED_OUTPUT_FIELDS = ["ID", "Name", "Filing_Date", *SENTIMENT_OUTPUT_FIELDS, "Readability"]

# Rows to be written out later; the header list is the first row
data = [FIXED_OUTPUT_FIELDS]

# CSV file containing MD&A data
mda_data_file = "compiled_dataset.csv"  # Change this to your actual file path

# Column positions are looked up from the header once, so they stay correct if
# SENTIMENT_OUTPUT_FIELDS gains or loses a category.
sentiment_column_index = {
    key: FIXED_OUTPUT_FIELDS.index(key) for key in SENTIMENT_OUTPUT_FIELDS
}
readability_column = FIXED_OUTPUT_FIELDS.index("Readability")

# Iterate over each row of the already-loaded dataset and score its MD&A text
for i in range(len(mdata)):
    row = mdata.iloc[i]
    company_name = row['Name']
    filing_date = row['Date']

    raw_text = row['MDA Data']
    # pandas reads empty CSV cells as NaN (a float), which has no .upper();
    # such rows carry no text and are skipped like any other too-short entry.
    if not isinstance(raw_text, str):
        continue

    mda_text = raw_text.upper()  # Tokens are looked up in master_dictionary in upper case
    if len(mda_text) < 50:
        continue

    # Customize tokenization here
    tokens = re.findall(r"\w+", mda_text)
    co = co + 1

    output_data = [0] * len(FIXED_OUTPUT_FIELDS)
    output_data[0] = co
    output_data[1] = company_name
    output_data[2] = filing_date

    # Tally sentiment hits over the tokens that are real dictionary words
    total_tokens = 0
    for token in tokens:
        if token.isdigit() or len(token) <= 1:
            continue
        entry = master_dictionary.get(token)
        if not entry:
            continue

        total_tokens += 1
        for key, weight in SENTIMENT_OUTPUT_FIELDS.items():
            # The dictionary marks "word is not in this category" with the string "0"
            if entry[key] != "0":
                output_data[sentiment_column_index[key]] += weight

    # Convert the raw tallies to a percentage of dictionary words. A text with
    # no dictionary words at all gets 0.0 instead of dividing by zero.
    for column in sentiment_column_index.values():
        if total_tokens:
            output_data[column] = (output_data[column] / total_tokens) * 100
        else:
            output_data[column] = 0.0

    # NOTE(review): readability is computed on the upper-cased text; confirm
    # that textstat's result does not depend on letter case.
    output_data[readability_column] = textstat.gunning_fog(mda_text)

    print(f"Processed Row_No: {i}")
    data.append(output_data)

# Persist the header and every scored row to the results CSV
output_file = "sentiment_analysis_results.csv"
with open(output_file, mode='w', newline='', encoding='utf-8') as result_file:
    csv.writer(result_file).writerows(data)

print(f"Sentiment analysis complete. Results saved to {output_file}")


Master dictionary has 86553 words.
Processed Row_No: 0
Processed Row_No: 1
Processed Row_No: 2
Processed Row_No: 3
Processed Row_No: 4
Processed Row_No: 5
Processed Row_No: 6
Processed Row_No: 7
Processed Row_No: 8
Processed Row_No: 9
Processed Row_No: 10
Processed Row_No: 11
Processed Row_No: 12
Processed Row_No: 13
Processed Row_No: 14
Processed Row_No: 15
Processed Row_No: 16
Processed Row_No: 17
Processed Row_No: 18
Processed Row_No: 19
Processed Row_No: 20
Processed Row_No: 21
Processed Row_No: 22
Processed Row_No: 23
Processed Row_No: 24
Processed Row_No: 25
Processed Row_No: 26
Processed Row_No: 27
Processed Row_No: 28
Processed Row_No: 29
Processed Row_No: 30
Processed Row_No: 31
Processed Row_No: 32
Processed Row_No: 33
Processed Row_No: 34
Processed Row_No: 35
Processed Row_No: 36
Processed Row_No: 37
Processed Row_No: 45
Processed Row_No: 46
Processed Row_No: 47
Processed Row_No: 48
Processed Row_No: 49
Processed Row_No: 50
Processed Row_No: 51
Processed Row_No: 52
Processed

In [None]:
import pandas as pd

# Step 1: Read the original compiled dataset
compiled_df = pd.read_csv('compiled_dataset.csv')

# Step 2: Keep only the rows that the sentiment pass scored. That pass skips
# texts shorter than 50 characters, so the same cut-off must be used here or
# the two tables stop lining up row for row. (Missing texts compare as False
# and are dropped as well.)
compiled_df = compiled_df[compiled_df['MDA Data'].str.len() >= 50]

# Step 3: Drop the 'MDA Data' column and renumber the rows 0..n-1.
# pd.concat(axis=1) pairs rows by index label, not by position; without the
# reset, the gaps left by the filter would pair rows with the wrong sentiment
# scores and add NaN-filled rows.
compiled_df = compiled_df.drop(columns=['MDA Data']).reset_index(drop=True)

# Step 4: Read the sentiment analysis results and keep the 4th column onwards
# (the first three columns are ID, Name and Filing_Date)
sentiment_df = pd.read_csv('sentiment_analysis_results.csv')
sentiment_columns = sentiment_df.iloc[:, 3:]

# Refuse to merge if the two tables do not describe the same number of rows
if len(compiled_df) != len(sentiment_columns):
    raise ValueError(
        f"Row count mismatch: {len(compiled_df)} filtered dataset rows vs "
        f"{len(sentiment_columns)} sentiment rows"
    )

# Step 5: Concatenate the dataframes column-wise
final_df = pd.concat([compiled_df, sentiment_columns], axis=1)

# Step 6: Save the result to a new CSV file
final_df.to_csv('final_dataset.csv', index=False)

print("Final dataset created and saved as 'final_dataset.csv'.")


Final dataset created and saved as 'final_dataset.csv'.


In [None]:
# Write the collected rows to output_file.csv, replacing any existing file.
# newline='' lets the csv module control line endings itself (otherwise rows
# are separated by blank lines on Windows); the encoding matches the one used
# for sentiment_analysis_results.csv.
with open("output_file.csv", "w", newline='', encoding='utf-8') as file:
    write = csv.writer(file)
    write.writerows(data)