# Introduction
In this jupyter notebook, all utility scripts for the Narcissism Project are showcased diligently. Most of the following code blocks have proper documentation, and require little to no modification in order to run on most computers.

# analyst.py
This script reads the file whose path is passed as an argument and counts the occurrences of the keywords "I", "me", "my", "mine", "myself", "we", "us", "our", "ours" and "ourselves". A narcissism ratio is then produced by dividing the number of singular pronouns by the number of plural pronouns. Finally, the results are stored in a dictionary data structure for later manipulation.

### Package Imports
- `re`: regular expression package for text cleaning.
- `os`: package for path manipulation between folders.

In [1]:
# analyst.py

# Packages
import re
import os

def analyst(filename):
    '''
    Count first-person pronouns in a text file and derive a narcissism ratio.

    The file (in this project a letter to shareholders) is read as UTF-8 with
    undecodable bytes ignored. "I" is matched case-sensitively on the original
    text; every other pronoun is matched as a whole word on a lowercased copy.

    Input:
        filename: path of the .txt file of a single letter to shareholders.

    Output:
        None if the file could not be opened or read (a message is printed),
        otherwise a dictionary with the following items:
            "file_name": the value passed in as `filename`, unchanged.
            "I": occurrences of "I" in the document.
            "me": occurrences of "me" in the document.
            "my": occurrences of "my" in the document.
            "mine": occurrences of "mine" in the document.
            "myself": occurrences of "myself" in the document.
            "we": occurrences of "we" in the document.
            "us": occurrences of "us" in the document.
            "our": occurrences of "our" in the document.
            "ours": occurrences of "ours" in the document.
            "ourselves": occurrences of "ourselves" in the document.
            "word_count": number of words in the document.
            "narc_ratio": singular count divided by plural count, or 0 when
                          the document contains no plural pronouns.
    '''
    # Pronouns matched on the lowercased text. "I" is handled separately below.
    singular_pronouns = ("me", "my", "mine", "myself")
    plural_pronouns = ("we", "us", "our", "ours", "ourselves")

    # Only the read can fail; the context manager closes the file on every
    # path, so no manual close (and no unbound-variable risk) is needed.
    try:
        with open(filename, "r", encoding="utf-8", errors="ignore") as file:
            text = file.read()
    except FileNotFoundError:
        print(f"File not found: {filename}. Skipping.")
        return None
    except OSError as e:
        print(f"Error processing {filename}: {e}")
        return None

    text_lower = text.lower()

    # "I" is counted on the original text so that a lowercase "i" (e.g. a list
    # marker) is not mistaken for the pronoun.
    counts = {"I": len(re.findall(r"\bI\b", text))}
    # NOTE(review): after lowercasing, an upper-case "US" is also counted as
    # the pronoun "us" -- confirm whether that is intended.
    for pronoun in singular_pronouns + plural_pronouns:
        counts[pronoun] = len(re.findall(rf"\b{pronoun}\b", text_lower))

    total_word_count = len(re.findall(r"\b\w+\b", text))

    nominator = counts["I"] + sum(counts[p] for p in singular_pronouns)
    denominator = sum(counts[p] for p in plural_pronouns)
    # Avoid division by zero: a letter without plural pronouns gets ratio 0.
    narc_ratio = nominator / denominator if denominator else 0

    # One flat dictionary per letter, so a list of results converts directly
    # into a table with one column per key.
    results = {"file_name": filename}
    results.update(counts)
    results["word_count"] = total_word_count
    results["narc_ratio"] = narc_ratio
    return results


# extractor.py
This script extracts the required columns from the main Excel dataset. This is done in order to avoid manipulating the main file, and to save time by condensing the data into a smaller Excel spreadsheet.

### Package Imports
- `pandas`: package for reading Excel files and manipulating tabular data efficiently.

In [2]:
# extractor.py

# Packages
import pandas as pd

def extractor(file_path):
    '''
    Copy the columns needed for the analysis out of the Master dataset into a
    smaller Excel spreadsheet, leaving the Master file itself untouched.

    Input:
        file_path: path of the Master dataset (an Excel file) to read.

    Output:
        No return value. Writes "extracted_data.xlsx" in the current working
        directory, containing only the selected columns and no index column.
    '''
    # Columns are selected by header name. The spreadsheet positions noted by
    # the original author are kept for reference:
    #   id_firm    (column A)
    #   firm_name  (column B)
    #   coder      (column M)  - also the name of the parent folder which
    #                            stores the letters to shareholders
    #   year       (column N)
    #   letter     (column CN) - filename of the letter to shareholders
    wanted_columns = ["id_firm", "firm_name", "coder", "year", "letter"]

    # Load the whole workbook, then keep just the wanted columns in the order
    # listed above.
    condensed = pd.read_excel(file_path)[wanted_columns]

    # Persist the condensed table for the later pipeline steps.
    condensed.to_excel("extracted_data.xlsx", index=False)

    # Tell the user that this step has finished.
    print("Extraction complete...")


# config.json
This utility file stores 5 Boolean (`true` or `false`) variables which make program manipulation much easier to perform. For example, once the required data has been extracted by the `extractor.py` function, there is no need to extract it again.

# setup.py
In order to correctly use the `config.json` file inside the python script, it is mandatory to load it and extract its data. This is done by the `setup.py` script, which loads the JSON file and returns its data in a Python format for the programs to use later.

### Package Imports
- `json`: library which provides code for JSON file manipulation.

In [3]:
# setup.py

# Packages
import json

def load_config_flags(config_file="config.json"):
    '''
    Read the JSON configuration file and return the five pipeline flags.

    Input:
        config_file: path to the config.json file.

    Output:
        tuple: (extract_data, analyse_data, global_search, local_search,
               merge_data). A flag missing from the file defaults to False.
               If the file cannot be opened, is not valid JSON, or does not
               contain a JSON object, a message is printed and every flag is
               returned as False, so callers can always unpack five values.
    '''
    # Order matters: callers unpack the tuple positionally.
    flag_names = (
        "extract_data",
        "analyse_data",
        "global_search",
        "local_search",
        "merge_data",
    )

    try:
        # JSON text is UTF-8 by specification; do not rely on the locale.
        with open(config_file, "r", encoding="utf-8") as f:
            config = json.load(f)

        # AttributeError is raised here when the top-level JSON value is not
        # an object (e.g. a list), which is treated as an unusable config.
        return tuple(config.get(name, False) for name in flag_names)

    except (OSError, ValueError, AttributeError) as e:
        print(f"An unexpected error occured: {e}")
        # Fall back to "do nothing" rather than returning None, which would
        # make the caller's tuple unpacking fail with an unrelated TypeError.
        return (False,) * len(flag_names)


# merger.py
This script extracts important data from various Excel spreadsheets and merges it correctly and efficiently into a new Excel sheet. This is done with the singular purpose of making the STATA analysis of the new document more efficient, both for the computer and the user.

### Package Imports
- `pandas`: package for reading Excel files and manipulating tabular data efficiently.
- `Path from pathlib`: a path manipulation library to efficiently shift between sub-folders.
- `setup`: the config.json loader script from before.

In [4]:
# merger.py

# Packages
import pandas as pd             
from pathlib import Path        
from setup import *          

def merger():
    '''
    Join the per-letter narcissism counts onto the extracted master data and
    save the result as a single Excel document, for later use in STATA.

    Reads "narcissism_analysis.xlsx" (one row per analysed letter, with the
    letter's path in the "file_name" column) and "extracted_data.xlsx" (the
    condensed master sheet, with the letter's file name in "letter").

    Output:
        No return value. Writes "merged_narcissism.xlsx": every row of the
        master sheet, followed by the pronoun counts of the matching letter.
        Master rows without an analysed letter keep empty count cells.
    '''
    # Load excel files
    analysis_df = pd.read_excel("narcissism_analysis.xlsx")  # Narcissism data
    master_df = pd.read_excel("extracted_data.xlsx")         # Master Excel sheet

    # The master sheet identifies a letter by its bare file name, while the
    # analysis sheet stores a path ("Letters/filename_DATE_ID.txt" or
    # "Letters/STUDENT/filename_DATE_ID.txt"). Reduce every path to its last
    # component. Both "/" and "\" are treated as separators so a sheet
    # produced on Windows merges correctly on other systems. This runs
    # regardless of the search flags in config.json: the join key is needed
    # in every case, and it is the same for both search modes.
    analysis_df["letter"] = analysis_df["file_name"].map(
        lambda path: str(path).replace("\\", "/").rsplit("/", 1)[-1]
    )

    # NOTE(review): if two analysed files share a file name (e.g. in different
    # sub-folders), the left join repeats the matching master row -- confirm
    # that file names are unique.
    merged_df = master_df.merge(
        # Drop the path column; "letter" now carries the identifying name.
        analysis_df.drop(columns=["file_name"]),
        # Match each master row to its letter by file name.
        on="letter",
        how="left",
    )

    # Save updated file
    merged_df.to_excel("merged_narcissism.xlsx", index=False)

    # Print a flag on screen to showcase termination of the merging operation.
    print("Merging operation complete...")

# narcissism.py
This script calls all of the above functions at the appropriate stages, in order to extract data from the master Excel spreadsheet, analyse each letter to shareholders independently, and merge the results into an Excel spreadsheet that is then going to be used in STATA.

### Package Imports
- `pandas`: package for reading Excel files and manipulating tabular data efficiently.
- `Path from pathlib`: a path manipulation library to efficiently shift between sub-folders.
- `setup`: the config.json loader script from before.
- `re`: regular expression package for text cleaning.
- `extractor`: custom extractor function defined above.
- `analyst`: custom analyst function defined above.
- `merger`: custom merger function defined above.
- `setup`: custom setup function defined above.

In [5]:
# narcissism.py

# Packages & External Libraries
import pandas as pd
from extractor import *
from analyst import *  
from merger import *
import re
from pathlib import Path
from setup import *

def main():
    '''
    Run the narcissism pipeline: extract, analyse and merge.

    Each stage is switched on or off by a flag in config.json, so a stage
    that has already been completed (e.g. extracting data from the Master
    spreadsheet) does not have to be repeated. Edit config.json and set the
    corresponding variables to "true" or "false". DONT USE CAPS, THIS IS NOT
    PYTHON.

    Stages:
        extract_data: condense "Firm_Data.xlsx" into a smaller spreadsheet.
        analyse_data: count pronouns in every letter under "Letters" and
                      write "narcissism_analysis.xlsx". Needs one of
                      global_search / local_search to be true; if neither is,
                      a message is printed and no file is written.
        merge_data:   merge the two spreadsheets into one.
    '''
    # Yield the config variables from config.json
    extract_data, analyse_data, global_search, directory_search, merge_data = load_config_flags()

    # Stage 1: extract the required columns from the main Excel spreadsheet.
    if extract_data:
        file_path = "Firm_Data.xlsx"
        extractor(file_path)

    # Stage 2: analyse each letter to shareholders for singular/plural
    # pronouns, in order to generate the narcissism ratio for each CEO.
    if analyse_data:
        root_folder = Path("Letters")   # Folder which contains the letters

        if global_search:
            # Every .txt file below the root folder, at any depth.
            txt_files = root_folder.rglob("*.txt")
        elif directory_search:
            # Only .txt files sitting directly inside the root folder's
            # immediate sub-folders.
            txt_files = (
                txt_file
                for subfolder in root_folder.iterdir()
                if subfolder.is_dir()
                for txt_file in subfolder.glob("*.txt")
            )
        else:
            # Both search options are turned off.
            # If you get this message check variables global_search &
            # local_search at the config.json
            txt_files = None
            print("No search method has been defined.")

        # Without a search method there is nothing to analyse. Skip the save
        # so an existing results file is not overwritten by an empty sheet.
        if txt_files is not None:
            # analyst returns nothing for files it could not read; keep only
            # the letters that produced a result.
            results = [result for result in map(analyst, txt_files) if result]

            # Convert to a pandas DataFrame data structure for easier manipulation
            df = pd.DataFrame(results)

            # Save to Excel (and CSV)
            df.to_excel("narcissism_analysis.xlsx", index=False)
            #df.to_csv("narcissism_analysis.csv", index = False)   # uncomment this line if you also want a .csv file

            # Print a flag on screen to showcase termination of narcissism analysis operation.
            print("Analysis complete, see 'narcissism_analysis.xlsx' for results.")

    # Stage 3: merge extracted_data.xlsx and narcissism_analysis.xlsx together
    # into a single Excel spreadsheet.
    if merge_data:
        merger()


if __name__ == "__main__":
    main()

  warn(msg)


   id_firm         firm_name           coder    year             letter
0        1  3D Systems Corp.  Marvin Hanisch  2005.0  letter_2005_1.txt
1        1  3D Systems Corp.  Marvin Hanisch  2006.0  letter_2006_1.txt
2        1  3D Systems Corp.  Marvin Hanisch  2007.0  letter_2007_1.txt
3        1  3D Systems Corp.  Marvin Hanisch  2008.0  letter_2008_1.txt
4        1  3D Systems Corp.  Marvin Hanisch  2009.0  letter_2009_1.txt
Extraction complete...
Analysis complete, see 'narcissism_analysis.xlsx' for results.
Merging operation complete...


# cata.py
This script checks the business part of each 10-K filing for each company in the specified root folder. It then imports the dictionary in which the clusters of digitalization words are stored, and computes the adt ratio of each 10-K filing based on that dictionary. Finally, the results are stored in a separate Excel file, for easier manipulation later in STATA.

### Package Imports
- `pandas`: package for reading Excel files and manipulating tabular data efficiently.
- `Path from pathlib`: a path manipulation library to efficiently shift between sub-folders.
- `re`: regular expression package for text cleaning.
- `os`: package for path manipulation between each folder.
- `defaultdict from collections`: an enhanced dictionary data structure which complements Python's built-in `dict`.

In [6]:
# cata.py

# Packages
import pandas as pd
import os
import re
from collections import defaultdict
from pathlib import Path


# Runs of whitespace and hyphens collapse to a single underscore. The same
# rule is applied to dictionary keywords and to document text, so multi-word
# and hyphenated phrases ("big data", "big-data") compare equal in both.
_SEPARATOR_RUN = re.compile(r"[\s\-]+")


def _normalize(text):
    """Lowercase *text* and replace each whitespace/hyphen run with "_"."""
    return _SEPARATOR_RUN.sub("_", text.lower())


# Import the dictionary in which the keywords and clusters are stored.
df_dict = pd.read_excel("full_dictionary.xlsx")
# Map each category (column header) to the list of its normalized keywords.
category_keywords = {}

# Convert each column into a different category, i.e. map the keywords listed
# under each header to a cluster in Python.
for col in df_dict.columns:
    # astype(str) keeps a stray numeric cell from breaking the string handling.
    words = df_dict[col].dropna().astype(str).tolist()
    normalized_words = (_normalize(w.strip()) for w in words)
    # Drop keywords that are empty or separator-only (they would "match"
    # almost every document) and duplicates (two spellings of one phrase must
    # not be counted twice), keeping the spreadsheet order.
    category_keywords[col] = list(
        dict.fromkeys(kw for kw in normalized_words if kw.strip("_"))
    )

# Specify folder in which the .txt files are stored.
root_folder = Path("Business")

# Recursively search each subfolder in the root folder for .txt files, and
# store their paths in a list.
text_files = list(root_folder.rglob("*.txt"))

# Print a flag on screen to showcase the number of files to be analysed. This
# serves as an indication for the user.
print(f"Found {len(text_files)} text files to process.")

# One dictionary per analysed file; becomes one row of the output sheet.
output = []

# Iterate over each path (STUDENT/business_DATA_ID.txt) and analyse the file
# for digitalization ratios.
for filepath in text_files:
    # Only reading can fail; report the file and carry on with the next one
    # so a single bad file does not end the whole run.
    try:
        with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
            raw_text = f.read()
    except OSError as e:
        print(f"Error reading {filepath}: {e}")
        continue

    normalized_text = _normalize(raw_text)

    # Count the total words in the document (on the original text).
    word_count = len(re.findall(r"\b\w+\b", raw_text))

    # Build the output row in an appropriate format for later use in STATA:
    # the file name, then one column per word cluster in the dictionary.
    row = {"Filename": filepath.name}
    total_hits = 0
    for category, keywords in category_keywords.items():
        # A keyword scores one hit if it appears anywhere in the document,
        # however many times it occurs.
        # NOTE(review): this is a substring test, so a short keyword also
        # matches inside longer words -- confirm that this is intended.
        hits = sum(1 for kw in keywords if kw in normalized_text)
        row[category] = hits
        total_hits += hits

    # Add total matches, word count and adt ratio to the final Excel file.
    row["Total_Matches"] = total_hits
    row["Word_Count"] = word_count
    row["Match_Ratio"] = round(total_hits / word_count, 4) if word_count > 0 else 0
    output.append(row)

# Convert the collected rows into a DataFrame and save it as an .xlsx document.
df_out = pd.DataFrame(output)
df_out.to_excel("cluster_keyword_hits.xlsx", index=False)

# Print a flag on screen to showcase termination of file.
print("Output saved to 'cluster_keyword_hits.xlsx'")

Found 1596 text files to process.
Output saved to 'cluster_keyword_hits.xlsx'


# How to use this code?
- Take each cell of this jupyter notebook, and run it in order to initialize each script in Python's memory.
- In case you don't have some of the libraries, then make sure to download them.
- Make sure to change the **PATHS** to the initial dataset, the dictionary etc. **INSIDE** each relevant Python script. Follow Python's traceback errors.
- Run `narcissism.py` and `cata.py`