# In [31]:
import os
import pandas as pd
import re

# 1. Load each .txt file in the test folder into a pandas DataFrame
folder_path = 'TXT'


def _normalize_text(raw):
    """Return *raw* lowercased and cleaned up for term counting.

    The substitutions run in this order (each one sees the previous result):

    1. append a period to any line that ends in digits;
    2. remove URL-like tokens starting with ``http`` or ``www``;
    3. collapse runs of two or more periods into one;
    4. collapse blank (or whitespace-only) line gaps into a single newline
       and strip leading/trailing whitespace;
    5. join a line onto the previous one with a space when it starts with a
       lowercase ASCII letter.
    """
    content = raw.lower()
    content = re.sub(r'(\d+)$', r'\1.', content, flags=re.MULTILINE)
    content = re.sub(r'http\S+|www\S+|https\S+', '', content, flags=re.MULTILINE)
    content = re.sub(r'\.{2,}', '.', content)
    content = re.sub(r'\n\s*\n', '\n', content).strip()
    content = re.sub(r'\n(?=[a-z])', ' ', content)
    return content


# os.listdir() returns entries in arbitrary order; sort so that the row order
# of the output files is reproducible between runs and machines.
files = sorted(f for f in os.listdir(folder_path) if f.endswith('.txt'))
data = []

for file in files:
    # Only the file access can fail here (errors='replace' prevents decode
    # errors), so keep the try body to the read and report-and-skip on failure.
    try:
        with open(os.path.join(folder_path, file), 'r', encoding='utf-8', errors='replace') as f:
            raw_text = f.read()
    except OSError as e:
        print(f"Error reading {file}: {e}")
        continue

    data.append({'filename': file, 'text': _normalize_text(raw_text)})

# Name the columns explicitly so 'filename' and 'text' exist even when no file
# could be loaded; otherwise the later df_files['text'] lookup raises KeyError.
df_files = pd.DataFrame(data, columns=['filename', 'text'])

# 2. Load the taxonomy
def safe_lower(val):
    """Return *val* lowercased and whitespace-stripped if it is a string.

    Any non-string value (numbers, NaN, None, ...) is returned unchanged.
    """
    if not isinstance(val, str):
        return val
    lowered = val.lower()
    return lowered.strip()

# Read the taxonomy and lowercase/strip every string cell (column headers are
# left untouched). DataFrame.applymap was deprecated in pandas 2.1 in favour of
# DataFrame.map, so use whichever element-wise method this pandas provides.
taxonomy_raw = pd.read_csv('CSI-indexesv3.csv', encoding='ISO-8859-1')
if hasattr(pd.DataFrame, 'map'):
    df_taxonomy = taxonomy_raw.map(safe_lower)
else:
    df_taxonomy = taxonomy_raw.applymap(safe_lower)


# 3. Count occurrences of words from taxonomy in text
def _compile_term_pattern(terms):
    """Compile a regex matching any of *terms* as a whitespace-delimited token.

    Returns ``None`` when there is nothing to search for. Entries that are not
    strings or are empty are skipped: ``re.escape`` cannot take non-strings,
    and an empty term would match bare whitespace instead of a word.

    The delimiters are lookarounds (``(?<!\\S)`` = start of text or preceded by
    whitespace, ``(?!\\S)`` = end of text or followed by whitespace) so the
    separating whitespace is not consumed. With consuming delimiters, two
    adjacent hits such as "risk risk" share one space and only the first is
    counted.
    """
    usable = [term for term in terms if isinstance(term, str) and term]
    if not usable:
        return None
    alternation = '|'.join(re.escape(term) for term in usable)
    return re.compile(r'(?<!\S)(?:' + alternation + r')(?!\S)')


for column in df_taxonomy.columns:
    pattern = _compile_term_pattern(df_taxonomy[column].dropna().tolist())

    if pattern is None:
        # An empty pattern would match at every position of the text and
        # report len(text) + 1 "occurrences"; a column without terms counts 0.
        df_files[column] = 0
        continue

    df_files[column] = df_files['text'].apply(
        lambda text, pattern=pattern: len(pattern.findall(text))
    )


# The raw text is only needed for counting; leave it out of the exported table.
df_output = df_files.drop(columns='text')

# 4. Write the per-file occurrence counts.
df_output.to_csv('output-CSI.csv', index=False)

# 5. Build the presence/absence view: every taxonomy column is capped at 1,
#    all other columns are carried over unchanged.
df_output_bool = df_output.assign(
    **{name: df_output[name].clip(upper=1) for name in df_taxonomy.columns}
)

# 6. Write the capped (0/1) table next to the counts file.
df_output_bool.to_csv('output-CSI_bool.csv', index=False)
