# In [1]:
import pandas as pd
import os
import re
import chardet

# Load the keywords
keywords_df = pd.read_csv('keywords.csv', encoding='latin1')
keywords = keywords_df['Keyword'].tolist()
#lowercase keywords
keywords = [keyword.lower() for keyword in keywords]

# Get a list of all text files in the TXT directory
txt_files = [f for f in os.listdir('TXT') if f.endswith('.txt')]

# Prepare the data for the output dataframe
data = {
    'Filename': [],
    'KeywordPresent': [],
    'FoundKeywords': []
}

# Go through all txt files
for txt_file in txt_files:
    with open(os.path.join('TXT_no_names', txt_file), 'rb') as file:
        # Detect file encoding
        result = chardet.detect(file.read())
    
    # Re-open the file with the correct encoding
    try:
        with open(os.path.join('TXT_no_names', txt_file), 'r', encoding=result['encoding']) as file:
            # Read file content
            content = file.read()
    except UnicodeDecodeError:
        try:
            with open(os.path.join('TXT_no_names', txt_file), 'r', encoding='utf-8') as file:
                content = file.read()
        except UnicodeDecodeError:
            with open(os.path.join('TXT_no_names', txt_file), 'r', encoding='latin1') as file:
                content = file.read()

    # Apply cleaning operations
    content = re.sub(r'(\d+)$', r'\1.', content, flags=re.MULTILINE)
    content = re.sub(r'http\S+|www\S+|https\S+', '', content, flags=re.MULTILINE)
    content = re.sub(r'\.{2,}', '.', content)
    content = re.sub(r'\n\s*\n', '\n', content).strip()
    content = re.sub(r'\n(?=[a-z])', ' ', content)
    content = content.lower()

    # Find keywords in content
    found_keywords = [keyword for keyword in keywords if re.search(r"\b%s\b" % keyword.replace(".", r"\.").replace("*", "\w*"), content)]
    
    # Append results
    data['Filename'].append(txt_file)
    data['KeywordPresent'].append(bool(found_keywords))  # Convert to bool
    data['FoundKeywords'].append(', '.join(found_keywords))

# Create output dataframe and save as CSV
output_df = pd.DataFrame(data)
output_df.to_csv('output.csv', index=False)
