# Convert .RTF files to plain-text .txt files

In [1]:
import os
import pypandoc
import hashlib
import re

In [2]:
def file_hash(file_path, chunk_size=1024):
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The file is opened in binary mode and fed to the hasher ``chunk_size``
    bytes at a time, so the whole file is never held in memory at once.
    Two files with the same digest can be treated as having the same content.
    """
    digest = hashlib.md5()
    with open(file_path, "rb") as stream:
        while True:
            block = stream.read(chunk_size)
            # An empty read marks the end of the file.
            if block == b"":
                break
            digest.update(block)
    return digest.hexdigest()

def find_duplicates(folder_path):
    """Identify duplicate .rtf files in a folder by their file names.

    A file counts as a duplicate when its name ends in a parenthesised
    number followed by the ``.rtf`` extension (case-insensitive), such as
    ``"file (1).rtf"`` or ``"file (2).RTF"``. File contents are NOT
    compared, so a numbered file is reported even when no un-numbered
    counterpart exists.

    Args:
        folder_path: Directory to scan (not recursive). It may be given with
            or without a trailing path separator.

    Returns:
        A list of paths, each being ``folder_path`` joined with a matching
        file name, in the order returned by ``os.listdir``.
    """
    # Matches names with a "(1)", "(2)", ... suffix right before the extension.
    pattern = re.compile(r"(.*)\(\d+\)\.rtf$", re.IGNORECASE)

    # os.path.join inserts the separator only when it is missing, so the
    # result is a valid path whether or not folder_path ends with one.
    return [
        os.path.join(folder_path, filename)
        for filename in os.listdir(folder_path)
        if filename.lower().endswith(".rtf") and pattern.match(filename)
    ]

def delete_files(file_list):
    """Remove every path in ``file_list`` from the filesystem.

    Paths are removed one at a time, in the order given. An error raised
    while removing one path propagates to the caller, leaving any later
    paths in the list untouched.
    """
    for doomed_path in file_list:
        os.unlink(doomed_path)



In [5]:
# Newspaper whose articles are processed; this value is interpolated into the
# articles/<newspaper>/... folder paths used by the cells below.
newspaper = "telegraph"

In [6]:
# Delete duplicate files.
# Folder containing the .RTF files. This is a relative path, so it is resolved
# against the current working directory.
folder_path = rf'articles/{newspaper}/rtf/'

# Collect the files that find_duplicates() flags, then remove them from disk.
# NOTE(review): deletion goes through os.remove and cannot be undone, and
# find_duplicates() matches on the "(n)" file-name suffix only (no content
# comparison) - confirm that is intended before running this cell.
duplicates = find_duplicates(folder_path)
delete_files(duplicates)


In [7]:
# Set the input (source) and output (target) folders.
# NOTE(review): these are absolute, machine-specific paths, whereas the
# duplicate clean-up cell uses a path relative to the working directory -
# confirm that both refer to the same rtf folder.
input_folder = rf'C:/Users/user/Documents/PhD Workspace/climate-narratives/articles/{newspaper}/rtf/'
output_folder = rf'C:/Users/user/Documents/PhD Workspace/climate-narratives/articles/{newspaper}/txt/'

# Ensure the output folder exists (no error if it is already there).
os.makedirs(output_folder, exist_ok=True)

# Collect every RTF file in the input folder. The extension check is
# case-insensitive so that ".rtf" and ".RTF" files are both converted,
# matching the case-insensitive matching used by find_duplicates().
filename_list = [filename for filename in os.listdir(input_folder) if filename.lower().endswith(".rtf")]
input_path_list = [os.path.join(input_folder, filename) for filename in filename_list]
# Swap only the final extension for ".txt"; an ".RTF" that happens to appear
# in the middle of a file name is left alone.
output_path_list = [
    os.path.join(output_folder, os.path.splitext(filename)[0] + ".txt")
    for filename in filename_list
]

for filename, input_path, output_path in zip(filename_list, input_path_list, output_path_list):
    # Skip files whose .txt output already exists (e.g. from an earlier run).
    if os.path.exists(output_path):
        continue
    try:
        # Convert the RTF file to plain text with pandoc.
        pypandoc.convert_file(input_path, 'plain', format='rtf', outputfile=output_path)
    except Exception as exc:
        # Best-effort batch: report the failing file and carry on with the
        # rest. Unlike a bare "except", this lets Ctrl-C stop the loop.
        print(f"Could not convert {filename}: {exc}")

# for filename in os.listdir(input_folder):
#     if filename.endswith(".RTF"):
#         input_path = os.path.join(input_folder, filename)
#         output_path = os.path.join(output_folder, filename.replace(".RTF", ".txt"))
#         #print("file found: ", input_path)
#         # Convert the RTF file to TXT
#         pypandoc.convert_file(input_path, 'plain', format='rtf', outputfile=output_path)
#         #print(f"Converted {filename} to .txt")
