In [28]:
import pandas as pd


In [29]:
# Load the input CSV into the working DataFrame. Later cells read its
# 'unmasked_text' and 'masked_text' columns.
df = pd.read_csv(filepath_or_buffer='maker_day_shrieyaa_mini_df.csv')

In [30]:
def create_privacy_mask(unmasked_text, masked_text):
    """Map each placeholder in ``masked_text`` to the text it replaced.

    ``masked_text`` is expected to be ``unmasked_text`` with sensitive spans
    swapped for bracketed placeholders such as ``[USERNAME_1]``. The literal
    text that follows a placeholder (up to the next placeholder or the end of
    the string) is used as an anchor to locate where the redacted value ends
    in ``unmasked_text``.

    Args:
        unmasked_text: Original text containing the sensitive values.
        masked_text: The same text with values replaced by ``[...]``
            placeholders. Every ``[`` is treated as the start of a placeholder.

    Returns:
        dict: ``{placeholder: value}`` where ``placeholder`` includes the
        brackets and ``value`` is stripped of surrounding whitespace. If a
        placeholder occurs more than once, the last occurrence wins. An empty
        dict is returned when there are no placeholders or when either
        argument is not a string (e.g. NaN from an empty CSV cell).

    Notes:
        * Two placeholders with no text between them cannot be separated:
          the first gets ``''`` and the second gets the combined value.
        * If the anchor text is not found in ``unmasked_text``, the value
          extends to the end of ``unmasked_text`` and any later placeholders
          map to ``''``.
    """
    if not isinstance(unmasked_text, str) or not isinstance(masked_text, str):
        return {}

    privacy_mask = {}

    # Everything before the first placeholder is assumed identical in both
    # strings, so the first redacted value starts at the same offset.
    masked_pointer = masked_text.find('[')
    unmasked_pointer = masked_pointer

    while masked_pointer != -1:
        end_placeholder = masked_text.find(']', masked_pointer)
        if end_placeholder == -1:
            break  # Unterminated placeholder: keep what was collected so far

        # Extract the placeholder, e.g. [USERNAME_1]
        placeholder = masked_text[masked_pointer:end_placeholder + 1]

        # Literal text between this placeholder and the next one (or the end).
        # It is kept verbatim, punctuation included, so the delimiter that
        # follows a value is part of the anchor rather than part of the value.
        next_marker_start = masked_text.find('[', end_placeholder + 1)
        is_last = next_marker_start == -1
        literal_end = len(masked_text) if is_last else next_marker_start
        literal = masked_text[end_placeholder + 1:literal_end]

        tail_start = len(unmasked_text) - len(literal)
        if is_last and tail_start >= unmasked_pointer and unmasked_text.endswith(literal):
            # The final literal must sit at the very end of the text, so anchor
            # there instead of swallowing the trailing text into the value.
            unmasked_end = tail_start
        else:
            unmasked_end = unmasked_text.find(literal, unmasked_pointer)
            if unmasked_end == -1:
                unmasked_end = len(unmasked_text)  # Anchor not found: use the rest

        privacy_mask[placeholder] = unmasked_text[unmasked_pointer:unmasked_end].strip()

        # Skip the literal in both strings so the pointers line up again at the
        # start of the next placeholder / redacted value.
        unmasked_pointer = unmasked_end + len(literal)
        masked_pointer = next_marker_start

    return privacy_mask


In [31]:
# Build one placeholder -> value mapping per row by passing that row's
# 'unmasked_text' and 'masked_text' to create_privacy_mask, and store the
# result in a new 'privacy_mask' column.
df['privacy_mask'] = df.apply(
    func=lambda record: create_privacy_mask(record['unmasked_text'], record['masked_text']),
    axis='columns',
)


In [32]:
# Write the DataFrame to 'hi5.csv' without the row index.
df.to_csv(path_or_buf='hi5.csv', index=False)


In [33]:
# Print the source text, its masked form, and the derived mapping side by side
# for a visual check.
print(
    df.loc[:, ['unmasked_text', 'masked_text', 'privacy_mask']]
)


                                        unmasked_text  \
0   Rosemary.King27, do you think your experience ...   
1   Hello, as Direct Security Assistant in Saxony,...   
2   Safety incident reported on Funk Expressway at...   
3   We offer telemedicine to 24408-7904 residents....   
4   Dermatology Clinic Confirmation: Name - Austin...   
..                                                ...   
95  Dear Mr. Powlowski (Jacky), Regarding your rec...   
96  I'm seeking therapy. I live in 41821-3166, my ...   
97  As Cis woman, Mrs. Smitham, you should be awar...   
98  Booking confirmation for the educational leade...   
99  After attending the IP workshop, we've identif...   

                                          masked_text  \
0   [USERNAME_1], do you think your experience as ...   
1   Hello, as [JOBTITLE_1] in [STATE_1], you are i...   
2   Safety incident reported on [STREET_1] at [TIM...   
3   We offer telemedicine to [ZIPCODE_1] residents...   
4   [ORGANIZATION_1] Confirmat