In [1]:
import pandas as pd
import spacy
import re
from nltk.corpus import stopwords

In [None]:
# Read the evaluation split into a DataFrame. "your_test_data.csv" is a
# placeholder path; later cells read the 'text' column of this frame.
test_data = pd.read_csv(filepath_or_buffer="your_test_data.csv")

In [3]:
# English stop words from the NLTK corpus; clean_text() drops these tokens.
STOPWORDS = set(stopwords.words('english'))
# Separator-like punctuation that clean_text() replaces with a space.
# Written as a raw string: '\[', '\]' and '\|' are not valid *string* escapes,
# so the non-raw literal produced an invalid-escape warning when the cell ran.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Any character other than a digit, lowercase letter, space, '#', '+' or '_';
# clean_text() deletes these.
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')

# Whole tokens made only of two or more "x" (text is already lowercased when
# this is applied), e.g. "xx" or "xxxx" redaction placeholders.
_REDACTED_TOKEN_RE = re.compile(r'\bx{2,}\b')


def clean_text(text):
    """
    Normalise a raw text string.

    Steps, in order: lowercase; replace the characters matched by
    REPLACE_BY_SPACE_RE with a space; delete the characters matched by
    BAD_SYMBOLS_RE; drop standalone "xx..." placeholder tokens; remove the
    words listed in STOPWORDS.

    :param text: raw text; any non-string value (e.g. NaN from an empty CSV
        cell, or None) is treated as empty text
    :return: cleaned string with tokens separated by single spaces (may be empty)
    """
    if not isinstance(text, str):
        # .lower() would raise AttributeError on NaN/None.
        return ''
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    # Remove only standalone runs of "x". A blanket replace('x', '') also
    # deleted the letter from ordinary words ("anxiety" -> "aniety").
    text = _REDACTED_TOKEN_RE.sub('', text)
    # split()/join also collapses the extra whitespace left by the steps above.
    return ' '.join(word for word in text.split() if word not in STOPWORDS)


# Test split: renumber the rows 0..n-1 (discarding the old index), then
# overwrite the 'text' column with its clean_text() form.
test_data = (
    test_data
    .reset_index(drop=True)
    .assign(text=lambda frame: frame['text'].apply(clean_text))
)

In [5]:
# Load spaCy's English language model ('en_core_web_sm'). The resulting `nlp`
# callable is used by mask_demographic_entities() to obtain the named
# entities (`doc.ents`) of each text.
nlp = spacy.load('en_core_web_sm')

def detect_and_mask_demographics(text):
    """
    Replace explicit age and gender mentions with placeholder tokens.

    Age phrases such as "25 years old", "25-year-old" or "101 years old"
    become ``[AGE]``; the standalone words man / woman / male / female become
    ``[GENDER]``. Matching ignores letter case, so the function also works on
    text that has not been lowercased.

    :param text: input string
    :return: the string with the matched spans replaced by placeholders
    """
    # Up to three digits so that ages of 100 and above are masked as well.
    age_pattern = r'\b\d{1,3}\s*-?\s*years?\s*-?\s*old\b'
    text = re.sub(age_pattern, '[AGE]', text, flags=re.IGNORECASE)

    # \b on both sides keeps words that merely contain a term ("human") intact.
    gender_pattern = r'\b(?:man|woman|male|female)\b'
    text = re.sub(gender_pattern, '[GENDER]', text, flags=re.IGNORECASE)
    return text

def mask_demographic_entities(text):
    """
    Mask demographic cues in a text.

    First the regex pass of detect_and_mask_demographics() replaces age and
    gender mentions; then every spaCy entity whose label is one of AGE, NORP,
    GPE, PERSON or SEXUALITY is replaced by ``[LABEL]``.

    :param text: input string
    :return: the string with the selected spans replaced by placeholders
    """
    # Regex pass: ages and gender words become [AGE] / [GENDER].
    text = detect_and_mask_demographics(text)
    # Entity pass runs on the already regex-masked text.
    doc = nlp(text)
    # NOTE(review): "AGE" and "SEXUALITY" do not look like labels the stock
    # en_core_web_sm NER emits; kept unchanged in case a custom pipeline is
    # loaded -- confirm.
    masked_labels = ("AGE", "NORP", "GPE", "PERSON", "SEXUALITY")

    # Splice placeholders in by character offset. A global
    # text.replace(ent.text, ...) would also rewrite the same characters
    # wherever else they occur, e.g. entity "us" inside "because".
    source = doc.text
    pieces = []
    cursor = 0
    for ent in doc.ents:
        if ent.label_ in masked_labels:
            pieces.append(source[cursor:ent.start_char])
            pieces.append(f'[{ent.label_}]')
            cursor = ent.end_char
    pieces.append(source[cursor:])
    return ''.join(pieces)

# Add a 'masked_text' column: each value is the corresponding 'text' entry
# passed through mask_demographic_entities().
test_data['masked_text'] = test_data['text'].map(mask_demographic_entities)

def add_demographic_info(text, category, value):
    """
    Prefix the text with an "As a/an ..." clause stating a demographic attribute.

    :param text: original text string
    :param category: demographic category; one of 'religion', 'race',
        'gender', 'nationality', 'sexuality', 'age', 'combination'
    :param value: demographic value (e.g., 'female')
    :return: the text with the demographic clause prepended
    :raises ValueError: if ``category`` is not one of the supported categories
    """
    # Choose the article from the first letter so vowel-initial values read
    # correctly ("an Asian individual", "an asexual"). This is a spelling
    # heuristic, not a pronunciation check.
    article = 'an' if str(value)[:1].lower() in ('a', 'e', 'i', 'o', 'u') else 'a'

    if category == 'race':
        return f"As {article} {value} individual, {text}"
    if category == 'nationality':
        return f"As an individual from {value}, {text}"
    if category in ('religion', 'gender', 'sexuality', 'age', 'combination'):
        return f"As {article} {value}, {text}"
    # Falling through used to return None, which silently put nulls into the
    # generated text column.
    raise ValueError(f"Unsupported demographic category: {category!r}")
    
# Demographic categories and their possible values.
# Keys are the category names understood by add_demographic_info(); each list
# holds the values substituted into the generated sentence. 'combination'
# entries mix attributes from several categories in one phrase.
demographics = {
    'religion': ['Christian', 'Muslim', 'Hindu', 'Buddhist', 'Jewish'],
    'gender': ['male', 'female'],
    'race': ['White', 'Black', 'Asian', 'Native American', 'Native Hawaiian or Other Pacific Islander'],
    'nationality': ['USA', 'Canada', 'Mexico', 'Brazil', 'UK', 'Germany', 'Russia', 'Nigeria', 'South Africa', 'China', 'India', 'Japan', 'Saudi Arabia', 'Israel', 'Australia'],
    'sexuality': ['heterosexual', 'homosexual', 'bisexual', 'pansexual', 'asexual'],
    'age': ['youth', 'young adult', 'middle-aged', 'senior'],
    'combination': [
        'Black female youth',
        'middle-aged White male',
        'young adult Hispanic homosexual',
        'Native American asexual',
        'Christian Nigerian female',
        'pansexual Australian youth',
        'Jewish Israeli middle-aged',
        'Black British bisexual',
        'Muslim Saudi Arabian male',
        'Asian American female',
        'Buddhist Japanese senior',
        'Christian Canadian female',
        'heterosexual Russian middle-aged',
        # Was 'Native Hawaiian Pacific or ...': stray "Pacific" removed so the
        # phrase matches the 'race' value above.
        'Native Hawaiian or Other Pacific Islander youth',
        'asexual Chinese young adult',
        'homosexual Black female',
        'bisexual Brazilian middle-aged',
        'Hindu Indian female',
        'pansexual German youth',
        'Jewish American middle-aged',
        'homosexual Asian male',
        'Buddhist Chinese female',
        'heterosexual White senior',
        'asexual Japanese young adult',
    ],
}

# Label columns of the SAD example; used when the caller passes no label_columns.
_SAD_LABEL_COLUMNS = (
    'Financial Problem', 'Other', 'Everyday Decision Making',
    'Emotional Turmoil', 'School', 'Family Issues', 'Social Relationships',
    'Work', 'Health, Fatigue, or Physical Pain',
)


# Expand the DataFrame with demographic variations, SAD as an example here
def expand_with_demographics(df, demographics, label_columns=None):
    """
    Create one output row per (input row, demographic value) pair.

    For every row of ``df`` and every value of every category in
    ``demographics``, the row's 'masked_text' is passed through
    add_demographic_info() and stored as 'text'; the row's label columns are
    copied unchanged, and 'category' / 'demographic' record which variation
    was applied.

    :param df: DataFrame with a 'masked_text' column and the label columns
    :param demographics: dict mapping category name -> list of values
    :param label_columns: names of the columns to carry over; defaults to the
        SAD label columns
    :return: new DataFrame with columns 'text', the label columns, 'category',
        'demographic' (in that order); it has these columns even when ``df``
        has no rows
    :raises KeyError: if a row lacks 'masked_text' or one of the label columns
    """
    if label_columns is None:
        label_columns = _SAD_LABEL_COLUMNS
    label_columns = list(label_columns)

    new_rows = []
    for _, row in df.iterrows():
        # Labels do not depend on the demographic, so read them once per row.
        labels = {column: row[column] for column in label_columns}
        for category, values in demographics.items():
            for value in values:
                modified_text = add_demographic_info(row['masked_text'], category, value)
                new_rows.append({'text': modified_text, **labels,
                                 'category': category, 'demographic': value})

    # Explicit columns keep the schema stable when there are no rows; without
    # them an empty input produced a frame with no columns at all.
    return pd.DataFrame(new_rows, columns=['text', *label_columns, 'category', 'demographic'])


# Build the expanded frame (one row per original row and demographic value),
# then order the columns: text and demographic descriptors first, label
# columns after them.
test_data = expand_with_demographics(test_data, demographics)[[
    'text', 'category', 'demographic',
    'Financial Problem', 'Other', 'Everyday Decision Making',
    'Emotional Turmoil', 'School', 'Family Issues', 'Social Relationships',
    'Work', 'Health, Fatigue, or Physical Pain',
]]