In [7]:
from simple_salesforce import Salesforce, SalesforceMalformedRequest
import pandas as pd
from dotenv import load_dotenv
from os import getenv
import logging

# --- ENVIRONMENT SETUP ---
load_dotenv()
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- CONFIGURATION ---
SALESFORCE_USERNAME = getenv("SF_USERNAME")
SALESFORCE_PASSWORD = getenv("SF_PASSWORD")
SALESFORCE_SECURITY_TOKEN = getenv("SF_TOKEN")
CSV_FILE = "/Users/eliassantibanez/Documents/eli_projects/doug/Faker/fake_properties.csv"

# --- AUTHENTICATION ---
def connect_to_salesforce():
    try:
        sf = Salesforce(
            username=SALESFORCE_USERNAME,
            password=SALESFORCE_PASSWORD,
            security_token=SALESFORCE_SECURITY_TOKEN,
            domain="test"  # Change to "login" if production
        )
        logger.info("✅ Connected to Salesforce")
        return sf
    except Exception as e:
        logger.error(f"❌ Failed to connect to Salesforce: {e}")
        raise

sf = connect_to_salesforce()

# Load actual field list
fields_df = pd.read_csv("/Users/eliassantibanez/Documents/eli_projects/doug/Field and Values/exports/Data_Management__c_fields.csv")
fields = ', '.join(fields_df['Field Name'].tolist())

# Query Salesforce for Data Management records
soql = f"SELECT {fields} FROM Data_Management__c ORDER BY LastModifiedDate DESC LIMIT 100"
records = sf.query_all(soql)['records']

df = pd.DataFrame(records).drop(columns='attributes')
# df.to_csv("data_management_export.csv", index=False)


from random import random

def fuzzy_street_number(x):
    if pd.isna(x): return x
    s = str(x)
    if random() < 0.15: s += "A"
    elif random() < 0.15: s += "-1"
    return s

def fuzzy_street_number_numeric(x):
    if pd.isna(x): return x
    try:
        n = float(x)
    except:  # if a stray string sneaks in, just return it
        return x
    r = rng.random()
    if r < 0.15: return n + 1
    if r < 0.30: return n - 1
    return n


def fuzzy_street_name(x):
    if pd.isna(x): return x
    s = str(x)
    if random() < 0.25: s = s.replace("Street", "St.").replace("Avenue", "Ave.").replace("Road","Rd.")
    if random() < 0.15: s = s.replace("e","3").replace("o","0")
    if random() < 0.15: s = s.upper()
    if random() < 0.12: s += f" Unit {int(1+random()*20)}"
    return s

def fuzzy_city(x):
    if pd.isna(x): return x
    s = str(x)
    if random() < 0.2: s = s + " City"
    if random() < 0.15: s = s.replace("o","0")
    if random() < 0.1: s = s.title()
    return s

def fuzzy_zip(x):
    if pd.isna(x): return x
    s = str(x)
    if random() < 0.3 and len(s) >= 5: s = s[:5]
    if random() < 0.2: s = s + "-1234"
    return s

def fuzzy_parcel(pid):
    if pd.isna(pid): return pid
    s = str(pid)
    if random() < 0.25: s = s.replace("-", "")
    if random() < 0.15: s = s + "-01"
    if random() < 0.1:  s = s.upper()
    return s

def fuzzy_legal(desc):
    if pd.isna(desc): return desc
    s = str(desc)
    if random() < 0.25: s = s.upper()
    if random() < 0.15: s = s.replace("Lot", "L0t").replace("Block","Blk")
    if random() < 0.1:  s = s.replace(",", "")
    return s

df_mut = df.copy()

import numpy as np



INFO:__main__:✅ Connected to Salesforce


In [8]:

# --- CATEGORY MIX (must sum to 1.0) ---
CAT_PCT = {
    "Match":        0.60,
    "Needs Review": 0.25,
    "Duplicate":    0.15,
}

rng = np.random.default_rng(42)  # reproducible
labels = np.array(list(CAT_PCT.keys()))
probs  = np.array(list(CAT_PCT.values()), dtype=float)

# Write the category to a field you already have in SF
df_mut["Match_Status__c"] = rng.choice(labels, size=len(df_mut), p=probs)



In [9]:

# --- CONTROL MUTATION RATE ---
MUTATE_PERCENT = 40  # set to 100 for all, 0 for none, etc.

# Create a mask where True means "mutate this row"
if MUTATE_PERCENT >= 100:
    mask = pd.Series([True] * len(df_mut))
elif MUTATE_PERCENT <= 0:
    mask = pd.Series([False] * len(df_mut))
else:
    mask = pd.Series(range(len(df_mut))) % int(100 / MUTATE_PERCENT) == 0

# Apply fuzzy functions only to rows where mask == True
cols_to_mutate = [
    ("Street_Number__c", fuzzy_street_number_numeric),
    ("Street_Name__c",   fuzzy_street_name),
    ("City__c",          fuzzy_city),
    ("Postal_Code__c",   fuzzy_zip),
    ("Parcel_ID__c",     fuzzy_parcel),
    ("Legal_Description__c", fuzzy_legal),
]
for col, fn in cols_to_mutate:
    if col in df_mut.columns:
        df_mut.loc[mask, col] = df_mut.loc[mask, col].map(fn)

logger.info(f"Mutations applied to ~{MUTATE_PERCENT}% of rows ({mask.sum()} of {len(mask)})")



INFO:__main__:Mutations applied to ~40% of rows (50 of 100)


In [10]:
df_mut = df_mut.drop(columns=["IsDeleted","Name","Created_By_Script__c","Id","CreatedById","LastModifiedDate","LastModifiedById","SystemModstamp","LastActivityDate", "LastViewedDate", "CreatedDate","LastReferencedDate","Processed__c"])

In [11]:
df_mut

Unnamed: 0,OwnerId,Parcel_ID__c,Property_Address__c,Legal_Description__c,Auction_Date__c,Bedrooms__c,City__c,County__c,Deceased__c,Estimated_Equity__c,...,Mortgage_Foreclosure__c,Mailing_Address_Owner_1__c,Mailing_Address_1__c,Mailing_Address_2__c,Mailing_Address_3__c,Mailing_Address_4__c,Years_Owned__c,Equity_Percentage__c,Race__c,Last_Updated_By_Script__c
0,005Uy00000ZbNcrIAF,4790364F-01,"9226 Lisa Radial Suite 535, New Jasonfort, WI ...",Drop defense thing magazine. trial,2025-08-13,1.7,diazhaven,Bexar County,False,386195.82,...,False,True,"7587 Anthony Spurs, South Andreaport, GA 26036","4848 James Common Apt. 205, Glennview, VT 91355","44548 Morales Radial Apt. 384, Castilloview, K...","1590 Hamilton Cliffs, Burtonshire, VT 75741",,,Black or African American,
1,005Uy00000ZbNcrIAF,6DAA4704,"33446 Gallagher Village, Lake Jerry, PW 52137",Her the name.,2025-07-18,2.0,EAST RYANSHIRE,Harris County,False,344184.73,...,False,True,"00357 Brian Junctions, New Michael, AL 62839","Unit 5881 Box 6565, DPO AE 08305","979 Ford Way, Port Cody, CA 62522","4766 Coleman Plains, New Ethan, WA 09676",,,Black or African American,
2,005Uy00000ZbNcrIAF,8de4a792,"80470 Miller Roads Apt. 368, Benjaminview, VA ...",MOUTH ALREADY FORM INCLUDE.,2025-08-06,1.1,williamschester City,Bexar County,False,399362.73,...,False,False,"76589 Eileen Extensions Suite 795, Paulport, M...","67655 Linda Square, North Veronicamouth, OH 34678","881 Jeremy Park Apt. 920, West Michael, GA 13058","0936 Jones Crossing Suite 110, New Christine, ...",,,Black or African American,
3,005Uy00000ZbNcrIAF,8A6717CF-01,"08411 Alexandra Mall, East Samuelview, TN 10076",Little adult term eye knowledge pretty busines...,2025-07-16,2.7,Andrewtownville,Harris County,False,91447.23,...,False,True,"9038 Nicole Stream, Michaelville, WV 27088","5066 Klein Pines, Williamtown, MS 02980","273 Rodgers Burgs, South Paulberg, MT 56410","2424 Dawson Causeway, Lake Lisafurt, CT 95798",,,Asian,
4,005Uy00000ZbNcrIAF,42770ed1,"342 Baker Spring Apt. 661, Davidville, OR 92898",service source just away boy notice.,2025-08-14,2.6,Matthewstad,Maricopa County,False,310979.59,...,True,True,"690 Fernandez Spur Apt. 975, West William, AR ...","256 Melinda Loop Suite 940, Davisview, IA 30295","13073 Mooney Club, North Danielle, NM 92141","4743 Holly Mountain, New Juan, FL 68656",,,Asian,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,005Uy00000ZbNcrIAF,7B7DFA5B,"USCGC Murray, FPO AP 41420",Economy street big.,2025-07-17,4.8,Lauraburgh,King County,False,292577.08,...,False,True,"16791 Spears Ways Apt. 874, Davishaven, NJ 37541","634 Charlene Spur, North Andrewtown, UT 77298","05693 Lee Trafficway, Morrisfurt, MT 10915","862 Vasquez Trace, Jamesshire, VA 63497",,,Asian,
96,005Uy00000ZbNcrIAF,A9FB6BA5,"4234 David Mission, North John, MI 53773",Old tell into those.,2025-07-15,4.9,Port Judithburgh,Bexar County,False,264028.00,...,False,False,"9471 Thomas Meadows, Johntown, HI 05583","724 Hayes Shoal, Jessicafurt, RI 96802","591 Joseph Place, Danieltown, WI 01498","026 Martin Prairie Apt. 423, Lake Cassandra, D...",,,American Indian or Alaska Native,
97,005Uy00000ZbNcrIAF,4844DCD9,"955 David Union Apt. 801, Stevenmouth, CA 11609",Debate notice six leave keep.,2025-08-13,2.5,S0uth Teresashire,Orange County,False,231439.77,...,False,True,"6934 Young Turnpike Suite 505, South Tanya, NE...","650 Anthony Camp, Davidshire, WY 75988","796 Smith Mountains Apt. 215, Feliciaborough, ...","7951 Ho Fork Apt. 879, South Jeffrey, GU 32594",,,Black or African American,
98,005Uy00000ZbNcrIAF,58553CC6,"88670 Scott Plain, Lake Larry, CO 39551",Record water campaign.,2025-07-25,2.0,North Carolhaven,Tarrant County,False,350280.88,...,True,True,"Unit 6164 Box 2033, DPO AP 62541","PSC 0983, Box 8372, APO AA 84255","0402 Brenda Meadow Apt. 213, Derekfurt, ME 42678","53536 Amanda Manor, New Patricia, IA 86381",,,Asian,


In [12]:
import json
import numpy as np
# bulk_insert_data_management.py
from simple_salesforce import Salesforce
import pandas as pd
import os

# --- CONFIG (env vars expected) ---
SF_USERNAME = os.getenv("SF_USERNAME")
SF_PASSWORD = os.getenv("SF_PASSWORD")
SF_TOKEN    = os.getenv("SF_TOKEN")
SF_DOMAIN   = os.getenv("SF_DOMAIN", "test")  # "test" for sandbox, "login" for prod

CSV_PATH    = "/path/to/your/mutated_export.csv"  # <- point to your mutated CSV

# --- CONNECT ---
sf = Salesforce(
    username=SF_USERNAME,
    password=SF_PASSWORD,
    security_token=SF_TOKEN,
    domain=SF_DOMAIN
)
print("✅ Connected")

# --- LOAD DATA ---
df = df_mut.copy()

# # --- (Optional) Keep only fields that are actually creatable in SF ---
# #    This guards against INVALID_FIELD_FOR_INSERT_UPDATE errors.
# dm_desc = sf.Data_Management__c.describe()
# creatable = {f["name"] for f in dm_desc["fields"] if f.get("createable")}
# df = df[[c for c in df.columns if c in creatable]]  # drop anything not creatable

# # --- Drop read-only/system-ish columns if they slipped in ---
# drop_cols = {
#     "Id","OwnerId","IsDeleted","CreatedDate","CreatedById","LastModifiedDate",
#     "LastModifiedById","SystemModstamp","LastActivityDate","LastViewedDate",
#     "LastReferencedDate"
# }
# df = df[[c for c in df.columns if c not in drop_cols]]

# --- Convert NaN -> None so they serialize cleanly ---
# records = pd.DataFrame(df).where(pd.notnull(pd.DataFrame(df)), None).to_dict(orient="records")
records = json.loads(pd.DataFrame(df).replace([np.inf, -np.inf], np.nan)
                     .to_json(orient="records", date_format="iso"))

# --- BULK INSERT (creates new records) ---
#     NOTE: This uses simple_salesforce bulk helper and returns result per row.
#     You can tweak batch_size (max 10,000); 5k is a safe default.
results = sf.bulk.Data_Management__c.insert(records, batch_size=5000)

# --- SUMMARY ---
ok = [r for r in results if r.get("success")]
bad = [r for r in results if not r.get("success")]

print(f"✅ Inserted: {len(ok)}")
print(f"❌ Failed:   {len(bad)}")

# Show first few failures (id will be present for successes)
for i, row in enumerate(bad[:10], 1):
    print(f"[{i}] errors={row.get('errors')}")

# If you want the new Ids for the successful rows:
new_ids = [r["id"] for r in ok if r.get("id")]
print(f"Sample new Ids: {new_ids[:5]}")


✅ Connected
✅ Inserted: 100
❌ Failed:   0
Sample new Ids: ['a1VcW000004Tkd3UAC', 'a1VcW000004Tkd4UAC', 'a1VcW000004Tkd5UAC', 'a1VcW000004Tkd6UAC', 'a1VcW000004Tkd7UAC']
