In [6]:
# ============================================================
# 1. APPLICANT INPUTS — Synthetic Data Generation (Enhanced)
# ============================================================

import pandas as pd
import numpy as np
np.random.seed(42)

# ------------------------------------------------------------
#  Basic setup
# ------------------------------------------------------------
n = 500
countries = ["Syria","Afghanistan","Sudan","Myanmar","Eritrea","Venezuela","Iraq","Somalia"]
genders = ["Male","Female","Non-binary"]
education_levels = ["None","Primary","Secondary","Tertiary"]
language_levels = ["None","Basic","Intermediate","Advanced","Fluent"]
persecution_grounds = ["race","religion","nationality","political_opinion","social_group"]
persecution_types = ["violence","detention","threats","sexual_violence","discrimination"]

# ------------------------------------------------------------
#  Applicant demographics and legal claim basics
# ------------------------------------------------------------
df = pd.DataFrame({
    "id": range(1, n+1),
    "country_of_origin": np.random.choice(countries, n),
    "gender": np.random.choice(genders, n),
    "age": np.random.randint(18, 65, n),
    "education_level": np.random.choice(education_levels, n, p=[0.1,0.3,0.4,0.2]),
    "language_proficiency": np.random.choice(language_levels, n, p=[0.05,0.25,0.4,0.2,0.1]),
    "family_size": np.random.randint(1, 7, n),
    "prior_camp_years": np.random.randint(0, 10, n),
    "persecution_ground": np.random.choice(persecution_grounds, n),
    "persecution_type": np.random.choice(persecution_types, n),
})

df["nexus_established"] = np.random.choice([True, False], n, p=[0.7, 0.3])
df["state_protection_score"] = np.clip(np.random.normal(0.3, 0.15, n), 0, 1)
df["internal_relocation_possible"] = np.random.choice([True, False], n, p=[0.4, 0.6])

# ------------------------------------------------------------
#  Credibility score — depends on language & education
# ------------------------------------------------------------
base_cred = np.random.normal(0.7, 0.15, n)
language_map = {"None": -0.2, "Basic": -0.1, "Intermediate": 0, "Advanced": +0.05, "Fluent": +0.1}
edu_map = {"None": -0.1, "Primary": 0, "Secondary": +0.05, "Tertiary": +0.1}

lang_effect = df["language_proficiency"].map(language_map)
edu_effect = df["education_level"].map(edu_map)

df["credibility_score"] = np.clip(base_cred + lang_effect + edu_effect, 0, 1)

# ------------------------------------------------------------
#  Risk score — depends on country, gender, persecution type
# ------------------------------------------------------------
risk_means = {
    "Syria": 0.85, "Afghanistan": 0.8, "Sudan": 0.75, "Myanmar": 0.7,
    "Eritrea": 0.7, "Venezuela": 0.6, "Iraq": 0.65, "Somalia": 0.8
}
ptype_map = {
    "violence": +0.1, "detention": +0.05, "threats": 0,
    "sexual_violence": +0.15, "discrimination": -0.05
}
gender_map = {"Male": 0, "Female": +0.1, "Non-binary": +0.08}

base_risk = df["country_of_origin"].map(risk_means)
ptype_effect = df["persecution_type"].map(ptype_map)
gender_effect = df["gender"].map(gender_map)

df["risk_score"] = np.clip(base_risk + ptype_effect + gender_effect + np.random.normal(0, 0.05, n), 0, 1)

# ------------------------------------------------------------
#  Integration score — success potential if resettled
# ------------------------------------------------------------
df["integration_score"] = np.clip(
    0.4 * df["credibility_score"]
    + 0.2 * (1 - abs(df["age"] - 35) / 35)
    + 0.4 * np.random.random(n),
    0, 1
)

df.head()


Unnamed: 0,id,country_of_origin,gender,age,education_level,language_proficiency,family_size,prior_camp_years,persecution_ground,persecution_type,nexus_established,state_protection_score,internal_relocation_possible,credibility_score,risk_score,integration_score
0,1,Iraq,Non-binary,27,Primary,Basic,4,2,race,violence,True,0.547422,True,0.846507,0.721516,0.590138
1,2,Myanmar,Male,34,Tertiary,Basic,4,1,social_group,discrimination,True,0.460592,False,0.754097,0.590291,0.793825
2,3,Eritrea,Male,37,Primary,Intermediate,2,4,political_opinion,sexual_violence,False,0.19056,False,0.570476,0.802061,0.484056
3,4,Iraq,Non-binary,41,Tertiary,Basic,6,4,nationality,threats,True,0.354213,False,0.695319,0.662375,0.519816
4,5,Sudan,Male,22,Secondary,Basic,3,4,nationality,discrimination,True,0.106029,True,0.652703,0.620821,0.57108


In [7]:
# ============================================================
# 2. SYSTEM PROCESS — Simulate AI & Human Decision Pipeline
# ============================================================

import numpy as np

# -------------------------------
# AI Decision Logic
# -------------------------------
# The AI makes a decision based on risk_score and credibility_score thresholds.
# If both are relatively high → approve; otherwise → deny.

df["AI_decision"] = np.where(
    (df["risk_score"] > 0.5) & (df["credibility_score"] > 0.6),
    "approve",
    "deny"
)

# -------------------------------
# Human-in-the-loop Overrides
# -------------------------------
# 10% of all cases get reviewed by a human officer.
# Among those, about half result in an override (flipping the AI decision).

n = len(df)
df["human_override"] = False
reviewed_cases = np.random.choice(df.index, size=int(0.10 * n), replace=False)
df.loc[reviewed_cases, "human_override"] = True

# When an override happens, flip the decision
flip_cases = np.random.choice(df.index, size=int(0.05 * n), replace=False)
df.loc[flip_cases, "AI_decision"] = np.where(
    df.loc[flip_cases, "AI_decision"] == "approve", "deny", "approve"
)

# -------------------------------
# Processing Time Simulation
# -------------------------------
# Faster for automated approvals, slower if human override involved.
base_time = np.random.randint(30, 120, n)  # baseline (days)
df["processing_time_days"] = base_time + df["human_override"].apply(lambda x: np.random.randint(20, 60) if x else 0)

# Preview to check
df[["id", "AI_decision", "human_override", "processing_time_days"]].head(10)


Unnamed: 0,id,AI_decision,human_override,processing_time_days
0,1,approve,False,108
1,2,approve,True,113
2,3,deny,False,33
3,4,approve,False,63
4,5,approve,True,133
5,6,approve,True,99
6,7,deny,False,98
7,8,approve,False,67
8,9,approve,False,55
9,10,approve,False,30


In [4]:
# ============================================================
# 3. OUTCOMES — Generate Final Decisions & Appeal Outcomes
# ============================================================

import numpy as np

# -------------------------------
# Final Decision (post human review)
# -------------------------------
# Start from the AI decision but account for human_override corrections
df["final_decision"] = df["AI_decision"]

# If a case was reviewed by a human, small chance of change (flip)
for i in df.index:
    if df.loc[i, "human_override"]:
        if np.random.rand() < 0.5:  # 50% of overrides flip the outcome
            df.loc[i, "final_decision"] = (
                "approve" if df.loc[i, "AI_decision"] == "deny" else "deny"
            )

# -------------------------------
# Appeal Process
# -------------------------------
# About 30% of denied cases go to appeal.
appeal_prob = 0.30
df["appealed"] = np.where(
    (df["final_decision"] == "deny") & (np.random.rand(len(df)) < appeal_prob),
    True,
    False
)

# Appeal outcome probabilities:
#  - 60% of appeals are upheld (denial confirmed)
#  - 40% are overturned (refugee recognized)
appeal_outcomes = []
for appealed, decision in zip(df["appealed"], df["final_decision"]):
    if not appealed:
        appeal_outcomes.append("N/A")
    else:
        if np.random.rand() < 0.4:
            # Overturned: grant refugee status
            appeal_outcomes.append("overturned")
        else:
            appeal_outcomes.append("upheld")
df["appeal_outcome"] = appeal_outcomes

# -------------------------------
# Optional: Bias flag for audits
# -------------------------------
# Randomly tag 10% of cases as "review for bias"
bias_labels = ["none", "moderate", "severe"]
bias_probs = [0.7, 0.2, 0.1]
df["bias_flag"] = np.random.choice(bias_labels, len(df), p=bias_probs)

# Preview
df[["id", "final_decision", "appealed", "appeal_outcome", "bias_flag"]].head(10)


Unnamed: 0,id,final_decision,appealed,appeal_outcome,bias_flag
0,1,approve,False,,none
1,2,deny,False,,none
2,3,deny,False,,none
3,4,approve,False,,severe
4,5,deny,True,upheld,none
5,6,approve,False,,moderate
6,7,deny,True,overturned,none
7,8,approve,False,,none
8,9,approve,False,,severe
9,10,deny,False,,none


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 19 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            500 non-null    int64  
 1   country_of_origin             500 non-null    object 
 2   gender                        500 non-null    object 
 3   age                           500 non-null    int64  
 4   education_level               500 non-null    object 
 5   language_proficiency          500 non-null    object 
 6   family_size                   500 non-null    int64  
 7   prior_camp_years              500 non-null    int64  
 8   persecution_ground            500 non-null    object 
 9   persecution_type              500 non-null    object 
 10  nexus_established             500 non-null    bool   
 11  state_protection_score        500 non-null    float64
 12  internal_relocation_possible  500 non-null    bool   
 13  credi

In [9]:
# Save inside Colab
df.to_csv("synthetic_RSD_full_dataset.csv", index=False)

In [10]:
from google.colab import files
files.download("synthetic_RSD_full_dataset.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>