<a href="https://colab.research.google.com/github/Chuzodepollo/Componente1_EcheverriaAlejandro/blob/master/Act1_Unid2_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from scipy.special import expit

# ----- Config -----
# Number of synthetic student records to generate.
N = 1200
# Seed NumPy's legacy global generator so every np.random.* draw is repeatable.
np.random.seed(42)

def random_choice(options, p=None, size=N):
    """Draw ``size`` values from ``options`` using NumPy's global generator.

    Args:
        options: Candidate values to sample from.
        p: Optional probabilities, one per option; ``None`` leaves the
            weighting to ``np.random.choice``.
        size: Number of draws. The default is the value ``N`` had when this
            function was defined.

    Returns:
        The result of ``np.random.choice`` for these arguments.
    """
    draws = np.random.choice(options, p=p, size=size)
    return draws

def clip01(x):
    """Limit ``x`` to the closed interval [0, 1].

    Values below 0 become 0 and values above 1 become 1; everything in
    between is passed through unchanged by ``np.clip``.
    """
    lower, upper = 0, 1
    return np.clip(x, lower, upper)

# ----- Features -----
# NOTE: every draw below consumes NumPy's global random stream, so the order of
# these statements determines the generated values. Keep the order fixed.
student_id = np.arange(1, N + 1)
age = np.round(np.random.normal(19.5, 2.2, N))
gender = random_choice(["Male", "Female", "Non-binary"], p=[0.48, 0.50, 0.02])
origin_region = random_choice(["Urban", "Rural", "Semi-urban"], p=[0.62, 0.25, 0.13])

# Academic scores: normal draws limited to each scale's valid range.
hs_gpa = np.random.normal(3.4, 0.4, N).clip(0, 4)
admission_exam = np.random.normal(68, 12, N).clip(0, 100)
sem1_gpa = np.random.normal(3.1, 0.6, N).clip(0, 4)

# Financial attributes.
socioeconomic_level = random_choice(["Low", "Medium", "High"], p=[0.35, 0.5, 0.15])
scholarship = random_choice(["Yes", "No"], p=[0.38, 0.62])
loan = random_choice(["Yes", "No"], p=[0.28, 0.72])
# A gamma amount is drawn for every row but kept only where scholarship == "Yes".
aid_amount = np.round(
    np.where(scholarship == "Yes", np.random.gamma(2.0, 900, N), 0),
    2,
)

# Engagement attributes.
attendance_rate = clip01(np.random.beta(4, 1.8, N))
hours_studied_per_week = np.clip(np.random.normal(12, 5, N), 0, 45).round(1)

# Assemble the table; numeric columns are rounded here, at storage time.
df = pd.DataFrame(
    {
        "student_id": student_id,
        "age": age.astype(int),
        "gender": gender,
        "origin_region": origin_region,
        "hs_gpa": hs_gpa.round(2),
        "admission_exam": admission_exam.round(1),
        "sem1_gpa": sem1_gpa.round(2),
        "socioeconomic_level": socioeconomic_level,
        "scholarship": scholarship,
        "loan": loan,
        "aid_amount": aid_amount,
        "attendance_rate": attendance_rate.round(3),
        "hours_studied_per_week": hours_studied_per_week,
    }
)

# ----- Dropout via logistic prob -----
# Additive log-odds offsets per category: positive values raise the dropout
# probability, negative values lower it.
# Low SES must raise the risk (the README written by this script says "low SES
# increase probability"), so its offsets are positive relative to the "High"
# baseline. The previous negative offsets inverted that relationship.
ses_map = {"Low": 0.9, "Medium": 0.4, "High": 0.0}
gender_map = {"Male": 0.05, "Female": -0.05, "Non-binary": 0.10}
region_map = {"Rural": 0.2, "Semi-urban": 0.1, "Urban": 0.0}
sch_map = {"Yes": -0.25, "No": 0.0}
loan_map = {"Yes": -0.05, "No": 0.0}

# Linear predictor (log-odds of dropping out). Each numeric term is centred on a
# reference value where it contributes zero; the negative coefficients mean that
# higher GPA, attendance and study hours reduce the log-odds.
z = (
    -1.0
    - 0.9 * (df["hs_gpa"] - 2.5)
    - 1.4 * (df["sem1_gpa"] - 2.4)
    - 2.0 * (df["attendance_rate"] - 0.7)
    - 0.02 * (df["hours_studied_per_week"] - 10)
    # fillna(0) makes any category absent from a map contribute nothing.
    + df["socioeconomic_level"].map(ses_map).fillna(0)
    + df["gender"].map(gender_map).fillna(0)
    + df["origin_region"].map(region_map).fillna(0)
    + df["scholarship"].map(sch_map).fillna(0)
    + df["loan"].map(loan_map).fillna(0)
)
# expit is the logistic sigmoid: it maps log-odds to a probability in (0, 1).
p_dropout = expit(z)
# One Bernoulli draw per student: label is 1 when the uniform draw falls below p.
df["dropout"] = (np.random.rand(N) < p_dropout).astype(int)

# ----- Missing values -----
# Dedicated generator so the missingness pattern has its own seed (123),
# separate from NumPy's global np.random stream.
rng = np.random.default_rng(123)


def inject_missing(series, frac):
    """Return a copy of ``series`` with roughly ``frac`` of its entries missing.

    Each entry is blanked independently with probability ``frac``, so the
    realised share of missing values varies around ``frac``. Draws come from
    the module-level ``rng``, so successive calls advance the same stream.

    Args:
        series: pandas Series to copy; it is not modified.
        frac: Per-entry probability of being replaced with NaN.

    Returns:
        A new Series with the selected entries set to NaN. Integer data is
        upcast so that it can hold NaN.
    """
    mask = rng.random(len(series)) < frac
    # Series.mask builds a new object and handles the dtype upcast itself,
    # whereas assigning NaN in place into an integer Series warns or fails.
    return series.mask(mask)

# Per-column probability of an entry going missing. The order matters:
# inject_missing draws from one shared generator, so reordering the columns
# would change which rows end up missing.
missing_rates = [
    ("hs_gpa", 0.06),
    ("admission_exam", 0.05),
    ("sem1_gpa", 0.07),
    ("attendance_rate", 0.04),
    ("hours_studied_per_week", 0.03),
    ("socioeconomic_level", 0.02),
]
for column, rate in missing_rates:
    df[column] = inject_missing(df[column], rate)

# ----- Outliers -----
n_out = 20
# Pick n_out distinct rows and split them into four consecutive groups of five,
# one group per outlier type.
idx = np.random.choice(df.index, size=n_out, replace=False)
age_rows, exam_rows, hours_rows, gpa_rows = np.split(idx, [5, 10, 15])

# Ages: shift upward by 10-24 years.
age_shift = np.random.randint(10, 25, size=5)
df.loc[age_rows, "age"] = df.loc[age_rows, "age"] + age_shift

# Admission exam: shift upward by 50-119 points. A row whose score is already
# NaN stays NaN, because NaN plus a number is NaN.
exam_shift = np.random.randint(50, 120, size=5)
df.loc[exam_rows, "admission_exam"] = df.loc[exam_rows, "admission_exam"] + exam_shift

# Study hours: overwrite with negative values.
df.loc[hours_rows, "hours_studied_per_week"] = np.negative(np.abs(np.random.normal(10, 3, 5)))

# First-semester GPA: overwrite with values above the 4.0 ceiling.
df.loc[gpa_rows, "sem1_gpa"] = np.random.uniform(0.1, 1.5, size=5) + 4

# ----- Save -----
out_dir = Path(".")
csv_path = out_dir / "student_dropout_synthetic.csv"
# Build the README path from out_dir as well, so changing out_dir relocates
# both files together (the README used to be written to a bare filename).
readme_path = out_dir / "README_student_dropout_synthetic.md"
df.to_csv(csv_path, index=False)

# Data dictionary written next to the CSV. {N} is the only interpolated value.
# The `hs_gpa` entry says "no outliers injected" because the outlier step only
# touches age, admission_exam, hours_studied_per_week and sem1_gpa.
readme = f"""
# Synthetic Student Dropout Dataset

This dataset simulates a university dropout scenario for a supervised learning task (binary classification).
**Rows:** {N} — **Target:** `dropout` (1=student drops out, 0=remains).

## Variables
- `student_id` (int): unique identifier.
- `age` (int): years; mostly 17–25 (outliers injected).
- `gender` (cat): Male / Female / Non-binary.
- `origin_region` (cat): Urban / Rural / Semi-urban.
- `hs_gpa` (float): high-school GPA on 0–4 scale (has missing values; no outliers injected).
- `admission_exam` (float): admission test score 0–100 (missing + outliers >100).
- `sem1_gpa` (float): first-semester GPA, 0–4 (missing + outliers >4).
- `socioeconomic_level` (cat): Low / Medium / High (has missing).
- `scholarship` (cat): Yes/No.
- `loan` (cat): Yes/No.
- `aid_amount` (float): scholarship amount in currency units (0 if no scholarship).
- `attendance_rate` (float): class attendance fraction in [0,1] (missing injected).
- `hours_studied_per_week` (float): typical weekly study hours (missing + negative outliers).
- `dropout` (int): 1 if student drops out, 0 otherwise; generated from a logistic function of predictors.

## Label Generation
Dropout probability is computed via a logistic model combining academic, financial, and engagement factors.
Lower GPA/attendance and low SES increase probability; scholarships/loans slightly reduce it.

## Missing Values
Missingness was injected at column-level rates:
- `hs_gpa` ~6%, `admission_exam` ~5%, `sem1_gpa` ~7%, `attendance_rate` ~4%, `hours_studied_per_week` ~3%, `socioeconomic_level` ~2%.

## Outliers
Injected deliberately to force cleaning steps:
- Ages shifted upward by +10 to +24 years for a small subset.
- Admission exam scores boosted beyond 100.
- Negative values in `hours_studied_per_week`.
- `sem1_gpa` values > 4.

## Suggested Tasks
- EDA: distributions, missingness map, outlier detection (IQR/Z-score).
- Preprocessing: impute missing values (median/mode), cap or remove outliers, encode categoricals.
- Modeling: train/test split, baseline logistic regression / tree-based models, evaluate with accuracy, ROC-AUC, F1.

Generated with Python (NumPy/Pandas), seed=42 for reproducibility.
"""
readme_path.write_text(readme, encoding="utf-8")

print("Files saved:", csv_path, "|", readme_path)


Files saved: student_dropout_synthetic.csv | README_student_dropout_synthetic.md
