In [10]:
import os
import pandas as pd
import numpy as np
import random

# Reproducibility: seed both random sources used by this notebook -- NumPy's
# global generator (feature columns) and the stdlib `random` module (names).
np.random.seed(42)
random.seed(42)

# Ensure the output directory exists. The dataset is written to
# "../data/raw/student_engagement.csv", so the directory created here must be
# that same "../data/raw" (previously "data/raw" was created, which is a
# different location and left the real target directory unguarded).
os.makedirs("../data/raw", exist_ok=True)


In [12]:
# Pool of Ethiopian-style first names that student names are sampled from.
ethiopian_names = [
    "Biko", "Jo", "Hayu", "Tolasa", "Abdi", "Abel",
    "Bethelhem", "Tewodros", "Lami", "Firo", "Yonas", "Lemlem",
    "Nahom", "Ruth", "Mahi", "Daniel", "Mulu", "Saron",
    "Haile", "Biruk", "Rediet", "Samuel", "Hermela", "Solomon",
    "Yohannes", "Rahel", "Marta", "Fikir", "Kalkidan", "Mikiyas",
]

# Number of synthetic students; names are drawn with replacement, so
# duplicates are expected.
n = 300
students = random.choices(ethiopian_names, k=n)

# Assemble the feature table. Keyword arguments are evaluated left to right,
# so the NumPy draws happen in the same order as the resulting columns.
# Note: randint's upper bound is exclusive.
data = pd.DataFrame({"student_name": students}).assign(
    avg_class_length=np.random.randint(60, 180, size=n),
    class_start_time=np.random.choice(["morning", "afternoon", "evening"], size=n),
    teacher_feedback_score=np.random.randint(1, 6, size=n),
    num_assignments_due=np.random.randint(0, 5, size=n),
    previous_engagement=np.random.uniform(0.2, 1.0, size=n),
    previous_absences=np.random.randint(0, 10, size=n),
)


In [13]:
# Target variable: engagement level (rule-based).
# A row is labelled 1 only when ALL four conditions hold; otherwise 0:
#   - teacher feedback score of at least 3
#   - previous engagement strictly above 0.6
#   - average class length of at most 120
#   - class does not start in the afternoon
data["engagement_level"] = (
    (data["teacher_feedback_score"] >= 3)
    & (data["previous_engagement"] > 0.6)
    & (data["avg_class_length"] <= 120)
    & (data["class_start_time"] != "afternoon")
).astype(int)

# Save dataset
output_path = "../data/raw/student_engagement.csv"
# to_csv does not create missing parent directories, so make sure the exact
# directory we are about to write into exists (derived from output_path so the
# two can never drift apart).
os.makedirs(os.path.dirname(output_path), exist_ok=True)
data.to_csv(output_path, index=False)

# Status messages (emoji restored from mis-decoded UTF-8 bytes).
print("✅ Synthetic dataset generated successfully!")
print(f"📂 Saved to: {output_path}")
print("\nSample data preview:\n")
print(data.head(10))



✅ Synthetic dataset generated successfully!
📂 Saved to: ../data/raw/student_engagement.csv

Sample data preview:

  student_name  avg_class_length class_start_time  teacher_feedback_score  \
0        Fikir               102          evening                       2   
1         Mulu               135        afternoon                       5   
2        Rahel                60        afternoon                       5   
3        Saron               116          morning                       4   
4         Abdi               123          morning                       1   
5       Tolasa               177        afternoon                       3   
6         Firo               143          morning                       1   
7        Marta                87        afternoon                       4   
8      Solomon               166          evening                       3   
9        Rahel                96          morning                       5   

   num_assignments_due  previous_

In [14]:
# Class balance of the rule-based target: number of rows per label.
engagement_counts = data["engagement_level"].value_counts()
engagement_counts


engagement_level
0    271
1     29
Name: count, dtype: int64