In [140]:
import numpy as np
import pandas as pd
import random

def weighted_choice(choices):
    """Draw one value at random from ``(value, weight)`` pairs.

    Args:
        choices: Iterable of ``(value, weight)`` tuples. The weights are
            handed to ``random.choices`` unchanged, so they are relative and
            need not sum to 1.

    Returns:
        The single value selected by ``random.choices``.

    Raises:
        ValueError: If ``choices`` is empty (there is nothing to unpack).
    """
    # Transpose the pairs into a tuple of values and a parallel tuple of weights.
    options, relative_weights = zip(*choices)
    # random.choices returns a list of k items (k defaults to 1); unpack the only one.
    (picked,) = random.choices(options, weights=relative_weights)
    return picked

# Seed the global `random` generator so that Restart & Run All regenerates
# exactly the same synthetic dataset (the draws below all go through `random`).
SEED = 42
random.seed(SEED)

num_rows = 900  # Number of synthetic student records to generate

# Column labels, in the same order in which each row's values are appended below.
columns = [
    "Gender", "Age Group", "Living Situation", "Parent Education", "Financial Status", "Class Level", "Department",
    "Performance English", "Performance Maths", "Performance Biology", "Performance Physics", "Performance Chemistry",
    "Performance Literature", "Performance Government", "Performance CRS", "Performance Commerce", "Performance Accounting",
    "Performance Economics", "Study Hours", "Extra Tutoring", "School Attendance", "Experienced Bullying", "Peer Pressure",
    "Parents Meeting", "Confidence", "Counseling", "Motivation", "Last Exam", "Foundational Knowledge"
]

# Grades every subject is drawn from. Loop-invariant, so defined once here
# instead of being rebuilt on every iteration.
performance_levels = ["Excellent", "Good", "Fair", "Poor"]

# All weights below are relative: random.choices does not require them to sum to 1.
data = []
for _ in range(num_rows):
    gender = weighted_choice([("Male", 0.52), ("Female", 0.48)])

    # Class level is drawn first because the age-group weights depend on it.
    class_level = weighted_choice([("SS1", 0.35), ("SS2", 0.4), ("SS3", 0.25)])
    # NOTE(review): for SS1 the weights come out as 0.3 / 0.2 / 0.3, so an SS1
    # student is as likely to be "17 - 18" as "12 - 14" -- confirm this is intended.
    age_group = weighted_choice([
        ("12 - 14", 0.3 if class_level == "SS1" else 0.1),
        ("15 - 16", 0.5 if class_level == "SS2" else 0.2),
        ("17 - 18", 0.6 if class_level == "SS3" else 0.3)
    ])

    living_situation = weighted_choice([("Both parents", 0.65), ("Single parent", 0.25), ("Guardian", 0.1)])
    parent_edu = weighted_choice([
        ("Post-graduate", 0.2), ("Higher Education", 0.35),
        ("Secondary", 0.3), ("Primary", 0.1), ("None", 0.05)
    ])

    # Financial status is weighted by the parent's education level drawn above.
    financial_status = weighted_choice([
        ("Comfortable", 0.4 if parent_edu in ["Post-graduate", "Higher Education"] else 0.1),
        ("Basic needs met", 0.5 if parent_edu in ["Secondary", "Primary"] else 0.4),
        ("Struggle", 0.5 if parent_edu in ["Primary", "None"] else 0.2)
    ])

    department = weighted_choice([("Science", 0.45), ("Commercial", 0.3), ("Humanities", 0.25)])

    # General subjects: "Excellent"/"Good" carry a higher weight than "Fair"/"Poor".
    perf_english = weighted_choice([(p, 0.4 if p in ["Excellent", "Good"] else 0.2) for p in performance_levels])
    perf_maths = weighted_choice([(p, 0.35 if p in ["Excellent", "Good"] else 0.15) for p in performance_levels])

    # NOTE(review): in every department-specific draw below, the weight depends
    # only on `department`, not on the grade `p`. All four grades therefore get
    # the same weight and the draw is uniform regardless of department. The
    # behaviour is kept as-is; if a department-dependent skew was intended, the
    # weight must also vary with `p`.
    perf_bio = weighted_choice([(p, 0.4 if department == "Science" else 0.2) for p in performance_levels])
    perf_physics = weighted_choice([(p, 0.4 if department == "Science" else 0.15) for p in performance_levels])
    perf_chemistry = weighted_choice([(p, 0.4 if department == "Science" else 0.15) for p in performance_levels])

    perf_lit = weighted_choice([(p, 0.5 if department == "Humanities" else 0.2) for p in performance_levels])
    perf_govt = weighted_choice([(p, 0.5 if department == "Humanities" else 0.2) for p in performance_levels])
    perf_crs = weighted_choice([(p, 0.5 if department == "Humanities" else 0.2) for p in performance_levels])

    perf_commerce = weighted_choice([(p, 0.5 if department == "Commercial" else 0.2) for p in performance_levels])
    perf_accounting = weighted_choice([(p, 0.5 if department == "Commercial" else 0.2) for p in performance_levels])
    perf_economics = weighted_choice([(p, 0.5 if department == "Commercial" else 0.2) for p in performance_levels])

    # Remaining attributes are drawn independently of everything above.
    study_hours = weighted_choice([("<5 hrs", 0.4), ("5-10 hrs", 0.35), (">10 hrs", 0.25)])
    extra_tutoring = weighted_choice([("No", 0.5), ("Occasionally", 0.3), ("Regularly", 0.2)])
    school_attendance = weighted_choice([("Every day", 0.65), ("Most days", 0.25), ("Occasionally", 0.1)])
    bullying = weighted_choice([("Frequently", 0.2), ("Occasionally", 0.5), ("Never", 0.3)])
    peer_pressure = weighted_choice([("Frequently", 0.3), ("Occasionally", 0.5), ("Never", 0.2)])
    parents_meeting = weighted_choice([("Regularly", 0.5), ("Sometimes", 0.3), ("Never", 0.2)])
    confidence = weighted_choice([("High", 0.4), ("Moderate", 0.45), ("Low", 0.15)])
    counseling = weighted_choice([("Yes", 0.4), ("No", 0.4), ("Not sure", 0.2)])
    motivation = weighted_choice([("Knowledge", 0.4), ("Career", 0.35), ("Family", 0.2), ("Boredom", 0.05)])
    last_exam = weighted_choice([("70-100%", 0.3), ("60-69%", 0.4), ("40-59%", 0.2), ("<40%", 0.1)])
    foundational_knowledge = weighted_choice([("Strong", 0.3), ("Average", 0.5), ("Weak", 0.2)])

    # Values must stay in the same order as `columns`.
    data.append([gender, age_group, living_situation, parent_edu, financial_status, class_level, department,
                 perf_english, perf_maths, perf_bio, perf_physics, perf_chemistry, perf_lit, perf_govt, perf_crs,
                 perf_commerce, perf_accounting, perf_economics, study_hours, extra_tutoring, school_attendance,
                 bullying, peer_pressure, parents_meeting, confidence, counseling, motivation, last_exam, foundational_knowledge])

# Build the frame once from the accumulated rows, then preview it.
df_synthetic = pd.DataFrame(data, columns=columns)
df_synthetic.head()

Synthetic dataset generated and saved as synthetic_student_data.csv


In [141]:
# Show the column labels of df_synthetic.
print(df_synthetic.columns)

Index(['Gender', 'Age Group', 'Living Situation', 'Parent Education',
       'Financial Status', 'Class Level', 'Department', 'Performance English',
       'Performance Maths', 'Performance Biology', 'Performance Physics',
       'Performance Chemistry', 'Performance Literature',
       'Performance Government', 'Performance CRS', 'Performance Commerce',
       'Performance Accounting', 'Performance Economics', 'Study Hours',
       'Extra Tutoring', 'School Attendance', 'Experienced Bullying',
       'Peer Pressure', 'Parents Meeting', 'Confidence', 'Counseling',
       'Motivation', 'Last Exam', 'Foundational Knowledge'],
      dtype='object')


In [144]:
# Subject columns grouped by department. Every name carries the "Performance "
# prefix so it matches the DataFrame column labels exactly.
general_subjects = ["Performance Maths", "Performance English"]  # never blanked by set_nil
science_subjects = ["Performance Physics", "Performance Chemistry", "Performance Biology"]
commercial_subjects = ["Performance Commerce", "Performance Economics", "Performance Accounting"]
humanities_subjects = ["Performance Literature", "Performance Government", "Performance CRS"]

# Department name -> the subject columns that department keeps.
_department_subjects = {
    "Science": science_subjects,
    "Commercial": commercial_subjects,
    "Humanities": humanities_subjects,
}


def set_nil(row):
    """Return a copy of ``row`` with other departments' subjects set to "Nil".

    The input is left untouched: pandas documents that functions passed to
    ``DataFrame.apply`` must not mutate the object they receive, so all writes
    go to a copy.

    Args:
        row: A mapping-like record (e.g. one DataFrame row as a Series) with a
            "Department" entry and the "Performance ..." subject entries.

    Returns:
        A copy of ``row``. If "Department" is "Science", "Commercial" or
        "Humanities", the subjects belonging to the other two departments are
        set to "Nil"; for any other department value the copy is unchanged.
    """
    updated = row.copy()
    department = row["Department"]

    # Unrecognised departments are passed through unchanged.
    if department not in _department_subjects:
        return updated

    for owner, subjects in _department_subjects.items():
        if owner == department:
            continue  # keep the student's own department subjects
        for subject in subjects:
            updated[subject] = "Nil"
    return updated

# Run set_nil on every row so subjects outside the student's department become "Nil"
df_synthetic = df_synthetic.apply(set_nil, axis="columns")

# Preview the first rows of the updated DataFrame
df_synthetic.head()

Unnamed: 0,Gender,Age Group,Living Situation,Parent Education,Financial Status,Class Level,Department,Performance English,Performance Maths,Performance Biology,...,Extra Tutoring,School Attendance,Experienced Bullying,Peer Pressure,Parents Meeting,Confidence,Counseling,Motivation,Last Exam,Foundational Knowledge
0,Female,12 - 14,Both parents,Post-graduate,Basic needs met,SS2,Humanities,Excellent,Excellent,Nil,...,Regularly,Most days,Occasionally,Never,Regularly,Moderate,Not sure,Family,70-100%,Average
1,Female,17 - 18,Single parent,Higher Education,Basic needs met,SS3,Science,Excellent,Excellent,Poor,...,No,Most days,Never,Occasionally,Sometimes,Low,No,Career,40-59%,Average
2,Female,17 - 18,Single parent,Higher Education,Basic needs met,SS1,Humanities,Good,Good,Nil,...,No,Every day,Never,Occasionally,Never,High,Not sure,Knowledge,60-69%,Strong
3,Female,15 - 16,Both parents,Secondary,Basic needs met,SS2,Science,Fair,Good,Excellent,...,Regularly,Most days,Occasionally,Occasionally,Regularly,High,No,Career,60-69%,Average
4,Female,15 - 16,Both parents,Primary,Struggle,SS3,Commercial,Poor,Fair,Nil,...,Occasionally,Every day,Never,Frequently,Sometimes,Moderate,Yes,Family,40-59%,Average


In [136]:
# Dimensions of df_synthetic as (rows, columns).
df_synthetic.shape

(900, 29)

In [138]:
# Save the dataset as a CSV file (index=False leaves the row index out of the file)
file_name = '../data/add.csv'
df_synthetic.to_csv(file_name, index=False)

# Report where the file was written
print(f"Data has been saved at {file_name}")

Data has been saved at ../data/add.csv
