In [94]:
import numpy as np
import pandas as pd
import random

def weighted_choice(choices):
    values, weights = zip(*choices)
    return random.choices(values, weights=weights, k=1)[0]

def generate_synthetic_data(real_df, num_rows=900):
    synthetic_data = []
    columns = real_df.columns.tolist()
    
    for _ in range(num_rows):
        row = []
        for col in columns:
            if real_df[col].dtype == 'object':  # Categorical data (including subject scores)
                value_counts = real_df[col].value_counts(normalize=True)
                choices = [val for val in value_counts.index if val != "Unknown"]  # Remove "Unknown"
                weights = [value_counts[val] for val in choices]  # Keep valid weights
                row.append(weighted_choice(zip(choices, weights)) if choices else "Unknown")  # Fallback

            else:  # Numerical data (if any)
                mean, std = real_df[col].mean(), real_df[col].std()
                row.append(np.random.normal(mean, std))
        
        synthetic_data.append(row)
    
    synthetic_df = pd.DataFrame(synthetic_data, columns=columns)
    return synthetic_df


In [96]:
# Example usage:
real_df = pd.read_csv('../data/processedNoAdd_ML_data.csv')
synthetic_df = generate_synthetic_data(real_df, num_rows=900)

In [97]:
synthetic_df.columns

Index(['Gender', 'Age Group', 'Living Situation', 'Parent Education Level',
       'Family Financial Status', 'Class', 'Department',
       'Performance in English', 'Performance in Maths',
       'Performance in Biology', 'Performance in Physics',
       'Performance in Chemistry', 'Performance in Lit in English',
       'Performance in Government', 'Performance in CRS',
       'Performance in Commerce', 'Performance in Accounting',
       'Performance in Economics', 'Study Hours per Week', 'Extra Tutoring',
       'School Attendance', 'Experienced Bullying', 'Peer Pressure',
       'Parents Attend Parent-Teacher Meeting',
       'Confidence in Academic Ability', 'Access to Counseling',
       'Motivation for Attending School', 'Last Exam Performance',
       'Foundational Knowledge'],
      dtype='object')

In [100]:
# Define subjects with correct names
general_subjects = ["Performance in Maths", "Performance in English"]
science_subjects = ["Performance in Physics", "Performance in Chemistry", "Performance in Biology"]
commercial_subjects = ["Performance in Commerce", "Performance in Economics", "Performance in Accounting"]
humanities_subjects = ["Performance in Lit in English", "Performance in Government", "Performance in CRS"]

# Function to set "Nil" for non-relevant subjects
def set_nil(row):
    if row["Department"] == "Science":
        for subject in commercial_subjects + humanities_subjects:
            row[subject] = "Nil"
    elif row["Department"] == "Commercial":
        for subject in science_subjects + humanities_subjects:
            row[subject] = "Nil"
    elif row["Department"] == "Humanities":
        for subject in science_subjects + commercial_subjects:
            row[subject] = "Nil"
    return row

# Apply function
df_synthetic = df_synthetic.apply(set_nil, axis=1)

# Reorder columns to match original structure
correct_column_order = [
    'Gender', 'Age Group', 'Living Situation', 'Parent Education Level',
    'Family Financial Status', 'Class', 'Department',
    'Performance in English', 'Performance in Maths',
    'Performance in Biology', 'Performance in Physics',
    'Performance in Chemistry', 'Performance in Lit in English',
    'Performance in Government', 'Performance in CRS',
    'Performance in Commerce', 'Performance in Accounting',
    'Performance in Economics', 'Study Hours per Week', 'Extra Tutoring',
    'School Attendance', 'Experienced Bullying', 'Peer Pressure',
    'Parents Attend Parent-Teacher Meeting',
    'Confidence in Academic Ability', 'Access to Counseling',
    'Motivation for Attending School', 'Last Exam Performance',
    'Foundational Knowledge'
]

df_synthetic = df_synthetic[correct_column_order]

# Display the first rows
df_synthetic.head()

Unnamed: 0,Gender,Age Group,Living Situation,Parent Education Level,Family Financial Status,Class,Department,Performance in English,Performance in Maths,Performance in Biology,...,Extra Tutoring,School Attendance,Experienced Bullying,Peer Pressure,Parents Attend Parent-Teacher Meeting,Confidence in Academic Ability,Access to Counseling,Motivation for Attending School,Last Exam Performance,Foundational Knowledge
0,Female,15 - 16,Both parents,Secondary school,We struggle to meet basic needs,SS1,Science,Good,Excellent,Good,...,No,Every day,Never,Occasionally,Regularly,Somewhat confident,No,To learn and gain knowledge,40% - 59%,Average
1,Male,12 - 14,Guardian,Secondary school,We can afford some luxuries,SS3,Science,Good,Good,Unknown,...,Regularly,Every day,Occasionally,Frequently,Never,Somewhat confident,Yes,To learn and gain knowledge,70% - 100%,Strong
2,Female,15 - 16,Both parents,Higher Education,We meet our needs but can't afford luxuries,SS3,Humanities,Good,Good,Nil,...,No,Every day,Never,Never,Sometimes,Somewhat confident,Yes,To prepare for a good career,60% - 69%,Average
3,Female,15 - 16,Both parents,Post-graduate education,We meet our needs but can't afford luxuries,SS3,Humanities,Good,Excellent,Nil,...,No,Every day,Frequently,Never,Never,Somewhat confident,I'm not sure,To prepare for a good career,40% - 59%,Average
4,Female,15 - 16,Both parents,Secondary school,We can afford some luxuries,SS2,Humanities,Good,Fair,Nil,...,Regularly,Every day,Never,Never,Sometimes,Not confident,I'm not sure,To learn and gain knowledge,60% - 69%,Average


In [102]:
df_synthetic.columns

Index(['Gender', 'Age Group', 'Living Situation', 'Parent Education Level',
       'Family Financial Status', 'Class', 'Department',
       'Performance in English', 'Performance in Maths',
       'Performance in Biology', 'Performance in Physics',
       'Performance in Chemistry', 'Performance in Lit in English',
       'Performance in Government', 'Performance in CRS',
       'Performance in Commerce', 'Performance in Accounting',
       'Performance in Economics', 'Study Hours per Week', 'Extra Tutoring',
       'School Attendance', 'Experienced Bullying', 'Peer Pressure',
       'Parents Attend Parent-Teacher Meeting',
       'Confidence in Academic Ability', 'Access to Counseling',
       'Motivation for Attending School', 'Last Exam Performance',
       'Foundational Knowledge'],
      dtype='object')

In [104]:
df_synthetic.to_csv('synthetic_data.csv', index=False)

In [85]:
df_synthetic.head()

Unnamed: 0,Gender,Age Group,Living Situation,Parent Education Level,Family Financial Status,Class,Department,Performance in English,Performance in Maths,Performance in Biology,...,Extra Tutoring,School Attendance,Experienced Bullying,Peer Pressure,Parents Attend Parent-Teacher Meeting,Confidence in Academic Ability,Access to Counseling,Motivation for Attending School,Last Exam Performance,Foundational Knowledge
0,Female,15 - 16,Both parents,Secondary school,We struggle to meet basic needs,SS1,Science,Good,Excellent,Good,...,No,Every day,Never,Occasionally,Regularly,Somewhat confident,No,To learn and gain knowledge,40% - 59%,Average
1,Male,12 - 14,Guardian,Secondary school,We can afford some luxuries,SS3,Science,Good,Good,Unknown,...,Regularly,Every day,Occasionally,Frequently,Never,Somewhat confident,Yes,To learn and gain knowledge,70% - 100%,Strong
2,Female,15 - 16,Both parents,Higher Education,We meet our needs but can't afford luxuries,SS3,Humanities,Good,Good,Nil,...,No,Every day,Never,Never,Sometimes,Somewhat confident,Yes,To prepare for a good career,60% - 69%,Average
3,Female,15 - 16,Both parents,Post-graduate education,We meet our needs but can't afford luxuries,SS3,Humanities,Good,Excellent,Nil,...,No,Every day,Frequently,Never,Never,Somewhat confident,I'm not sure,To prepare for a good career,40% - 59%,Average
4,Female,15 - 16,Both parents,Secondary school,We can afford some luxuries,SS2,Humanities,Good,Fair,Nil,...,Regularly,Every day,Never,Never,Sometimes,Not confident,I'm not sure,To learn and gain knowledge,60% - 69%,Average


In [92]:
df_synthetic.shape

(900, 29)