In [13]:
import random
import string
import pandas as pd
from datetime import datetime

# Main configurations

num_students=5000
id_len=8
output_dir=r"C:\PythonCodes\studentDataCreator"

random.seed(42)

# Utilities

def generate_id(existing_ids, length):
    while True:
        sid=''.join(random.choices(string.ascii_uppercase+string.digits, k=length))
        if sid not in existing_ids:
            existing_ids.add(sid)
            return sid

        
def weighted_choice(choices):
    values, weights=zip(*choices)
    return random.choices(values, weights=weights, k=1)[0]

# Functions to generate features

def generate_gender():
    return weighted_choice([('M', 0.49),
                            ('F', 0.49),
                            ('U', 0.02)])

def generate_school_type():
    return weighted_choice([('High', 0.35),
                            ('Medium', 0.45),
                            ('Low', 0.20)])

def parental_involvement():
    return weighted_choice([('High', 0.4),
                            ('Medium', 0.45),
                            ('Low', 0.20)])

def peer_influence():
    return weighted_choice([('Positve', 0.4),
                            ('Neutral', 0.35),
                            ('Negative', 0.35)])

def learning_disability():
    return "Y" if random.random() < 0.12 else "N"

def generate_attendance(ld_flag):
    base=random.uniform(75, 98)
    if ld_flag == 'Y':
        base -= random.uniform(5, 15)
    return round(max(base, 50), 1)

def generate_tutoring_sessions(parental):
    if parental == "High":
        return random.randint(5, 15)
    elif parental == "Medium":
        return random.randint(2, 7)
    else:
        return random.randint(0, 3)

    
def household_income(school_type):
    if school_type == "Private":
        return random.randint(70000, 160000)
    elif school_type == "Public":
        return random.randint(20000, 80000)
    else:
        return random.randint(25000, 60000)

    
def distance_from_home():
    base = random.random()
    if base < 0.6:
        return round(random.uniform(0.5, 5), 2)
    elif base < 0.9:
        return round(random.uniform(5, 15), 2)
    else:
        return round(random.uniform(15, 30), 2)

    
# Generate performance level metrics

def student_grade(attendance, parental, tutoring, peer, ld_flag):
    score = 50

    score += (attendance - 75) * 0.4

    if parental == "High":
        score += 10
    elif parental == "Medium":
        score += 5

    score += tutoring * 0.8

    if peer == "Positive":
        score += 6
    elif peer == "Negative":
        score -= 6

    if ld_flag == "Y":
        score -= 8

    noise = random.uniform(-5, 5)
    score += noise

    return round(min(max(score, 0), 100), 1)

# Create the dataset
student_ids=set()
records=[]

   
for _ in range(num_students):
    student_id=generate_id(student_ids, id_len)
    gender=generate_gender()
    school_type=generate_school_type()
    parental=parental_involvement()
    peer=peer_influence()
    ld_flag=learning_disability()
    
    attendance=generate_attendance(ld_flag)
    tutoring=tutoring_sessions(parental)
    income=household_income(school_type)
    distance=distance_from_home() 
    grade=student_grade(
        attendance,
        parental,
        tutoring,
        peer,
        ld_flag)
    
    records.append([student_id,
                gender,
                attendance,
                ld_flag,
                grade,
                parental,
                tutoring,
                income,
                school_type,
                distance,
                peer])
    columns=['Student_ID',
             'Gender',
             'Attendance_Percentage',
             'Learning_Disability',
             'Final_Grade',
             'Parental_Involvement',
             'Tutoring_Sessions',
             'Household_Income',
             'School_Type',
             'Distance_From_Home_km',
             'Peer_Influence']

df1=pd.DataFrame(records, columns=columns)
df1



Unnamed: 0,Student_ID,Gender,Attendance_Percentage,Learning_Disability,Final_Grade,Parental_Involvement,Tutoring_Sessions,Household_Income,School_Type,Distance_From_Home_km,Peer_Influence
0,XAJI0Y6D,M,68.1,Y,62.7,High,13,52493,High,3.15,Neutral
1,A3ZMF8MD,M,87.3,N,59.1,Medium,5,30164,Low,4.23,Negative
2,5UZBIKCI,M,79.8,N,62.4,Medium,4,29679,High,6.71,Neutral
3,FN9XUY41,M,96.7,N,80.8,High,10,38934,High,8.96,Positve
4,QJIUJV6O,M,66.2,Y,50.0,Medium,7,52666,Low,2.23,Positve
...,...,...,...,...,...,...,...,...,...,...,...
4995,2ODUXJTL,M,89.2,N,61.7,Medium,5,40774,Low,4.10,Neutral
4996,WSODB3TO,F,87.4,N,73.9,High,13,36822,High,4.71,Positve
4997,WWDO1R71,F,89.1,N,53.9,Medium,4,54722,High,2.04,Negative
4998,7IYI8XDU,F,91.3,N,70.5,Medium,7,56375,Medium,1.11,Neutral


In [14]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Student_ID             5000 non-null   object 
 1   Gender                 5000 non-null   object 
 2   Attendance_Percentage  5000 non-null   float64
 3   Learning_Disability    5000 non-null   object 
 4   Final_Grade            5000 non-null   float64
 5   Parental_Involvement   5000 non-null   object 
 6   Tutoring_Sessions      5000 non-null   int64  
 7   Household_Income       5000 non-null   int64  
 8   School_Type            5000 non-null   object 
 9   Distance_From_Home_km  5000 non-null   float64
 10  Peer_Influence         5000 non-null   object 
dtypes: float64(3), int64(2), object(6)
memory usage: 429.8+ KB
