In [14]:
import polars as pl
from pathlib import Path
import zipfile 

# Preprocessing pipeline: unpack the dataset archive, load the CSV, round the
# Float64 columns to integers, add an ordinal encoding of
# `parental_education_level`, and write the result as a ";"-separated CSV.

source_path = Path("../Datasets/student_habits_vs_academic_performance.zip")
target_path = Path("../Datasets/student_habits_vs_academic_performance")
with zipfile.ZipFile(source_path, "r") as zip_ref:
    zip_ref.extractall(target_path)

# Input and output files both live in the extracted folder, so derive them
# from `target_path` instead of repeating the directory literal.
file_path = target_path / "student_habits_performance.csv"
df = pl.read_csv(file_path)

# Round every Float64 column to the nearest whole number and store it as Int64.
# NOTE(review): this discards the fractional part of columns such as
# study_hours_per_day and exam_score — confirm the precision loss is intended.
numerical_columns = [col for col, dtype in df.schema.items() if dtype == pl.Float64]
df = df.with_columns(
    [pl.col(col).round(0).cast(pl.Int64) for col in numerical_columns]
)

# Ordinal ranking of the education levels, lowest to highest. "None" is a
# literal string category in the CSV, not a missing value.
education_order = {
    "None": 0,
    "High School": 1,
    "Bachelor": 2,
    "Master": 3
}

# `replace_strict` raises if a level is missing from the mapping, so an
# unexpected category fails loudly instead of silently becoming null.
# The explicit return dtype keeps the encoded column Int64 regardless of
# how the mapping values would otherwise be inferred.
df = df.with_columns(
    pl.col("parental_education_level")
    .replace_strict(education_order, return_dtype=pl.Int64)
    .alias("parental_education_level_order")
)

df.write_csv(target_path / "student_habits_performance_preprocess_v1.csv", separator=";")


In [15]:
# Display the frame's column names in order; after preprocessing this includes
# the derived `parental_education_level_order` column.
df.columns

['student_id',
 'age',
 'gender',
 'study_hours_per_day',
 'social_media_hours',
 'netflix_hours',
 'part_time_job',
 'attendance_percentage',
 'sleep_hours',
 'diet_quality',
 'exercise_frequency',
 'parental_education_level',
 'internet_quality',
 'mental_health_rating',
 'extracurricular_participation',
 'exam_score',
 'parental_education_level_order']

In [12]:
# Frequency of each parental education category (row count per distinct value).
df.get_column("parental_education_level").value_counts()

parental_education_level,count
str,u32
"""Master""",167
"""High School""",392
"""None""",91
"""Bachelor""",350


In [3]:
# Number of distinct values in every column, returned as a one-row frame.
distinct_counts = pl.all().n_unique()
df.select(distinct_counts)

student_id,age,gender,study_hours_per_day,social_media_hours,netflix_hours,part_time_job,attendance_percentage,sleep_hours,diet_quality,exercise_frequency,parental_education_level,internet_quality,mental_health_rating,extracurricular_participation,exam_score
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
1000,8,3,78,60,51,2,320,68,3,7,4,3,10,2,480


In [4]:
# Number of null entries in every column, returned as a one-row frame.
df.select(pl.all().null_count())

student_id,age,gender,study_hours_per_day,social_media_hours,netflix_hours,part_time_job,attendance_percentage,sleep_hours,diet_quality,exercise_frequency,parental_education_level,internet_quality,mental_health_rating,extracurricular_participation,exam_score
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [5]:
# Show rows that duplicate another row in every column except the identifier.
# `student_id` has as many distinct values as there are rows, so a whole-row
# comparison can never match; dropping it from the comparison makes the check
# meaningful. The filter still returns the full rows, including `student_id`.
df.filter(df.drop("student_id").is_duplicated())

student_id,age,gender,study_hours_per_day,social_media_hours,netflix_hours,part_time_job,attendance_percentage,sleep_hours,diet_quality,exercise_frequency,parental_education_level,internet_quality,mental_health_rating,extracurricular_participation,exam_score
str,i64,str,f64,f64,f64,str,f64,f64,str,i64,str,str,i64,str,f64
