# Notebook 03: Data Normalization & Cleaning

Purpose:
- Normalize time units and activity representations
- Apply minimal cleaning for consistency
- Prepare data for analytics and ML stages

In [1]:
import pandas as pd
import numpy as np

# Load the three raw CSV exports, in order: study habits, enhanced habits /
# performance, and time-management insights.
df_study, df_habits, df_time = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/raw/student_study_habits.csv",
        "../data/raw/enhanced_student_habits_performance_dataset.csv",
        "../data/raw/Time Management and Productivity Insights.csv",
    )
)

In [2]:
# Time normalization: express every duration column in minutes.
MINUTES_PER_HOUR = 60

# Study-habits frame: both hour-named source columns are scaled to minutes.
df_study = df_study.assign(
    study_minutes_per_week=df_study["study_hours_per_week"] * MINUTES_PER_HOUR,
    sleep_minutes_per_day=df_study["sleep_hours_per_day"] * MINUTES_PER_HOUR,
)

# Time-management frame: hour-named columns are scaled; the exercise column
# is already named in minutes, so it is copied under the normalized name.
df_time = df_time.assign(
    work_minutes=df_time["Daily Work Hours"] * MINUTES_PER_HOUR,
    sleep_minutes=df_time["Daily Sleep Hours"] * MINUTES_PER_HOUR,
    leisure_minutes=df_time["Daily Leisure Hours"] * MINUTES_PER_HOUR,
    exercise_minutes=df_time["Daily Exercise Minutes"],
    commute_minutes=df_time["Commute Time (hours)"] * MINUTES_PER_HOUR,
)

In [3]:
# Turn the 0/1 indicator columns into readable activity labels.
# Rows whose indicator is not 1 receive None rather than a label.
has_extracurricular = df_study["extracurricular_Yes"].eq(1)
has_part_time_job = df_study["part_time_job_Yes"].eq(1)

df_study["extracurricular_activity"] = np.where(has_extracurricular, "Extracurricular", None)
df_study["part_time_activity"] = np.where(has_part_time_job, "Part-Time Work", None)

In [4]:
# Give the outcome column of each frame a snake_case name.
df_study = df_study.rename(columns={"final_grade": "exam_score"})
df_time = df_time.rename(columns={"Productivity Score": "productivity_score"})

In [5]:
# Missing-value audit: count nulls per column in each frame.
# A notebook cell only renders its final bare expression, so the first two
# results are passed to display() explicitly; otherwise they would be
# computed and silently discarded. `display` is provided by the IPython
# kernel, so no import is needed inside a notebook.
display(df_study.isna().sum())
display(df_time.isna().sum())
df_habits.isna().sum()

student_id                       0
age                              0
gender                           0
major                            0
study_hours_per_day              0
social_media_hours               0
netflix_hours                    0
part_time_job                    0
attendance_percentage            0
sleep_hours                      0
diet_quality                     0
exercise_frequency               0
parental_education_level         0
internet_quality                 0
mental_health_rating             0
extracurricular_participation    0
previous_gpa                     0
semester                         0
stress_level                     0
dropout_risk                     0
social_activity                  0
screen_time                      0
study_environment                0
access_to_tutoring               0
family_income_range              0
parental_support_level           0
motivation_level                 0
exam_anxiety_score               0
learning_style      

In [6]:
# Sanity-check the derived minute columns with summary statistics.
# Only the last bare expression of a cell is rendered, so the study-habits
# summary is sent to display() explicitly instead of being dropped.
# `display` is provided by the IPython kernel; no import is needed.
display(df_study[["study_minutes_per_week", "sleep_minutes_per_day"]].describe())
df_time[["work_minutes", "sleep_minutes", "leisure_minutes", "exercise_minutes"]].describe()

Unnamed: 0,work_minutes,sleep_minutes,leisure_minutes,exercise_minutes
count,85.0,85.0,85.0,85.0
mean,421.270588,426.705882,255.882353,64.611765
std,91.403786,47.011352,69.440864,27.006654
min,204.0,312.0,102.0,6.0
25%,354.0,396.0,204.0,45.0
50%,426.0,426.0,258.0,65.0
75%,492.0,462.0,306.0,88.0
max,570.0,528.0,384.0,120.0


## Normalization Summary

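- Time-based columns in `df_study` and `df_time` have been converted to minutes; the hour-based columns in `df_habits` (e.g. `study_hours_per_day`, `sleep_hours`) have not been converted yet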
- Activity presence has been expanded into semantic labels
- Outcome columns have consistent naming
- No aggressive cleaning has been applied
- Data is ready for feature engineering