# 01 - Data Ingestion

This notebook handles loading the raw dataset, performing initial cleaning, and saving a processed version.  
It serves as the first step in the ML workflow, producing clean data for downstream notebooks (EDA, feature engineering, modeling, etc.).


In [2]:
# Dependencies: standard library first, then third-party
import warnings
from pathlib import Path

import pandas as pd

# Notebook display: show every column, cap printed rows at 20
pd.options.display.max_columns = None
pd.options.display.max_rows = 20

# Silence all warnings for the rest of the session
warnings.filterwarnings("ignore")


In [3]:
# Single data root so the raw and processed locations cannot drift apart;
# change DATA_DIR to point the whole notebook at a different data folder.
DATA_DIR = Path("../data")

# Raw input (read-only) and the cleaned output written by this notebook
RAW_PATH = DATA_DIR / "raw" / "student_dropout.csv"
PROCESSED_PATH = DATA_DIR / "processed" / "student_dropout_cleaned.csv"




In [4]:
# Read the raw CSV into memory and preview the first five rows
df = pd.read_csv(RAW_PATH)
df.head()

Unnamed: 0,attendance_percent,test_score,discipline_count,parental_involvement,dropout_risk
0,79.93428306022466,76.35482924968028,5.0,2.0,1
1,,51.167520137335615,2.0,5.0,0
2,82.95377076201385,78.0440888015849,3.0,3.0,0
3,100.0,85.33456788207427,5.0,2.0,1
4,65.31693250553329,71.20152354835551,5.0,5.0,1


In [5]:
# Inspect column dtypes and non-null counts.
# Things to look for: feature columns reported as `object` instead of a
# numeric dtype, and non-null counts below the total row count (missing values).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   attendance_percent    570 non-null    object
 1   test_score            570 non-null    object
 2   discipline_count      571 non-null    object
 3   parental_involvement  570 non-null    object
 4   dropout_risk          600 non-null    int64 
dtypes: int64(1), object(4)
memory usage: 23.6+ KB


In [6]:
# Summary statistics for every column. include='all' also reports
# count/unique/top/freq for the object-typed columns, which makes
# non-numeric placeholder tokens visible in the `top` row.
df.describe(include='all')

Unnamed: 0,attendance_percent,test_score,discipline_count,parental_involvement,dropout_risk
count,570.0,570,571.0,570.0,600.0
unique,523.0,552,6.0,6.0,
top,100.0,??,5.0,4.0,
freq,37.0,12,120.0,121.0,
mean,,,,,0.648333
std,,,,,0.477889
min,,,,,0.0
25%,,,,,0.0
50%,,,,,1.0
75%,,,,,1.0


In [7]:
# Feature columns that must be numeric. In the raw file they load as `object`
# because placeholder tokens such as '??' are mixed in with the numbers.
NUMERIC_COLUMNS = [
    "attendance_percent",
    "test_score",
    "discipline_count",
    "parental_involvement",
]

# Clean column names first so the lookups below use the normalised spelling
df.columns = df.columns.str.strip().str.lower()

# Coerce the feature columns to numeric; unparseable tokens become NaN.
# Only columns actually present are touched, so an updated raw file with a
# different schema does not raise a KeyError here.
numeric_present = [col for col in NUMERIC_COLUMNS if col in df.columns]
df = df.assign(
    **{col: pd.to_numeric(df[col], errors="coerce") for col in numeric_present}
)

# Remove duplicates after normalisation, so rows that differ only in how a
# value was spelled in the raw file are treated as the same record.
# Reassigning (instead of inplace=True) keeps this cell safe to re-run.
df = df.drop_duplicates()

# Handle missing values (example for 'age').
# Plain reassignment replaces the chained `df['age'].fillna(..., inplace=True)`,
# which is deprecated and has no effect under pandas Copy-on-Write.
if "age" in df.columns:
    df["age"] = df["age"].fillna(df["age"].median())

# NOTE(review): missing values in the other columns are left as NaN here;
# impute them downstream or extend this step once a strategy is agreed.

# Verify changes
print("Cleaned dataset shape:", df.shape)
display(df.head())


Cleaned dataset shape: (600, 5)


Unnamed: 0,attendance_percent,test_score,discipline_count,parental_involvement,dropout_risk
0,79.93428306022466,76.35482924968028,5.0,2.0,1
1,,51.167520137335615,2.0,5.0,0
2,82.95377076201385,78.0440888015849,3.0,3.0,0
3,100.0,85.33456788207427,5.0,2.0,1
4,65.31693250553329,71.20152354835551,5.0,5.0,1


In [8]:
# Make sure the destination folder exists (no error if it already does)
output_dir = PROCESSED_PATH.parent
output_dir.mkdir(parents=True, exist_ok=True)

# Write the cleaned frame without the index column and report where it went
df.to_csv(PROCESSED_PATH, index=False)
print(f"Processed data saved at: {PROCESSED_PATH}")


Processed data saved at: ..\data\processed\student_dropout_cleaned.csv


# Notes:
- Raw data is never modified; all changes are saved in `data/processed/`.
- This notebook can be rerun if the raw data is updated.
- Later notebooks (EDA, feature engineering) should use this processed data.
