
# Understanding the Data

This notebook loads the dataset `Student_performance_data.csv` and takes a first look at its structure: shape, column names, data types, missing values, summary statistics, and the value counts of the categorical columns.

## 1. Load Libraries and Data


In [1]:

import pandas as pd

# Read the student performance CSV (path is relative to the notebook's
# working directory) and preview the first rows to sanity-check the load.
DATA_PATH = "../data/Student_performance_data.csv"
df = pd.read_csv(DATA_PATH)
df.head()


Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,1001,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196,2.0
1,1002,18,0,0,1,15.408756,0,0,1,0,0,0,0,3.042915,1.0
2,1003,15,0,2,3,4.21057,26,0,2,0,0,0,0,0.112602,4.0
3,1004,17,1,0,3,10.028829,14,0,3,1,0,0,0,2.054218,3.0
4,1005,17,1,0,2,4.672495,17,1,3,0,0,0,0,1.288061,4.0


## 2. Dataset Overview

- Rows (Students): 2,392
- Columns (Features): 15
- Missing Values: None — the dataset is complete.

In [11]:

# Dataset dimensions, displayed as a (rows, columns) tuple
n_rows, n_cols = df.shape
(n_rows, n_cols)


(2392, 15)

In [12]:

# Column labels as a plain Python list, in file order
list(df.columns)


['StudentID',
 'Age',
 'Gender',
 'Ethnicity',
 'ParentalEducation',
 'StudyTimeWeekly',
 'Absences',
 'Tutoring',
 'ParentalSupport',
 'Extracurricular',
 'Sports',
 'Music',
 'Volunteering',
 'GPA',
 'GradeClass']

In [None]:

# Data type pandas inferred for each column. Every column is numeric
# (int64 / float64), including the coded categorical ones such as Gender.
df.dtypes


StudentID              int64
Age                    int64
Gender                 int64
Ethnicity              int64
ParentalEducation      int64
StudyTimeWeekly      float64
Absences               int64
Tutoring               int64
ParentalSupport        int64
Extracurricular        int64
Sports                 int64
Music                  int64
Volunteering           int64
GPA                  float64
GradeClass           float64
dtype: object

In [None]:

# Number of missing (NaN) entries in each column
df.isna().sum()


StudentID            0
Age                  0
Gender               0
Ethnicity            0
ParentalEducation    0
StudyTimeWeekly      0
Absences             0
Tutoring             0
ParentalSupport      0
Extracurricular      0
Sports               0
Music                0
Volunteering         0
GPA                  0
GradeClass           0
dtype: int64

In [None]:

# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns. NOTE: the coded categorical columns (e.g. Gender, Ethnicity,
# GradeClass) are stored as numbers, so they appear here too; their mean/std
# describe the codes and should not be read as meaningful averages.
df.describe()


Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
count,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0
mean,2196.5,16.468645,0.51087,0.877508,1.746237,9.771992,14.541388,0.301421,2.122074,0.383361,0.303512,0.196906,0.157191,1.906186,2.983696
std,690.655244,1.123798,0.499986,1.028476,1.000411,5.652774,8.467417,0.458971,1.122813,0.486307,0.45987,0.397744,0.364057,0.915156,1.233908
min,1001.0,15.0,0.0,0.0,0.0,0.001057,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1598.75,15.0,0.0,0.0,1.0,5.043079,7.0,0.0,1.0,0.0,0.0,0.0,0.0,1.174803,2.0
50%,2196.5,16.0,1.0,0.0,2.0,9.705363,15.0,0.0,2.0,0.0,0.0,0.0,0.0,1.893393,4.0
75%,2794.25,17.0,1.0,2.0,2.0,14.40841,22.0,1.0,3.0,1.0,1.0,0.0,0.0,2.622216,4.0
max,3392.0,18.0,1.0,3.0,4.0,19.978094,29.0,1.0,4.0,1.0,1.0,1.0,1.0,4.0,4.0


In [None]:

# Summary statistics for all features, transposed so that each feature is a
# row and each statistic is a column (easier to scan for a wide frame)
df.describe(include="all").T


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
StudentID,2392.0,2196.5,690.655244,1001.0,1598.75,2196.5,2794.25,3392.0
Age,2392.0,16.468645,1.123798,15.0,15.0,16.0,17.0,18.0
Gender,2392.0,0.51087,0.499986,0.0,0.0,1.0,1.0,1.0
Ethnicity,2392.0,0.877508,1.028476,0.0,0.0,0.0,2.0,3.0
ParentalEducation,2392.0,1.746237,1.000411,0.0,1.0,2.0,2.0,4.0
StudyTimeWeekly,2392.0,9.771992,5.652774,0.001057,5.043079,9.705363,14.40841,19.978094
Absences,2392.0,14.541388,8.467417,0.0,7.0,15.0,22.0,29.0
Tutoring,2392.0,0.301421,0.458971,0.0,0.0,0.0,1.0,1.0
ParentalSupport,2392.0,2.122074,1.122813,0.0,1.0,2.0,3.0,4.0
Extracurricular,2392.0,0.383361,0.486307,0.0,0.0,0.0,1.0,1.0


In [None]:

# Frequency of each level for the numerically coded categorical columns.
# value_counts() orders the levels from most to least frequent.
categorical_columns = [
    "Gender",
    "Ethnicity",
    "ParentalEducation",
    "Tutoring",
    "ParentalSupport",
    "Extracurricular",
    "Sports",
    "Music",
    "Volunteering",
    "GradeClass",
]

for column_name in categorical_columns:
    level_counts = df[column_name].value_counts()
    print(f"{column_name}:\n{level_counts}\n")


Gender:
Gender
1    1222
0    1170
Name: count, dtype: int64

Ethnicity:
Ethnicity
0    1207
1     493
2     470
3     222
Name: count, dtype: int64

ParentalEducation:
ParentalEducation
2    934
1    728
3    367
0    243
4    120
Name: count, dtype: int64

Tutoring:
Tutoring
0    1671
1     721
Name: count, dtype: int64

ParentalSupport:
ParentalSupport
2    740
3    697
1    489
4    254
0    212
Name: count, dtype: int64

Extracurricular:
Extracurricular
0    1475
1     917
Name: count, dtype: int64

Sports:
Sports
0    1666
1     726
Name: count, dtype: int64

Music:
Music
0    1921
1     471
Name: count, dtype: int64

Volunteering:
Volunteering
0    2016
1     376
Name: count, dtype: int64

GradeClass:
GradeClass
4.0    1211
3.0     414
2.0     391
1.0     269
0.0     107
Name: count, dtype: int64

