In [31]:
# Import libraries
import pandas as pd
import numpy as np

# Read the merged dataset (path is relative to the notebook) into `df`
merged_csv_path = '../data/merged_dataset.csv'
df = pd.read_csv(merged_csv_path)

In [32]:
# Overview of the loaded frame: dimensions, column labels, per-column dtypes
overview = (
    ("Dataset Shape:", df.shape),
    ("\nColumns:\n", df.columns),
    ("\nData Types:\n", df.dtypes),
)
for heading, value in overview:
    print(heading, value)

Dataset Shape: (14003, 16)

Columns:
 Index(['StudyHours', 'Attendance', 'Resources', 'Extracurricular',
       'Motivation', 'Internet', 'Gender', 'Age', 'LearningStyle',
       'OnlineCourses', 'Discussions', 'AssignmentCompletion', 'ExamScore',
       'EduTech', 'StressLevel', 'FinalGrade'],
      dtype='object')

Data Types:
 StudyHours              int64
Attendance              int64
Resources               int64
Extracurricular         int64
Motivation              int64
Internet                int64
Gender                  int64
Age                     int64
LearningStyle           int64
OnlineCourses           int64
Discussions             int64
AssignmentCompletion    int64
ExamScore               int64
EduTech                 int64
StressLevel             int64
FinalGrade              int64
dtype: object


In [33]:
# Preview the first 5 rows of the frame
first_rows = df.head()
print("\nSample Data:\n", first_rows)


Sample Data:
    StudyHours  Attendance  Resources  Extracurricular  Motivation  Internet  \
0          19          64          1                0           0         1   
1          19          64          1                0           0         1   
2          19          64          1                0           0         1   
3          19          64          1                1           0         1   
4          19          64          1                1           0         1   

   Gender  Age  LearningStyle  OnlineCourses  Discussions  \
0       0   19              2              8            1   
1       0   23              3             16            0   
2       0   28              1             19            0   
3       0   19              2              8            1   
4       0   23              3             16            0   

   AssignmentCompletion  ExamScore  EduTech  StressLevel  FinalGrade  
0                    59         40        0            1           3  
1

In [34]:
# Count null entries in each column
null_counts = df.isnull().sum()
print("\nMissing Values:\n", null_counts)


Missing Values:
 StudyHours              0
Attendance              0
Resources               0
Extracurricular         0
Motivation              0
Internet                0
Gender                  0
Age                     0
LearningStyle           0
OnlineCourses           0
Discussions             0
AssignmentCompletion    0
ExamScore               0
EduTech                 0
StressLevel             0
FinalGrade              0
dtype: int64


In [35]:
# Summary statistics (count, mean, std, min, quartiles, max) for numeric columns
summary_stats = df.describe()
print("\nStatistics:\n", summary_stats)


Statistics:
          StudyHours    Attendance     Resources  Extracurricular  \
count  14003.000000  14003.000000  14003.000000     14003.000000   
mean      19.987431     80.194316      1.104406         0.594158   
std        5.890637     11.472181      0.697362         0.491072   
min        5.000000     60.000000      0.000000         0.000000   
25%       16.000000     70.000000      1.000000         0.000000   
50%       20.000000     80.000000      1.000000         1.000000   
75%       24.000000     90.000000      2.000000         1.000000   
max       44.000000    100.000000      2.000000         1.000000   

         Motivation      Internet        Gender           Age  LearningStyle  \
count  14003.000000  14003.000000  14003.000000  14003.000000   14003.000000   
mean       0.905806      0.925516      0.551953     23.532172       1.515461   
std        0.695896      0.262566      0.497311      3.514293       1.112941   
min        0.000000      0.000000      0.000000     1

In [36]:
# 1. Check for duplicates
# Count fully identical rows, remove the repeats, then re-count to confirm
# that none remain.
print("Duplicates before:", df.duplicated().sum())
# Rebind instead of using inplace=True, and renumber the index 0..n-1 so no
# gaps are left where rows were removed. This keeps the in-memory frame
# consistent with what is reloaded from a CSV written with index=False, and
# keeps any positional arrays attached to the frame later correctly aligned.
df = df.drop_duplicates(ignore_index=True)
print("Duplicates after:", df.duplicated().sum())

Duplicates before: 1534
Duplicates after: 0


In [37]:
# 2. Check for missing values
missing_counts = df.isnull().sum()
print("Missing values:\n", missing_counts)
# Fill missing numeric values (if any) with column mean.
# numeric_only=True keeps this cell working if a non-numeric column is ever
# present (DataFrame.mean() raises on such columns in pandas >= 2.0).
# The result is rebound to `df` rather than mutated with inplace=True.
# NOTE(review): several columns look like integer-coded categories (e.g.
# Gender, LearningStyle); a mean would produce fractional codes there —
# confirm the imputation strategy if missing values ever appear.
df = df.fillna(df.mean(numeric_only=True))

Missing values:
 StudyHours              0
Attendance              0
Resources               0
Extracurricular         0
Motivation              0
Internet                0
Gender                  0
Age                     0
LearningStyle           0
OnlineCourses           0
Discussions             0
AssignmentCompletion    0
ExamScore               0
EduTech                 0
StressLevel             0
FinalGrade              0
dtype: int64


In [38]:
# 3. Scaling features for clustering/statistics
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Columns that get rescaled; every other column is carried over unchanged
feature_cols = [
    'StudyHours', 'Attendance', 'Resources', 'Extracurricular',
    'Motivation', 'Internet', 'OnlineCourses', 'Discussions',
    'AssignmentCompletion', 'EduTech', 'StressLevel', 'Age',
]


def _scaled_copy(frame, scaler, columns):
    """Return a copy of `frame` whose `columns` hold `scaler.fit_transform` output.

    The scaler is fitted on `frame[columns]` as a side effect; `frame` itself
    is not modified.
    """
    rescaled = frame.copy()
    rescaled[columns] = scaler.fit_transform(frame[columns])
    return rescaled


# Z-score standardization
scaler_z = StandardScaler()
df_scaled_z = _scaled_copy(df, scaler_z, feature_cols)

# Min-Max normalization for clustering
scaler_mm = MinMaxScaler()
df_scaled_mm = _scaled_copy(df, scaler_mm, feature_cols)

print("Data cleaning and scaling done!")

Data cleaning and scaling done!


In [39]:
# Persist the cleaned (unscaled) frame; index=False leaves the row index out of the file
cleaned_csv_path = '../data/merged_dataset_cleaned.csv'
df.to_csv(cleaned_csv_path, index=False)