# Step 1: Load Dataset
Load the CSV dataset into a pandas DataFrame.


In [5]:
import pandas as pd

# Location of the raw CSV.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook only runs where this exact file exists — consider a relative path.
csv_path = r"C:\Users\bhanu\Desktop\Student\student_performance_data.csv"

# Read the file into a DataFrame and preview the first five rows.
df = pd.read_csv(csv_path)
df.head()


Unnamed: 0,StudentID,Age,Gender,SocioeconomicStatus,Grades,Attendance,TimeSpentOnHomework,ClassParticipation,AcademicPerformanceStatus
0,S0001,21,Female,High,73.821849,99.578045,3.153141,Medium,Pass
1,S0002,18,Female,Low,61.30145,95.580772,3.160019,Medium,Pass
2,S0003,19,Female,Low,82.023802,74.858691,1.594093,Low,Pass
3,S0004,21,Male,Low,98.949056,67.806661,1.937389,Medium,Pass
4,S0005,17,Female,Middle,80.174655,79.575701,1.233916,High,Pass


# Step 2: Check Dataset Shape and Info
Understand the size and data types of the dataset.


In [6]:
# Report (rows, columns) first, then the per-column dtype / non-null summary.
rows_and_cols = df.shape
print("Dataset shape:", rows_and_cols)
df.info()


Dataset shape: (1000, 9)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   StudentID                  1000 non-null   object 
 1   Age                        1000 non-null   int64  
 2   Gender                     1000 non-null   object 
 3   SocioeconomicStatus        1000 non-null   object 
 4   Grades                     1000 non-null   float64
 5   Attendance                 1000 non-null   float64
 6   TimeSpentOnHomework        1000 non-null   float64
 7   ClassParticipation         1000 non-null   object 
 8   AcademicPerformanceStatus  1000 non-null   object 
dtypes: float64(3), int64(1), object(5)
memory usage: 70.4+ KB


# Step 3: Check for Missing Values
Verify if any columns have missing data.


In [7]:
# Count missing entries per column; a column of zeros means nothing to impute.
missing_per_column = df.isna().sum()
missing_per_column


StudentID                    0
Age                          0
Gender                       0
SocioeconomicStatus          0
Grades                       0
Attendance                   0
TimeSpentOnHomework          0
ClassParticipation           0
AcademicPerformanceStatus    0
dtype: int64

# Step 4: Summary Statistics of Numerical Features
Get a statistical summary for numerical columns.


In [8]:
# With default arguments describe() summarises only the numeric columns here
# (count, mean, std, min, quartiles, max); the string columns are skipped.
numeric_summary = df.describe()
numeric_summary


Unnamed: 0,Age,Grades,Attendance,TimeSpentOnHomework
count,1000.0,1000.0,1000.0,1000.0
mean,17.96,74.892793,79.448395,2.245859
std,2.0036,14.428974,11.417765,1.009563
min,15.0,50.000582,60.001229,0.500795
25%,16.0,62.442623,69.451976,1.400813
50%,18.0,75.155411,79.329503,2.191778
75%,20.0,87.201111,89.137688,3.131317
max,21.0,99.944631,99.982308,3.995775


# Step 5: Encoding Categorical Variables
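Encode categorical columns into numeric values using Label Encoding:
- Gender
- SocioeconomicStatus
- ClassParticipation
- AcademicPerformanceStatus (target)

Note: `LabelEncoder` assigns integer codes in alphabetical order of the labels (e.g. High → 0, Low → 1, Middle → 2 for SocioeconomicStatus), so the codes of the ordinal columns do not follow their natural low-to-high order.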


In [11]:
from sklearn.preprocessing import LabelEncoder

# String-valued columns that must become integers before modelling
# (the last entry is the prediction target).
categorical_cols = ['Gender', 'SocioeconomicStatus', 'ClassParticipation', 'AcademicPerformanceStatus']

# Guard against re-running this cell on an already-encoded frame. Without it,
# a second execution would silently refit every encoder on the integer codes
# and overwrite `label_encoders`, losing the original label <-> code mapping.
# The check runs before any state is touched so existing encoders survive.
already_encoded = [col for col in categorical_cols if pd.api.types.is_numeric_dtype(df[col])]
if already_encoded:
    raise RuntimeError(
        f"Columns already label-encoded: {already_encoded}. "
        "Reload the dataset (Step 1) before running this cell again."
    )

# One fitted encoder per column, kept so integer codes can later be mapped
# back to their labels via label_encoders[col].inverse_transform(...).
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    # Codes follow the sorted (alphabetical) order of the labels, not any
    # natural ordinal order such as Low < Medium < High.
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

df.head()


Unnamed: 0,StudentID,Age,Gender,SocioeconomicStatus,Grades,Attendance,TimeSpentOnHomework,ClassParticipation,AcademicPerformanceStatus
0,S0001,21,0,0,73.821849,99.578045,3.153141,2,1
1,S0002,18,0,1,61.30145,95.580772,3.160019,2,1
2,S0003,19,0,1,82.023802,74.858691,1.594093,1,1
3,S0004,21,1,1,98.949056,67.806661,1.937389,2,1
4,S0005,17,0,2,80.174655,79.575701,1.233916,0,1


# Step 6: Drop Unnecessary Columns
Remove the 'StudentID' column as it is a unique identifier and does not contribute to the prediction.


In [12]:
# Drop the identifier column. Rebinding `df` (instead of inplace=True) together
# with errors='ignore' makes this cell safe to re-run: a second execution finds
# no 'StudentID' column and is a no-op rather than raising KeyError.
df = df.drop(columns=['StudentID'], errors='ignore')


# Step 7: Split Features and Target Variable
Separate independent features (X) and dependent target (y).


In [13]:
# Name of the label column; every other column remaining in df is a feature.
target_col = 'AcademicPerformanceStatus'

y = df[target_col]
X = df.drop(columns=[target_col])


# Step 8: Split Dataset into Training and Test Sets
Split data into training and testing sets (80%-20%) with stratification to preserve class distribution.


In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The fixed seed makes the split
# reproducible, and stratifying on y keeps the class proportions of the
# target the same in both partitions.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


# Step 9: Scale Numerical Features
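Standardize every column of X (the numeric features as well as the label-encoded categorical ones) with StandardScaler. The scaler is fit on the training split only and then applied to the test split, so no test-set statistics leak into training.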


In [16]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply that
# same fitted transformation to both splits so the test set never influences
# the parameters.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
