## 4. Data Cleaning & Transformation

* 4.1 Standardize Column Names 

In [1]:
import pandas as pd

# Location of the raw CSV.
# NOTE(review): this is an absolute path on one specific machine; point it at
# wherever the dataset lives before running the notebook anywhere else.
DATA_PATH = "C:\\Users\\student\\Downloads\\DataSet StudentsPerformances.csv"

# Load your data
df = pd.read_csv(DATA_PATH)

# Standardize column names: trim surrounding whitespace, lowercase, then turn
# every run of internal whitespace into ONE underscore (so a doubled space or a
# tab in a header cannot produce '__' or survive unreplaced).
# Other punctuation is left alone: 'race/ethnicity' keeps its slash because the
# later type-conversion cell refers to the column by that exact name.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(r"\s+", "_", regex=True)
)

# View the new column names
print(df.columns)


Index(['gender', 'race/ethnicity', 'parental_level_of_education', 'lunch',
       'test_preparation_course', 'math_score', 'reading_score',
       'writing_score'],
      dtype='object')


* 4.2 Convert Data Types

In [5]:
# Columns holding test scores (whole numbers)
score_cols = ['math_score', 'reading_score', 'writing_score']

# Low-cardinality text columns; 'category' dtype stores them more compactly
cat_cols = ['gender', 'race/ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course']

# Build one column -> dtype mapping and convert in a single pass.
# The integer width is spelled out ('int32') instead of the bare `int`, whose
# width depends on the platform / NumPy version; this keeps df.dtypes and the
# reported memory usage the same on every machine.
dtype_map = {col: 'int32' for col in score_cols}
dtype_map.update({col: 'category' for col in cat_cols})
df = df.astype(dtype_map)


In [11]:

# Print the dtype of every column in df
print(df.dtypes)


gender                         category
race/ethnicity                 category
parental_level_of_education    category
lunch                          category
test_preparation_course        category
math_score                        int32
reading_score                     int32
writing_score                     int32
dtype: object


In [10]:

# Preview the first rows of df (head() returns 5 rows by default)
print(df.head())


   gender race/ethnicity parental_level_of_education         lunch  \
0  female        group B           bachelor's degree      standard   
1  female        group C                some college      standard   
2  female        group B             master's degree      standard   
3    male        group A          associate's degree  free/reduced   
4    male        group C                some college      standard   

  test_preparation_course  math_score  reading_score  writing_score  
0                    none          72             72             74  
1               completed          69             90             88  
2                    none          90             95             93  
3                    none          47             57             44  
4                    none          76             78             75  


In [9]:

# Print a summary of df: index range, per-column non-null counts and dtypes,
# and memory usage
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   gender                       1000 non-null   category
 1   race/ethnicity               1000 non-null   category
 2   parental_level_of_education  1000 non-null   category
 3   lunch                        1000 non-null   category
 4   test_preparation_course      1000 non-null   category
 5   math_score                   1000 non-null   int32   
 6   reading_score                1000 non-null   int32   
 7   writing_score                1000 non-null   int32   
dtypes: category(5), int32(3)
memory usage: 17.5 KB


* 4.3 Feature Engineering


In [13]:
# Minimum average score required for a 'Pass'
PASS_THRESHOLD = 40

# Create a total score column by adding all three test scores
df['total_score'] = df['math_score'] + df['reading_score'] + df['writing_score']

# Create an average score column (mean of the three tests)
df['average_score'] = df['total_score'] / 3

# Create a 'pass/fail' flag: 'Pass' when the average is at or above the
# threshold, otherwise 'Fail'. The comparison is vectorized over the whole
# column instead of calling a Python lambda once per row.
df['status'] = (
    df['average_score']
    .ge(PASS_THRESHOLD)
    .map({True: 'Pass', False: 'Fail'})
)


* 4.4 Final Validation


In [14]:
# CHECKING NULL VALUES
# Count of missing values in each column, including the engineered ones
print(df.isnull().sum())

# Preview the first rows with the new columns attached
print(df.head())

# info() writes its report itself and returns None, so it must not be wrapped
# in print() -- doing so appends a stray "None" line to the output.
df.info()


gender                         0
race/ethnicity                 0
parental_level_of_education    0
lunch                          0
test_preparation_course        0
math_score                     0
reading_score                  0
writing_score                  0
total_score                    0
average_score                  0
status                         0
dtype: int64
   gender race/ethnicity parental_level_of_education         lunch  \
0  female        group B           bachelor's degree      standard   
1  female        group C                some college      standard   
2  female        group B             master's degree      standard   
3    male        group A          associate's degree  free/reduced   
4    male        group C                some college      standard   

  test_preparation_course  math_score  reading_score  writing_score  \
0                    none          72             72             74   
1               completed          69             90         

In [17]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of df
df.describe()

Unnamed: 0,math_score,reading_score,writing_score,total_score,average_score
count,1000.0,1000.0,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054,203.312,67.770667
std,15.16308,14.600192,15.195657,42.771978,14.257326
min,0.0,17.0,10.0,27.0,9.0
25%,57.0,59.0,57.75,175.0,58.333333
50%,66.0,70.0,69.0,205.0,68.333333
75%,77.0,79.0,79.0,233.0,77.666667
max,100.0,100.0,100.0,300.0,100.0
