In [2]:
import pandas as pd

df = pd.read_csv('StudentsPerformance.csv')


In [3]:
df

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [4]:
print(df.shape)
print(df.columns)
print(df.info())
print(df.head())


(1000, 8)
Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'math score', 'reading score',
       'writing score'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB
None
   gender race/ethnicity parental level of education         lunch  \
0 

In [6]:
# Normalize Column Names (remove spaces, lowercase)
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')


In [8]:
#❓ 4. Check for Missing Values
print(df.isnull().sum())


gender                         0
race/ethnicity                 0
parental_level_of_education    0
lunch                          0
test_preparation_course        0
math_score                     0
reading_score                  0
writing_score                  0
dtype: int64


In [9]:
#➡️ This dataset usually has no missing values, but if any introduced:
# Example: Fill missing scores with the column mean
# df['math_score'].fillna(df['math_score'].mean(), inplace=True)
# df['reading_score'].fillna(df['reading_score'].mean(), inplace=True)
# df['writing_score'].fillna(df['writing_score'].mean(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['math_score'].fillna(df['math_score'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['reading_score'].fillna(df['reading_score'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate

In [10]:
#Let’s check unique values in gender, race/ethnicity, and others:
print(df['gender'].unique())
print(df['race/ethnicity'].unique())
print(df['parental_level_of_education'].unique())


['female' 'male']
['group B' 'group C' 'group A' 'group D' 'group E']
["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']


In [11]:
#🛠️ Clean inconsistencies (e.g., case sensitivity or typos):
df['gender'] = df['gender'].str.strip().str.lower().replace({'female': 'Female', 'male': 'Male'})

# Capitalize parental education for consistency
df['parental_level_of_education'] = df['parental_level_of_education'].str.title()


In [12]:
# 🔢  Convert Score Columns to Numeric
df['math_score'] = pd.to_numeric(df['math_score'], errors='coerce')
df['reading_score'] = pd.to_numeric(df['reading_score'], errors='coerce')
df['writing_score'] = pd.to_numeric(df['writing_score'], errors='coerce')
# errors='coerce' tells pandas:

#     If it can't convert, replace the value with NaN (Not a Number).

In [13]:
df

Unnamed: 0,gender,race/ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,Female,group B,Bachelor'S Degree,standard,none,72,72,74
1,Female,group C,Some College,standard,completed,69,90,88
2,Female,group B,Master'S Degree,standard,none,90,95,93
3,Male,group A,Associate'S Degree,free/reduced,none,47,57,44
4,Male,group C,Some College,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,Female,group E,Master'S Degree,standard,completed,88,99,95
996,Male,group C,High School,free/reduced,none,62,55,55
997,Female,group C,High School,free/reduced,completed,59,71,65
998,Female,group D,Some College,standard,completed,68,78,77


In [14]:
#Add Total and Average Score Columns
df['total_score'] = df['math_score'] + df['reading_score'] + df['writing_score']
df['average_score'] = df['total_score'] / 3


In [15]:
df

Unnamed: 0,gender,race/ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score,total_score,average_score
0,Female,group B,Bachelor'S Degree,standard,none,72,72,74,218,72.666667
1,Female,group C,Some College,standard,completed,69,90,88,247,82.333333
2,Female,group B,Master'S Degree,standard,none,90,95,93,278,92.666667
3,Male,group A,Associate'S Degree,free/reduced,none,47,57,44,148,49.333333
4,Male,group C,Some College,standard,none,76,78,75,229,76.333333
...,...,...,...,...,...,...,...,...,...,...
995,Female,group E,Master'S Degree,standard,completed,88,99,95,282,94.000000
996,Male,group C,High School,free/reduced,none,62,55,55,172,57.333333
997,Female,group C,High School,free/reduced,completed,59,71,65,195,65.000000
998,Female,group D,Some College,standard,completed,68,78,77,223,74.333333


In [16]:
 # Final Check
print(df.isnull().sum())
print(df.describe())
print(df.dtypes)


gender                         0
race/ethnicity                 0
parental_level_of_education    0
lunch                          0
test_preparation_course        0
math_score                     0
reading_score                  0
writing_score                  0
total_score                    0
average_score                  0
dtype: int64
       math_score  reading_score  writing_score  total_score  average_score
count  1000.00000    1000.000000    1000.000000  1000.000000    1000.000000
mean     66.08900      69.169000      68.054000   203.312000      67.770667
std      15.16308      14.600192      15.195657    42.771978      14.257326
min       0.00000      17.000000      10.000000    27.000000       9.000000
25%      57.00000      59.000000      57.750000   175.000000      58.333333
50%      66.00000      70.000000      69.000000   205.000000      68.333333
75%      77.00000      79.000000      79.000000   233.000000      77.666667
max     100.00000     100.000000     100.000000  

In [17]:
df.to_csv('students_performance_cleaned.csv', index=False)
