Step 1: Import Required Libraries

In [2]:
import pandas as pd
import numpy as np


Step 2: Load the Dataset

In [None]:
# Read the raw scores file into a DataFrame, then preview both ends of it.
df = pd.read_csv('students_scores.csv')
print(df.head(n=5))  # first 5 rows of the data
print(df.tail(n=5))  # last 5 rows of the data


Step 3: Inspect the Data

In [None]:
# df.info() writes its report (dtypes and non-null counts) straight to stdout
# and returns None, so it is called on its own; wrapping it in print() would
# emit an extra "None" line after the report.
df.info()
print(df.describe())  # summary statistics (mean, std, min, max, ...) for numeric columns
print(df.columns)     # the column labels


Step 4: Handling Missing Values

In [None]:
#Check for missing values

In [None]:
# isnull() marks every missing cell as True; summing those flags column by
# column gives the number of missing values in each column.
missing_per_column = df.isnull().sum()
print(missing_per_column)


In [None]:
#Drop rows with missing values (if very few)

In [None]:
# Discard every row that has at least one missing value in any column
# (axis=0 and how='any' are the defaults, spelled out here for clarity).
df = df.dropna(axis=0, how='any')


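#OR, as an alternative to dropping rows, fill missing values (if many are missing). Use one approach or the other: once dropna() has run, no NaNs remain for fillna() to replace.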

In [None]:
# Impute missing Math scores with the column mean
# (median, mode, etc. are alternative fill values).
math_mean = df['Math'].mean()
df['Math'] = df['Math'].fillna(math_mean)


Step 5: Convert Data Types

In [None]:
# Convert Age to a numeric dtype; unparseable entries become NaN.
age_as_number = pd.to_numeric(df['Age'], errors='coerce')
df['Age'] = age_as_number
# Convert Date to datetime; unparseable entries become NaT.
date_as_timestamp = pd.to_datetime(df['Date'], errors='coerce')
df['Date'] = date_as_timestamp

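#errors='coerce' turns unparseable values into NaN (NaT for dates) instead of raising an error. The missing-value handling in Step 4 has already run by this point, so re-check df.isnull().sum() afterwards and handle any new gaps.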

Step 6: Remove Duplicates


In [None]:
# Drop rows that exactly repeat an earlier row, keeping the first occurrence
# (keep='first' is the default, spelled out here for clarity).
df = df.drop_duplicates(keep='first')


In [None]:
# Step 7: Clean Text Columns

In [None]:
# Trim leading and trailing whitespace from names.
trimmed_names = df['Name'].str.strip()
df['Name'] = trimmed_names
# Title-case the gender labels so the capitalisation is consistent.
gender_title_case = df['Gender'].str.title()
df['Gender'] = gender_title_case



Step 8: Rename Columns (if needed)

In [None]:
# Rename columns to simpler, more consistent names. The result is reassigned
# rather than using inplace=True, matching the `df = df.<method>()` pattern
# used by the other cleaning steps in this notebook.
# NOTE(review): Step 4 already reads df['Math']; if the raw CSV's header is
# 'Math Score', this rename has to run before that step — confirm against the file.
df = df.rename(columns={'Math Score': 'Math', 'Eng Score': 'English'})

Step 9: Handle Outliers

In [None]:
# Remove Math outliers using the Interquartile Range (IQR) method: keep only
# the rows whose score lies within 1.5 * IQR of the first and third quartiles.
q1, q3 = df['Math'].quantile([0.25, 0.75])
iqr = q3 - q1

lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

# between() includes both endpoints by default, i.e. lower <= Math <= upper.
df = df[df['Math'].between(lower_bound, upper_bound)]



Step 10: Save the Cleaned Data

In [None]:
# Save the cleaned dataset to a new file; index=False leaves the row index
# out of the CSV.
df.to_csv(path_or_buf='cleaned_students_scores.csv', index=False)
