In [2]:
!pip install pandas


Collecting pandas
  Downloading pandas-2.2.3-cp313-cp313-win_amd64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp313-cp313-win_amd64.whl (11.5 MB)
   ---------------------------------------- 0.0/11.5 MB ? eta -:--:--
   ---- ----------------------------------- 1.3/11.5 MB 8.8 MB/s eta 0:00:02
   ---------- ----------------------------- 3.1/11.5 MB 8.0 MB/s eta 0:00:02
   --------------- ------------------------ 4.5/11.5 MB 7.4 MB/s eta 0:00:01
   ------------------ --------------------- 5.2/11.5 MB 6.6 MB/s eta 0:00:01
   --------------------- ------------------ 6.0/11.5 MB 5.9 MB/s eta 0:00:01
   ------------------------ --------------- 7.1/11.5 MB 5.6 MB/s eta 0:00:01
   --------------------------- ------------ 7.9/11.5 MB 5.4 MB/s eta 0:00:01
   ------------------------------

In [3]:
import pandas as pd

# Build a small student-score table to practise cleaning on.
# Math_Score and English_Score each contain one missing entry (None), and the
# Math_Score of 105 is included as a suspicious-looking value.
data = dict(
    Student_ID=list(range(1, 7)),
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank'],
    Math_Score=[88, 92, None, 70, 65, 105],
    Science_Score=[90, 85, 95, 78, 60, 65],
    English_Score=[85, None, 80, 70, 75, 85],
)

df = pd.DataFrame(data)
print(df)


   Student_ID     Name  Math_Score  Science_Score  English_Score
0           1    Alice        88.0             90           85.0
1           2      Bob        92.0             85            NaN
2           3  Charlie         NaN             95           80.0
3           4    David        70.0             78           70.0
4           5      Eva        65.0             60           75.0
5           6    Frank       105.0             65           85.0


In [5]:
# Report how many values are missing in each column before imputing.
print("\nMissing Values:\n")
print(df.isnull().sum())

# Impute missing Math and English scores with each column's mean.
# Series.fillna returns a new Series rather than modifying the column, so the
# result has to be assigned back; otherwise df is left unchanged.
df['Math_Score'] = df['Math_Score'].fillna(df['Math_Score'].mean())
df['English_Score'] = df['English_Score'].fillna(df['English_Score'].mean())

print("\nAfter filling missing values:\n")
print(df)



Missing Values:

Student_ID       0
Name             0
Math_Score       0
Science_Score    0
English_Score    0
dtype: int64

After filling missing values:

   Student_ID     Name  Math_Score  Science_Score  English_Score
0           1    Alice        88.0             90           85.0
1           2      Bob        92.0             85           79.0
2           3  Charlie        84.0             95           80.0
3           4    David        70.0             78           70.0
4           5      Eva        65.0             60           75.0
5           6    Frank       105.0             65           85.0


In [7]:
# Step 5: Compute the IQR fences for Math_Score
Q1 = df['Math_Score'].quantile(0.25)
Q3 = df['Math_Score'].quantile(0.75)
IQR = Q3 - Q1

# Anything further than 1.5 * IQR outside the quartiles counts as an outlier
whisker = 1.5 * IQR
lower_bound = Q1 - whisker
upper_bound = Q3 + whisker

print("\nLower Bound:", lower_bound)
print("Upper Bound:", upper_bound)

# Step 6: Flag the rows whose Math_Score falls outside the fences
# (the same boolean mask is reused below for the replacement)
is_outlier = df['Math_Score'].lt(lower_bound) | df['Math_Score'].gt(upper_bound)
outliers = df[is_outlier]
print("\nOutliers in Math_Score:")
print(outliers)

# Step 7: Overwrite the flagged scores with the column median
median_math = df['Math_Score'].median()
df.loc[is_outlier, 'Math_Score'] = median_math

print("\nDataset after handling outliers:")
print(df)



Lower Bound: 47.25
Upper Bound: 117.25

Outliers in Math_Score:
Empty DataFrame
Columns: [Student_ID, Name, Math_Score, Science_Score, English_Score]
Index: []

Dataset after handling outliers:
   Student_ID     Name  Math_Score  Science_Score  English_Score
0           1    Alice        88.0             90           85.0
1           2      Bob        92.0             85           79.0
2           3  Charlie        84.0             95           80.0
3           4    David        70.0             78           70.0
4           5      Eva        65.0             60           75.0
5           6    Frank       105.0             65           85.0


In [8]:
# Step 8: Rescale Math_Score with Min-Max scaling
min_value, max_value = df['Math_Score'].min(), df['Math_Score'].max()

# scaled = (score - min) / (max - min): the lowest score maps to 0, the highest to 1
df['Math_Score_Scaled'] = df['Math_Score'].sub(min_value).div(max_value - min_value)

print("\nDataset after Min-Max Scaling of Math_Score:")
print(df)



Dataset after Min-Max Scaling of Math_Score:
   Student_ID     Name  Math_Score  Science_Score  English_Score  \
0           1    Alice        88.0             90           85.0   
1           2      Bob        92.0             85           79.0   
2           3  Charlie        84.0             95           80.0   
3           4    David        70.0             78           70.0   
4           5      Eva        65.0             60           75.0   
5           6    Frank       105.0             65           85.0   

   Math_Score_Scaled  
0              0.575  
1              0.675  
2              0.475  
3              0.125  
4              0.000  
5              1.000  
