In [4]:
#IMPORT BASIC LIBRARIES
import pandas as pd
import numpy as np

In [5]:
#LOAD DATA SET 
data = {
    "Name":["Ravi", "sneha", "Amit", "Priya", "Karan", "Neha"],
    "Math":[85,90, np.nan, 78, 92, 88],
    "Science":[80,np.nan,75,85,95,90],
    "City":["Delhi","Mumbai", "Delhi","Jaipur",np.nan,"Delhi"]
}
df = pd.DataFrame(data)
print("Original Datadet:\n")
print(df)
            
    

Original Datadet:

    Name  Math  Science    City
0   Ravi  85.0     80.0   Delhi
1  sneha  90.0      NaN  Mumbai
2   Amit   NaN     75.0   Delhi
3  Priya  78.0     85.0  Jaipur
4  Karan  92.0     95.0     NaN
5   Neha  88.0     90.0   Delhi


In [6]:
#CHECK MISSING VALUES
print("\nMissing values count:\n")
print(df.isnull().sum())


Missing values count:

Name       0
Math       1
Science    1
City       1
dtype: int64


In [7]:
#FILL MISSING VALUES 
df["Math"].fillna(df["Math"].mean(),inplace = True)
df["Science"].fillna(df["Science"].median(),inplace = True)
df["City"].fillna("Unknown",inplace = True)
print("\nAfter missing values:\n")
print(df)
      


After missing values:

    Name  Math  Science     City
0   Ravi  85.0     80.0    Delhi
1  sneha  90.0     85.0   Mumbai
2   Amit  86.6     75.0    Delhi
3  Priya  78.0     85.0   Jaipur
4  Karan  92.0     95.0  Unknown
5   Neha  88.0     90.0    Delhi


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Math"].fillna(df["Math"].mean(),inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Science"].fillna(df["Science"].median(),inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we 

In [8]:
#Formatting Column
df.columns = df.columns.str.strip().str.lower()
print("\nFormatted columns:\n", df.columns)


Formatted columns:
 Index(['name', 'math', 'science', 'city'], dtype='object')


In [9]:
#Encoding Categorical Data
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df["city_encoded"] = encoder.fit_transform(df["city"])
print("\nEncoded City Column:\n")
print(df[["city","city_encoded"]])


Encoded City Column:

      city  city_encoded
0    Delhi             0
1   Mumbai             2
2    Delhi             0
3   Jaipur             1
4  Unknown             3
5    Delhi             0


In [10]:
#Normalization
#To improve model performance that we normalize numerical values
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df[["math_scaled", "science_scaled"]] = scaler.fit_transform(df[["math", "science"]])
print("\n After Normalization:\n")
print(df)
                                                             


 After Normalization:

    name  math  science     city  city_encoded  math_scaled  science_scaled
0   Ravi  85.0     80.0    Delhi             0     0.500000            0.25
1  sneha  90.0     85.0   Mumbai             2     0.857143            0.50
2   Amit  86.6     75.0    Delhi             0     0.614286            0.00
3  Priya  78.0     85.0   Jaipur             1     0.000000            0.50
4  Karan  92.0     95.0  Unknown             3     1.000000            1.00
5   Neha  88.0     90.0    Delhi             0     0.714286            0.75


In [11]:
#Detecting Outliers
#If some values are very different then we calls it outlier, It confuse ML models
Q1 = df["math"].quantile(0.25)
Q3 = df["math"].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5*IQR
upper = Q3 - 1.5*IQR
outliers = df[(df["math"] < lower) | (df["math"]>upper)]
print("\nOutliers in math:\n", outliers)


Outliers in math:
     name  math  science     city  city_encoded  math_scaled  science_scaled
0   Ravi  85.0     80.0    Delhi             0     0.500000            0.25
1  sneha  90.0     85.0   Mumbai             2     0.857143            0.50
2   Amit  86.6     75.0    Delhi             0     0.614286            0.00
3  Priya  78.0     85.0   Jaipur             1     0.000000            0.50
4  Karan  92.0     95.0  Unknown             3     1.000000            1.00
5   Neha  88.0     90.0    Delhi             0     0.714286            0.75


In [12]:
#Final cleaned Data set
print("\n Final cleaned dataset:\n")
print(df)


 Final cleaned dataset:

    name  math  science     city  city_encoded  math_scaled  science_scaled
0   Ravi  85.0     80.0    Delhi             0     0.500000            0.25
1  sneha  90.0     85.0   Mumbai             2     0.857143            0.50
2   Amit  86.6     75.0    Delhi             0     0.614286            0.00
3  Priya  78.0     85.0   Jaipur             1     0.000000            0.50
4  Karan  92.0     95.0  Unknown             3     1.000000            1.00
5   Neha  88.0     90.0    Delhi             0     0.714286            0.75
