In [4]:
import pandas as pd
df=pd.read_csv("data_cleaning_sample.csv")

<h1>Handling Missing Values</h1>

<h3>1. Check for Missing Data</h3>

In [5]:
df

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
1,Charlie,,Delhi,M,charlie@example,20-07-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
3,Charlie,,Delhi,M,charlie@example,20-07-2021
4,David,22.0,Mumbai,M,david@example.com,12-11-2019
5,,28.0,Delhi,F,eve@domain.com,
6,Alice,25.0,New York,F,alice@example.com,01-05-2021
7,Alice,25.0,New York,F,alice@example.com,01-05-2021
8,Charlie,,Delhi,M,charlie@example,20-07-2021


In [6]:
df.isnull()              # True for NaNs

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,False,False,False,False,False,False
1,False,True,False,False,False,False
2,False,False,False,False,False,False
3,False,True,False,False,False,False
4,False,False,False,False,False,False
5,True,False,False,False,False,True
6,False,False,False,False,False,False
7,False,False,False,False,False,False
8,False,True,False,False,False,False


In [7]:
df.isnull().sum()        # Count missing per column

Name         1
Age          3
City         0
Gender       0
Email        0
Join Date    1
dtype: int64

<h3>2. Drop Missing Data</h3>

In [8]:
df.dropna()              # Drop rows with *any* missing values

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
4,David,22.0,Mumbai,M,david@example.com,12-11-2019
6,Alice,25.0,New York,F,alice@example.com,01-05-2021
7,Alice,25.0,New York,F,alice@example.com,01-05-2021


In [9]:
df.dropna(axis=1)        # Drop columns with missing values

Unnamed: 0,City,Gender,Email
0,New York,F,alice@example.com
1,Delhi,M,charlie@example
2,Los Angeles,M,bob@example.com
3,Delhi,M,charlie@example
4,Mumbai,M,david@example.com
5,Delhi,F,eve@domain.com
6,New York,F,alice@example.com
7,New York,F,alice@example.com
8,Delhi,M,charlie@example


<h3>3. Fill Missing Data</h3>

In [10]:
df.fillna(0)                     # Replace NaN with 0

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
1,Charlie,0.0,Delhi,M,charlie@example,20-07-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
3,Charlie,0.0,Delhi,M,charlie@example,20-07-2021
4,David,22.0,Mumbai,M,david@example.com,12-11-2019
5,0,28.0,Delhi,F,eve@domain.com,0
6,Alice,25.0,New York,F,alice@example.com,01-05-2021
7,Alice,25.0,New York,F,alice@example.com,01-05-2021
8,Charlie,0.0,Delhi,M,charlie@example,20-07-2021


In [11]:
df["Age"].fillna(df["Age"].mean())  # Replace with mean

0    25.000000
1    25.833333
2    30.000000
3    25.833333
4    22.000000
5    28.000000
6    25.000000
7    25.000000
8    25.833333
Name: Age, dtype: float64

In [12]:
df.ffill()      # Forward fill

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
1,Charlie,25.0,Delhi,M,charlie@example,20-07-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
3,Charlie,30.0,Delhi,M,charlie@example,20-07-2021
4,David,22.0,Mumbai,M,david@example.com,12-11-2019
5,David,28.0,Delhi,F,eve@domain.com,12-11-2019
6,Alice,25.0,New York,F,alice@example.com,01-05-2021
7,Alice,25.0,New York,F,alice@example.com,01-05-2021
8,Charlie,25.0,Delhi,M,charlie@example,20-07-2021


In [13]:
df.bfill()      # Backward fill

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
1,Charlie,30.0,Delhi,M,charlie@example,20-07-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
3,Charlie,22.0,Delhi,M,charlie@example,20-07-2021
4,David,22.0,Mumbai,M,david@example.com,12-11-2019
5,Alice,28.0,Delhi,F,eve@domain.com,01-05-2021
6,Alice,25.0,New York,F,alice@example.com,01-05-2021
7,Alice,25.0,New York,F,alice@example.com,01-05-2021
8,Charlie,,Delhi,M,charlie@example,20-07-2021


<h3>4. Detecting & Removing Duplicates</h3>

df.duplicated() returns a boolean Series where: True means that row is a duplicate of a previous row. False means it's the first occurrence (not a duplicate yet).

In [14]:
df.duplicated()          # True for duplicates

0    False
1    False
2    False
3     True
4    False
5    False
6     True
7     True
8     True
dtype: bool

In [36]:
df=df.drop_duplicates()     # Remove duplicate rows     subset=["col1","col2"]

In [16]:
df.duplicated(subset=["Name", "Age"])

0    False
1    False
2    False
3     True
4    False
5    False
6     True
7     True
8     True
dtype: bool

<h3>5. String Operations with .str</h3>

In [17]:
df["Name"].str.lower() # Converts all names to lowercase.

0      alice
1    charlie
2        bob
3    charlie
4      david
5        NaN
6      alice
7      alice
8    charlie
Name: Name, dtype: object

In [18]:
df["City"].str.contains("delhi", case=False) # Checks if 'delhi' is in the city name, case-insensitive.

0    False
1     True
2    False
3     True
4    False
5     True
6    False
7    False
8     True
Name: City, dtype: bool

In [23]:
df["Email"].str.split("@")     #df["Email"].str.split("@") # Outputs a pandas Series where each element is a list of strings (the split parts). This is where a Python list comes into play, but the outer object is still a pandas Series.

0    [alice, example.com]
1      [charlie, example]
2      [bob, example.com]
3      [charlie, example]
4    [david, example.com]
5       [eve, domain.com]
6    [alice, example.com]
7    [alice, example.com]
8      [charlie, example]
Name: Email, dtype: object

We can always chain methods like .str.strip().str.upper() for clean-up.



<h3>6. Type Conversions with .astype()</h3>

In [57]:
df2=df.dropna().copy()
df2["Age"] = df2["Age"].astype(int)
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, 0 to 4
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Name       3 non-null      object        
 1   Age        3 non-null      int32         
 2   City       3 non-null      object        
 3   Gender     3 non-null      object        
 4   Email      3 non-null      object        
 5   Join Date  3 non-null      datetime64[ns]
dtypes: datetime64[ns](1), int32(1), object(4)
memory usage: 156.0+ bytes


In [62]:
df2["Join Date"]=pd.to_datetime(df2["Join Date"])
df["Join Date"].dtype


dtype('<M8[ns]')

Why is pd.to_datetime() special?
Unlike astype(), which works on simple data types (like integers, strings, etc.), pd.to_datetime() is designed to:
<ul>
        <li>Handle different date formats (e.g., "YYYY-MM-DD", "MM/DD/YYYY", etc.).</li>
        <li>Handle mixed types (e.g., some date strings, some NaT, or missing values).</li>
        <li>Convert integer timestamps (e.g., UNIX time) into datetime objects.</li> 
        <li>Recognize timezones if provided.</li>
</ul>

<h3>7. Applying Functions</h3>
.apply() → Apply any function to rows or columns

In [66]:
df2["Age Group"] = df2["Age"].apply(lambda x: "Adult" if x >= 23 else "Minor")
df2

Unnamed: 0,Name,Age,City,Gender,Email,Join Date,Age Group
0,Alice,25,New York,F,alice@example.com,2021-05-01,Adult
2,Bob,30,Los Angeles,M,bob@example.com,2020-06-15,Adult
4,David,22,Mumbai,M,david@example.com,2019-11-12,Minor


.map() → Element-wise mapping for Series

In [69]:
gender_map = {"M": "Male", "F": "Female"}
df2["Gender"] = df2["Gender"].map(gender_map)
df

Unnamed: 0,Name,Age,City,Gender,Email,Join Date,Age Group
0,Alice,25.0,New York,Female,alice@example.com,2021-05-01,Adult
1,Charlie,,Delhi,Male,charlie@example,2021-07-20,Minor
2,Bob,30.0,Los Angeles,Male,bob@example.com,2020-06-15,Adult
4,David,22.0,Mumbai,Male,david@example.com,2019-11-12,Adult
5,,28.0,Delhi,Female,eve@domain.com,NaT,Adult


.replace() → Replace specific values

In [110]:
df3=pd.read_csv("data_cleaning_sample.csv")
df3["City"]=df3["City"].replace({"Delhi": "New Delhi", "Mumbai": "New Mumbai"})
df3

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
1,Charlie,,New Delhi,M,charlie@example,20-07-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
3,Charlie,,New Delhi,M,charlie@example,20-07-2021
4,David,22.0,New Mumbai,M,david@example.com,12-11-2019
5,,28.0,New Delhi,F,eve@domain.com,
6,Alice,25.0,New York,F,alice@example.com,01-05-2021
7,Alice,25.0,New York,F,alice@example.com,01-05-2021
8,Charlie,,New Delhi,M,charlie@example,20-07-2021


<h1>Summary</h1>
<ul>
    <li>Use isnull(), fillna(), dropna() for missing data</li>
    <li>Clean text with .str, convert types with .astype()</li>
    <li>Use apply(), map(), replace() to transform your columns</li>
</ul>



