In [1]:
import pandas as pd
import numpy as np

# Step 1 - Build a small student table that deliberately contains gaps.
# np.nan marks every missing entry: two in Score, two in Attendance and
# one in City.
data = dict(
    Name=["Amit", "Neha", "Rahul", "Priya", "Ankit", "Suresh"],
    Score=[85, 92, np.nan, 95, np.nan, 88],
    Attendance=[90, np.nan, 85, 98, 92, np.nan],
    City=["Mumbai", "Delhi", "Mumbai", np.nan, "Delhi", "Mumbai"],
)
df = pd.DataFrame(data)

print("--- 1. The Dirty Data ---", df, sep="\n")

# Step 2 - Locate the gaps: isnull() flags each missing cell and sum()
# totals those flags per column.
print("\n--- 2. Missing Value Count ---", df.isnull().sum(), sep="\n")

# Step 3 - Pick one replacement value per column.
#   Score      -> mean of the scores that are present (mean() skips NaN).
#   Attendance -> 0, i.e. a missing record is treated as "never attended".
#   City       -> the most frequent city; mode() returns a Series, so the
#                 first entry is taken.
avg_score = df["Score"].mean()
most_common_city = df["City"].mode()[0]

# Step 4 - Apply every replacement in one pass using a column -> value map.
df = df.fillna({"Score": avg_score, "Attendance": 0, "City": most_common_city})

print("\n--- 3. The Cleaned Data ---", df, sep="\n")

--- 1. The Dirty Data ---
     Name  Score  Attendance    City
0    Amit   85.0        90.0  Mumbai
1    Neha   92.0         NaN   Delhi
2   Rahul    NaN        85.0  Mumbai
3   Priya   95.0        98.0     NaN
4   Ankit    NaN        92.0   Delhi
5  Suresh   88.0         NaN  Mumbai

--- 2. Missing Value Count ---
Name          0
Score         2
Attendance    2
City          1
dtype: int64

--- 3. The Cleaned Data ---
     Name  Score  Attendance    City
0    Amit   85.0        90.0  Mumbai
1    Neha   92.0         0.0   Delhi
2   Rahul   90.0        85.0  Mumbai
3   Priya   95.0        98.0  Mumbai
4   Ankit   90.0        92.0   Delhi
5  Suresh   88.0         0.0  Mumbai


In [8]:
import pandas as pd
import numpy as np

# Sample product table; np.nan marks the missing price (apple) and the
# missing quantity (orange).
data = dict(
    product=["apple", "banana", "orange"],
    price=[np.nan, 0.99, 1.49],
    quantity=[10, 5, np.nan],
)

# Build the frame and show it as-is, followed by a 20-dash separator line.
df = pd.DataFrame(data)
print("Original DataFrame:", df, "-" * 20, sep="\n")

# Cleaning, part 1: a missing quantity is replaced with 0. Only the
# 'quantity' column is listed in the mapping, so 'price' keeps its NaN.
df = df.fillna({"quantity": 0})

# Cleaning, part 2: rows that still lack a price are dropped. The result
# is stored separately, so df itself keeps all three rows.
df_clean = df.dropna(subset=["price"])

print("Cleaned DataFrame:", df_clean, sep="\n")




Original DataFrame:
  product  price  quantity
0   apple    NaN      10.0
1  banana   0.99       5.0
2  orange   1.49       NaN
--------------------
Cleaned DataFrame:
  product  price  quantity
1  banana   0.99       5.0
2  orange   1.49       0.0
