Generating Data

In [None]:
import pandas as pd
import numpy as np

# Demo dataset: six employee records. The last two salaries are deliberately
# np.nan so the frame contains missing values, as real-world data often does.
data = dict(
    Name=["Sara", "Omar", "Lina", "Sami", "Ali", "Noor"],
    Age=[22, 25, 23, 24, 30, 22],
    Department=["IT", "HR", "IT", "Marketing", "IT", "HR"],
    Salary=[2000, 2300, 2100, 1900, np.nan, np.nan],
)

# Each dict key becomes a column; each list supplies that column's values.
df = pd.DataFrame(data)

print("--- Original DataFrame ---")
# display() gives the notebook's rich table rendering instead of plain text.
display(df)

Exploration

In [None]:
# Peek at both ends of the table: the first two rows, then the last two.
print("--- 1. Top & Bottom Rows ---")
display(df.head(2), df.tail(2))

# info() prints each column's dtype and non-null count, which makes it the
# quickest way to spot columns that contain missing values.
print("\n--- 2. Dataset Info ---")
df.info()

# describe() reports summary statistics (count, mean, std, min, quartiles, max).
print("\n--- 3. Statistical Summary ---")
summary_stats = df.describe()
display(summary_stats)

Selection

In [None]:
# Three ways of selecting data from `df`. Each result is displayed so the
# shape and type of what comes back can be compared directly.

# A. Single column: bracket indexing with one column label returns a Series.
salaries = df["Salary"]

print("--- Single Column (Series) ---")
display(salaries)

# B. Label-based selection (.loc): the row whose index label is 0, restricted
#    to the "Name" and "Department" columns. A single row comes back as a Series.
row_label = df.loc[0, ["Name", "Department"]]

print("--- Label Selection (.loc) ---")
display(row_label)

# C. Position-based selection (.iloc): slice ends are exclusive, so 1:3 picks
#    the rows at positions 1 and 2, and 0:2 picks the columns at positions 0 and 1.
subset = df.iloc[1:3, 0:2]

print("--- Subset Selection (.iloc) ---")
display(subset)

Filtering & Cleaning

In [None]:
# 1. Boolean-mask filter: keep the rows whose Age is strictly greater than 23.
is_over_23 = df["Age"] > 23
senior_employees = df[is_over_23]

print("--- Employees Older than 23 ---")
display(senior_employees)

# 2. Missing values (NaN) in the Salary column
print("\n--- Cleaning Data ---")

# Report how many rows lack a salary before anything is removed.
salary_is_missing = df["Salary"].isna()
missing = df[salary_is_missing]
print(f"Found {len(missing)} rows with missing salary.")

# Remove those rows. dropna(subset=...) only looks at the listed columns and
# returns a new frame, so `df` itself is left untouched.
# An equivalent spelling is a boolean filter: df[df["Salary"].notna()]
df_clean = df.dropna(subset=["Salary"])

print("--- Final Cleaned DataFrame ---")
display(df_clean)