In [1]:
import numpy as np
import pandas as pd

# Data Filtering

# 1. Filtering with Basic Conditions
Basic conditional filtering allows you to create subsets by setting conditions on columns.

Example:
In a sales dataset, you may want to filter only the rows where Revenue is above a certain amount.

In [None]:
# Sample sales data: one row per product with its revenue.
data = dict(
    Product=["A", "B", "C", "D"],
    Revenue=[500, 1500, 200, 2500],
)
df = pd.DataFrame(data)

In [None]:
# Build a boolean mask that is True where Revenue exceeds 1000,
# then use it to select the matching rows.
revenue_mask = df["Revenue"] > 1000
high_revenue = df[revenue_mask]
print(high_revenue)

# 2. Filtering with Multiple Conditions (AND / OR)
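You can combine multiple conditions with the element-wise logical operators & (AND) and | (OR). Wrap each condition in parentheses, because & and | bind more tightly than comparison operators such as == and >.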

Example:
For a dataset on employees, you might want to filter employees who are from a specific department and have been with the company for more than five years.

In [None]:
# Sample employee records: ID, department, and years of service.
data = dict(
    EmployeeID=[101, 102, 103, 104],
    Department=["Sales", "IT", "Sales", "HR"],
    YearsOfService=[2, 6, 7, 3],
)
df = pd.DataFrame.from_dict(data)

In [None]:
# Keep employees who are in Sales AND have more than 5 years of service.
in_sales = df["Department"] == "Sales"
over_five_years = df["YearsOfService"] > 5
senior_sales = df[in_sales & over_five_years]
print(senior_sales)

# 3. Filtering Using .query()
The .query() method allows filtering with simple, SQL-like syntax, making it very readable.

Example:
In a restaurant's dataset, you might want to filter orders for customers who ordered Spaghetti and paid more than $20.

In [None]:
# Sample restaurant orders: order ID, item ordered, and amount paid.
data = dict(
    OrderID=[1, 2, 3, 4],
    Item=["Pizza", "Spaghetti", "Spaghetti", "Salad"],
    Amount=[15, 22, 18, 12],
)
df = pd.DataFrame(data=data)

In [None]:
# Express the filter as a query string: Spaghetti orders with Amount over 20.
order_filter = "Item == 'Spaghetti' and Amount > 20"
spaghetti_orders = df.query(order_filter)
print(spaghetti_orders)

# 4. Filtering Using .isin()
The .isin() method filters rows where a column's value is in a specified list, useful for multiple matches.

Example:
In a retail inventory dataset, you may want to filter for specific product categories to analyze stock levels.

In [2]:
# Inventory sample: each product ID with its category.
data = dict(
    ProductID=[1, 2, 3, 4],
    Category=["Electronics", "Furniture", "Electronics", "Clothing"],
)
df = pd.DataFrame(data)

# Keep only the rows whose Category is one of the wanted values.
wanted_categories = ["Electronics", "Furniture"]
category_mask = df["Category"].isin(wanted_categories)
selected_categories = df[category_mask]
print(selected_categories)


   ProductID     Category
0          1  Electronics
1          2    Furniture
2          3  Electronics


# 5. Filtering Using String Methods
Pandas allows you to filter based on partial string matches with .str.contains() and .str.startswith().

Example:
In a dataset of book titles, you might want to filter for all titles containing a specific keyword, like “Python.”

In [4]:
# Sample book catalogue: book ID and title.
data = dict(
    BookID=[1, 2, 3, 4],
    Title=["Learning Python", "Data Science 101", "Python for Beginners", "Machine Learning"],
)
df = pd.DataFrame.from_dict(data)

In [7]:
# Filter for titles containing "Python" (case-insensitive because case=False).
# regex=False treats the keyword as literal text rather than a regular
# expression, and na=False counts a missing title as "no match" so the mask
# stays strictly boolean and can be used for row selection.
title_mask = df["Title"].str.contains("Python", case=False, regex=False, na=False)
python_books = df[title_mask]
print(python_books)

   BookID                 Title
0       1       Learning Python
2       3  Python for Beginners


# 6. Filtering with .between()
The .between() method is useful for filtering values within a specific range. By default, both endpoints are included.

Example:
For an educational dataset, filter students within a certain age range, like 18 to 22 years.

In [10]:
# Sample student records: student ID and age.
data = dict(
    StudentID=[1, 2, 3, 4, 5, 6],
    Age=[18, 17, 19, 22, 24, 20],
)
df = pd.DataFrame(data=data)

In [11]:
# Select students whose Age is from 18 through 22.
age_mask = df["Age"].between(18, 22)
college_age_students = df[age_mask]
print(college_age_students)

   StudentID  Age
0          1   18
2          3   19
3          4   22
5          6   20


# 7. Filtering with .apply()
With .apply(), you can apply custom functions to filter based on complex conditions.

Example:
In a dataset of employee salaries, filter employees who earn above the mean salary.

In [None]:
# Sample payroll data: employee ID and salary.
data = dict(
    EmployeeID=[101, 102, 103, 104],
    Salary=[50000, 70000, 45000, 60000],
)
df = pd.DataFrame.from_dict(data)
# Show the full table before filtering.
print(df)

In [None]:
# Average of the Salary column; the next cell uses it as the threshold.
mean_salary = df["Salary"].mean()
print(mean_salary)

In [None]:
# Use .apply() to test each salary against the mean, producing a boolean mask.
# For a plain comparison like this one, df["Salary"] > mean_salary builds the
# same mask; .apply() is for conditions that need a custom function.
above_mean = df["Salary"].apply(lambda salary: salary > mean_salary)
high_earners = df[above_mean]
print(high_earners)

# 8. Filtering Missing Values Using .isna() and .notna()
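You can use .isna() to keep the rows where a value is missing, or .notna() to keep the rows where it is present.

Example:
In a survey dataset, you might want to find the respondents who did not provide contact details.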

In [None]:
# Sample survey respondents; None marks a value that was not provided.
data = dict(
    RespondentID=[1, 2, 3],
    Name=["Alice", "Bob", None],
    Contact=[None, "bob@gmail.com", "charlie@yahoo.com"],
)
df = pd.DataFrame(data)

In [None]:
# Select the rows that have no value in 'Contact'.
no_contact_mask = df["Contact"].isna()
missing_contact = df[no_contact_mask]
print(missing_contact)

# 9. Filtering Using .nlargest() and .nsmallest()
Use .nlargest() and .nsmallest() to get the rows with the N largest or smallest values in a column.

Example:
In a sports analytics dataset, find the top 3 players based on scores.

In [None]:
# Sample player scores: player ID and score.
data = dict(
    PlayerID=[1, 2, 3, 4],
    Score=[88, 92, 95, 80],
)
df = pd.DataFrame(data=data)

In [None]:
# The three rows with the highest Score, highest first.
top_scorers = df.nlargest(n=3, columns="Score")
print(top_scorers)