In [2]:
import pandas as pd
df = pd.read_csv('power_gen_data.csv')
df

Unnamed: 0,Station_ID,Station_Name,Region,Date,Energy_MWh,Status,Operator,Peak_Load_MW
0,PS001,Alpha Station,North,2025-07-01,1250.0,Online,GridCorp,320.0
1,PS002,Beta Station,South,2025-07-01,980.0,Maintenance,PowerGen,280.0
2,PS003,Gamma Station,East,2025-07-01,1125.0,Online,GridCorp,310.0
3,PS004,Delta Station,West,2025-07-01,870.0,,PowerGen,
4,PS005,Epsilon Station,North,2025-07-01,1340.0,Online,GridCorp,330.0
5,PS006,Zeta Station,South,2025-07-01,,Online,PowerGen,250.0
6,PS007,Theta Station,East,2025-07-01,0.0,Offline,GridCorp,0.0
7,PS008,Iota Station,West,2025-07-01,1190.0,Online,,300.0
8,PS003,Gamma Station,East,2025-07-01,1125.0,Online,GridCorp,310.0
9,PS004,Delta Station,West,2025-07-01,870.0,,PowerGen,


In [3]:
# 1. Select Key Columns for Reporting
df[['Station_Name', 'Region', 'Energy_MWh', 'Status']]

Unnamed: 0,Station_Name,Region,Energy_MWh,Status
0,Alpha Station,North,1250.0,Online
1,Beta Station,South,980.0,Maintenance
2,Gamma Station,East,1125.0,Online
3,Delta Station,West,870.0,
4,Epsilon Station,North,1340.0,Online
5,Zeta Station,South,,Online
6,Theta Station,East,0.0,Offline
7,Iota Station,West,1190.0,Online
8,Gamma Station,East,1125.0,Online
9,Delta Station,West,870.0,


In [4]:
# 2. Filter by Region and Status
df.query("Status == 'Online' and Region == 'West'")

Unnamed: 0,Station_ID,Station_Name,Region,Date,Energy_MWh,Status,Operator,Peak_Load_MW
7,PS008,Iota Station,West,2025-07-01,1190.0,Online,,300.0


In [5]:
# 3. Detect Zero-Energy Online Stations
# Task: Identify stations with Status == "Online" but Energy_MWh == 0.

df.query("Status == 'Online' and Energy_MWh == 0")

Unnamed: 0,Station_ID,Station_Name,Region,Date,Energy_MWh,Status,Operator,Peak_Load_MW


In [6]:
# 4. Filter with .query()
# Task: Use SQL-like syntax to get all PowerGen stations that produced more than 1000 MWh.

df.query("Energy_MWh > 1000")

Unnamed: 0,Station_ID,Station_Name,Region,Date,Energy_MWh,Status,Operator,Peak_Load_MW
0,PS001,Alpha Station,North,2025-07-01,1250.0,Online,GridCorp,320.0
2,PS003,Gamma Station,East,2025-07-01,1125.0,Online,GridCorp,310.0
4,PS005,Epsilon Station,North,2025-07-01,1340.0,Online,GridCorp,330.0
7,PS008,Iota Station,West,2025-07-01,1190.0,Online,,300.0
8,PS003,Gamma Station,East,2025-07-01,1125.0,Online,GridCorp,310.0


In [7]:
# 5. Select by Position and Label
# Task: Use .iloc[] to retrieve the 2nd–4th rows, and .loc[] to get Alpha Station’s full row.

print(df.iloc[1:4])
df[df.loc[:, 'Station_Name'] == 'Alpha Station']

  Station_ID   Station_Name Region        Date  Energy_MWh       Status  \
1      PS002   Beta Station  South  2025-07-01       980.0  Maintenance   
2      PS003  Gamma Station   East  2025-07-01      1125.0       Online   
3      PS004  Delta Station   West  2025-07-01       870.0          NaN   

   Operator  Peak_Load_MW  
1  PowerGen         280.0  
2  GridCorp         310.0  
3  PowerGen           NaN  


Unnamed: 0,Station_ID,Station_Name,Region,Date,Energy_MWh,Status,Operator,Peak_Load_MW
0,PS001,Alpha Station,North,2025-07-01,1250.0,Online,GridCorp,320.0


In [8]:
# 6. Filter Using .isin()
# Task: Get all stations in either the South or East region.

df[df['Region'].isin(['South', 'East'])]

Unnamed: 0,Station_ID,Station_Name,Region,Date,Energy_MWh,Status,Operator,Peak_Load_MW
1,PS002,Beta Station,South,2025-07-01,980.0,Maintenance,PowerGen,280.0
2,PS003,Gamma Station,East,2025-07-01,1125.0,Online,GridCorp,310.0
5,PS006,Zeta Station,South,2025-07-01,,Online,PowerGen,250.0
6,PS007,Theta Station,East,2025-07-01,0.0,Offline,GridCorp,0.0
8,PS003,Gamma Station,East,2025-07-01,1125.0,Online,GridCorp,310.0


In [9]:
# 7. Find Stations with Missing Peak Load
# Task: Select all rows where Peak_Load_MW is missing.

df[pd.isnull(df['Peak_Load_MW'])]

Unnamed: 0,Station_ID,Station_Name,Region,Date,Energy_MWh,Status,Operator,Peak_Load_MW
3,PS004,Delta Station,West,2025-07-01,870.0,,PowerGen,
9,PS004,Delta Station,West,2025-07-01,870.0,,PowerGen,


In [10]:
# 8. Filter by Efficiency (New Column)
# Task: First create an Efficiency column, then select stations with efficiency > 4.0.

df['Efficiency'] = df['Energy_MWh'] / df['Peak_Load_MW']
df.query("Efficiency > 4")

Unnamed: 0,Station_ID,Station_Name,Region,Date,Energy_MWh,Status,Operator,Peak_Load_MW,Efficiency
4,PS005,Epsilon Station,North,2025-07-01,1340.0,Online,GridCorp,330.0,4.060606


In [11]:
# 9. Detect Duplicates by Station_ID
# Task: Find all rows that appear as duplicates based on Station_ID.

df[df.duplicated('Station_ID', keep = False)]

Unnamed: 0,Station_ID,Station_Name,Region,Date,Energy_MWh,Status,Operator,Peak_Load_MW,Efficiency
2,PS003,Gamma Station,East,2025-07-01,1125.0,Online,GridCorp,310.0,3.629032
3,PS004,Delta Station,West,2025-07-01,870.0,,PowerGen,,
8,PS003,Gamma Station,East,2025-07-01,1125.0,Online,GridCorp,310.0,3.629032
9,PS004,Delta Station,West,2025-07-01,870.0,,PowerGen,,


In [12]:
# 10. Text-Based Filter on Station Names
# Task: Select all stations whose names contain the letter “a” (case-insensitive).

df[df['Station_Name'].str.contains('a', 'A')]

Unnamed: 0,Station_ID,Station_Name,Region,Date,Energy_MWh,Status,Operator,Peak_Load_MW,Efficiency
0,PS001,Alpha Station,North,2025-07-01,1250.0,Online,GridCorp,320.0,3.90625
1,PS002,Beta Station,South,2025-07-01,980.0,Maintenance,PowerGen,280.0,3.5
2,PS003,Gamma Station,East,2025-07-01,1125.0,Online,GridCorp,310.0,3.629032
3,PS004,Delta Station,West,2025-07-01,870.0,,PowerGen,,
4,PS005,Epsilon Station,North,2025-07-01,1340.0,Online,GridCorp,330.0,4.060606
5,PS006,Zeta Station,South,2025-07-01,,Online,PowerGen,250.0,
6,PS007,Theta Station,East,2025-07-01,0.0,Offline,GridCorp,0.0,
7,PS008,Iota Station,West,2025-07-01,1190.0,Online,,300.0,3.966667
8,PS003,Gamma Station,East,2025-07-01,1125.0,Online,GridCorp,310.0,3.629032
9,PS004,Delta Station,West,2025-07-01,870.0,,PowerGen,,


In [13]:
# 11. Select Stations with Null or Blank Operator
# Task: Filter all stations where Operator is null or empty.

df[df['Operator'].isna()]

Unnamed: 0,Station_ID,Station_Name,Region,Date,Energy_MWh,Status,Operator,Peak_Load_MW,Efficiency
7,PS008,Iota Station,West,2025-07-01,1190.0,Online,,300.0,3.966667


In [14]:
# 12. Filter Stations Without an Assigned Status
# Task: Identify all stations where the Status field is missing (NaN) or blank.

df[df['Status'].isna()]

Unnamed: 0,Station_ID,Station_Name,Region,Date,Energy_MWh,Status,Operator,Peak_Load_MW,Efficiency
3,PS004,Delta Station,West,2025-07-01,870.0,,PowerGen,,
9,PS004,Delta Station,West,2025-07-01,870.0,,PowerGen,,


In [15]:
# 13. Select High-Energy Output Stations in East or North
# Task: Get all stations in the East or North with Energy_MWh > 1200.

df[(df['Energy_MWh'] > 1200) & df['Region'].isin(['East', 'North'])]

Unnamed: 0,Station_ID,Station_Name,Region,Date,Energy_MWh,Status,Operator,Peak_Load_MW,Efficiency
0,PS001,Alpha Station,North,2025-07-01,1250.0,Online,GridCorp,320.0,3.90625
4,PS005,Epsilon Station,North,2025-07-01,1340.0,Online,GridCorp,330.0,4.060606


In [16]:
# 14. Find Stations with the Word “Station” Missing in Name
# Task: Detect any rows where 'Station_Name' does not contain 'Station'.

df[df['Station_Name'].str.contains('Station')==False]

Unnamed: 0,Station_ID,Station_Name,Region,Date,Energy_MWh,Status,Operator,Peak_Load_MW,Efficiency


In [17]:
# 15. Filter by Partial Match in Operator
# Task: Select all rows where 'Operator' contains 'Grid' (case insensitive)

df[df['Operator'].str.contains("Grid", na=False, case=False)]

Unnamed: 0,Station_ID,Station_Name,Region,Date,Energy_MWh,Status,Operator,Peak_Load_MW,Efficiency
0,PS001,Alpha Station,North,2025-07-01,1250.0,Online,GridCorp,320.0,3.90625
2,PS003,Gamma Station,East,2025-07-01,1125.0,Online,GridCorp,310.0,3.629032
4,PS005,Epsilon Station,North,2025-07-01,1340.0,Online,GridCorp,330.0,4.060606
6,PS007,Theta Station,East,2025-07-01,0.0,Offline,GridCorp,0.0,
8,PS003,Gamma Station,East,2025-07-01,1125.0,Online,GridCorp,310.0,3.629032


In [18]:
# 16. Slice Last 3 Rows Using .iloc[]
# Task: Select the final 3 rows of the DataFrame.

df.iloc[-3:]

Unnamed: 0,Station_ID,Station_Name,Region,Date,Energy_MWh,Status,Operator,Peak_Load_MW,Efficiency
7,PS008,Iota Station,West,2025-07-01,1190.0,Online,,300.0,3.966667
8,PS003,Gamma Station,East,2025-07-01,1125.0,Online,GridCorp,310.0,3.629032
9,PS004,Delta Station,West,2025-07-01,870.0,,PowerGen,,


In [19]:
# 17. Select Top 5 Stations by Energy Output
# Task: Return the top 5 highest energy-producing stations.

df.nlargest(5, 'Energy_MWh')

Unnamed: 0,Station_ID,Station_Name,Region,Date,Energy_MWh,Status,Operator,Peak_Load_MW,Efficiency
4,PS005,Epsilon Station,North,2025-07-01,1340.0,Online,GridCorp,330.0,4.060606
0,PS001,Alpha Station,North,2025-07-01,1250.0,Online,GridCorp,320.0,3.90625
7,PS008,Iota Station,West,2025-07-01,1190.0,Online,,300.0,3.966667
2,PS003,Gamma Station,East,2025-07-01,1125.0,Online,GridCorp,310.0,3.629032
8,PS003,Gamma Station,East,2025-07-01,1125.0,Online,GridCorp,310.0,3.629032


In [20]:
# 18. Select Stations with Efficiency in a Safe Band
# Task: First calculate Efficiency, then filter between 3.0 and 5.0.

df[df['Efficiency'].between(3,5)]

Unnamed: 0,Station_ID,Station_Name,Region,Date,Energy_MWh,Status,Operator,Peak_Load_MW,Efficiency
0,PS001,Alpha Station,North,2025-07-01,1250.0,Online,GridCorp,320.0,3.90625
1,PS002,Beta Station,South,2025-07-01,980.0,Maintenance,PowerGen,280.0,3.5
2,PS003,Gamma Station,East,2025-07-01,1125.0,Online,GridCorp,310.0,3.629032
4,PS005,Epsilon Station,North,2025-07-01,1340.0,Online,GridCorp,330.0,4.060606
7,PS008,Iota Station,West,2025-07-01,1190.0,Online,,300.0,3.966667
8,PS003,Gamma Station,East,2025-07-01,1125.0,Online,GridCorp,310.0,3.629032


In [21]:
# 19. Multi-Column Condition with np.where
# Task: Flag "Under Maintenance" where Status is "Maintenance" and energy < 1000.

# Method 1
# import numpy as np
# df['Alert'] = np.where((df['Status'] == 'Maintenance') & (df['Energy_MWh'] < 1000)), 'Under Maintenance', '')
# df

# Method 2
df.loc[((df['Status'] == 'Maintenance') & (df['Energy_MWh'] < 1000)), 'Status'] = 'Under Maintenance'
df

Unnamed: 0,Station_ID,Station_Name,Region,Date,Energy_MWh,Status,Operator,Peak_Load_MW,Efficiency
0,PS001,Alpha Station,North,2025-07-01,1250.0,Online,GridCorp,320.0,3.90625
1,PS002,Beta Station,South,2025-07-01,980.0,Under Maintenance,PowerGen,280.0,3.5
2,PS003,Gamma Station,East,2025-07-01,1125.0,Online,GridCorp,310.0,3.629032
3,PS004,Delta Station,West,2025-07-01,870.0,,PowerGen,,
4,PS005,Epsilon Station,North,2025-07-01,1340.0,Online,GridCorp,330.0,4.060606
5,PS006,Zeta Station,South,2025-07-01,,Online,PowerGen,250.0,
6,PS007,Theta Station,East,2025-07-01,0.0,Offline,GridCorp,0.0,
7,PS008,Iota Station,West,2025-07-01,1190.0,Online,,300.0,3.966667
8,PS003,Gamma Station,East,2025-07-01,1125.0,Online,GridCorp,310.0,3.629032
9,PS004,Delta Station,West,2025-07-01,870.0,,PowerGen,,


In [30]:
# 20. Select Columns Dynamically Using .filter()
# Task: Select all columns that contain "Load" or "Energy" in their name.

df.filter(regex='Load|Energy')

Unnamed: 0,Energy_MWh,Peak_Load_MW
0,1250.0,320.0
1,980.0,280.0
2,1125.0,310.0
3,870.0,
4,1340.0,330.0
5,,250.0
6,0.0,0.0
7,1190.0,300.0
8,1125.0,310.0
9,870.0,


In [24]:
# 21. Filter Stations with Null or Blank IDs (Critical Validation)
# Task: Identify rows with missing or malformed Station_ID.

df[df['Station_ID'].isnull()]

Unnamed: 0,Station_ID,Station_Name,Region,Date,Energy_MWh,Status,Operator,Peak_Load_MW,Efficiency
