####  Data Filtering

In [3]:
import pandas as pd

In [5]:
# Create the DataFrame 
data = {
    'name': ['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola'],
    'continent': ['Asia', 'Europe', 'Africa', 'Europe', 'Africa'],
    'area': [652230, 28748, 2381741, 468, 1246700],
    'population': [25500100, 2831741, 37100000, 78115, 20609294],
    'gdp': [20343000000, 12960000000, 188681000000, 3712000000, 100990000000]
}
world_df = pd.DataFrame(data)

In [7]:
world_df

Unnamed: 0,name,continent,area,population,gdp
0,Afghanistan,Asia,652230,25500100,20343000000
1,Albania,Europe,28748,2831741,12960000000
2,Algeria,Africa,2381741,37100000,188681000000
3,Andorra,Europe,468,78115,3712000000
4,Angola,Africa,1246700,20609294,100990000000


In [9]:
# Condition 1: area of at least 3,000,000 km^2
condition_area = world_df['area'] >= 3000000
# Condition 2: population of at least 25,000,000
condition_population = world_df['population'] >= 25000000

# Combine the conditions using OR operator
big_countries_df = world_df[condition_area | condition_population]

In [11]:
big_countries_df

Unnamed: 0,name,continent,area,population,gdp
0,Afghanistan,Asia,652230,25500100,20343000000
2,Algeria,Africa,2381741,37100000,188681000000


In [13]:
# Select the required columns: name, population, and area
result = big_countries_df[['name', 'population', 'area']]

In [19]:
# Print the result
print("Big Countries:")
print(result) 

Big Countries:
          name  population     area
0  Afghanistan    25500100   652230
2      Algeria    37100000  2381741


In [21]:
print("\n--- Additional Queries ---")


--- Additional Queries ---


In [28]:
# Query 1: Find countries in Africa
africa_countries = world_df[world_df['continent'] == 'Africa']  

# world_df['continent'] == 'Africa' creates a boolean Series (a column of True or False values). 
# For each row in world_df, it checks if the value in the 'continent' column is exactly 'Africa'. 
# If it is, the result for that row is True; otherwise, it's False.
 
# When you use this boolean Series inside the square brackets [] of a DataFrame (world_df[...]), pandas uses it as a filter. 
# It selects only those rows where the corresponding boolean value is True.
 
print("\nCountries in Africa:")
print(africa_countries[['name', 'population', 'area']])


Countries in Africa:
      name  population     area
2  Algeria    37100000  2381741
4   Angola    20609294  1246700


In [36]:
# Query 2: Calculate the average population and area
average_population = world_df['population'].mean()
average_area = world_df['area'].mean()
print(f"\nAverage Population: {average_population:,.0f}")   

# {average_population:,.0f}: This is an f-string expression that embeds the value of the average_population variable into the string.

print(f"Average Area: {average_area:,.0f} km^2")


Average Population: 17,223,850
Average Area: 861,977 km^2


In [38]:
# Query 3: Find countries with GDP greater than 50,000,000,000
rich_countries = world_df[world_df['gdp'] > 50000000000]
print("\nCountries with GDP > 50 Billion:")
print(rich_countries[['name', 'gdp']])


Countries with GDP > 50 Billion:
      name           gdp
2  Algeria  188681000000
4   Angola  100990000000


In [13]:
# Query 4: Sort countries by population in descending order
sorted_by_population = world_df.sort_values(by='population', ascending=False)

# world_df.sort_values(): This is a pandas DataFrame method used to sort the DataFrame by one or more columns.
# by='population': This argument specifies that the sorting should be performed based on the values in the 'population' column.

print("\nCountries sorted by Population (Descending):")
print(sorted_by_population[['name', 'population']])


Countries sorted by Population (Descending):
          name  population
2      Algeria    37100000
0  Afghanistan    25500100
4       Angola    20609294
1      Albania     2831741
3      Andorra       78115


In [17]:
# Query 5: Find the country with the largest area
largest_area_country = world_df.loc[world_df['area'].idxmax()]

# .idxmax(): This is a pandas Series method. When called on a Series (like world_df['area']), 
# it returns the index of the maximum value in that Series. So, it tells you the row label (index) where the largest area is located.
# world_df.loc[...]: This is pandas' label-based indexer. It's used to access a group of rows and columns by label(s) or a boolean array.

print("\nCountry with the Largest Area:")
print(largest_area_country[['name', 'area']])


Country with the Largest Area:
name    Algeria
area    2381741
Name: 2, dtype: object


------------------------------

##### Second table

In [5]:
customers_data = {
    'id': [1, 2, 3, 4],
    'name': ['Ashu', 'Hema', 'Sameer', 'Manish']
}
customers_df = pd.DataFrame(customers_data)

orders_data = {
    'id': [1, 2],
    'customerId': [3, 1]
}
orders_df = pd.DataFrame(orders_data)

In [7]:

print("Customers DataFrame:")
print(customers_df)
print("\nOrders DataFrame:")
print(orders_df)

Customers DataFrame:
   id    name
0   1    Ashu
1   2    Hema
2   3  Sameer
3   4  Manish

Orders DataFrame:
   id  customerId
0   1           3
1   2           1


##### --- Common Queries ---

In [31]:
# 1. Get all customers: 

print("\n1. All Customers:\n")
print(customers_df)


1. All Customers:

   id    name
0   1    Ashu
1   2    Hema
2   3  Sameer
3   4  Manish


In [33]:
# 2. Get all orders:
print("\n2. All Orders:\n")
print(orders_df)  


2. All Orders:

   id  customerId
0   1           3
1   2           1


In [35]:
# 3. Get customers who have placed an order (INNER JOIN):

print("\n3. Customers who have placed an order (INNER JOIN):\n")
customers_with_orders = pd.merge(customers_df, orders_df, left_on='id', right_on='customerId', how='inner')
print(customers_with_orders)


3. Customers who have placed an order (INNER JOIN):

   id_x    name  id_y  customerId
0     1    Ashu     2           1
1     3  Sameer     1           3


In [37]:
# 4. Get customers and their order details (LEFT JOIN - all customers, matching orders)
print("\n4. Customers and their order details (LEFT JOIN):")
customers_left_join_orders = pd.merge(customers_df, orders_df, left_on='id', right_on='customerId', how='inner')
print(customers_left_join_orders)


4. Customers and their order details (LEFT JOIN):
   id_x    name  id_y  customerId
0     1    Ashu     2           1
1     3  Sameer     1           3
