[Reference](https://medium.com/@tubelwj/six-advanced-pandas-functions-commonly-used-in-data-analysis-57353cb91193)

# GroupBy

In [1]:
import pandas as pd

# Sample dataframe with U.S. cities etc.
df = pd.DataFrame({
    'City': ['New York', 'Los Angeles', 'New York', 'Los Angeles', 'Chicago', 'New York', 'Chicago', 'Los Angeles'],
    'Category': ['Electronics', 'Clothing', 'Electronics', 'Clothing', 'Electronics', 'Clothing', 'Clothing', 'Electronics'],
    'Sales': [100, 200, 150, 250, 300, 180, 220, 270],
    'Quantity': [2, 5, 3, 7, 6, 4, 8, 9]
})

# Grouping by 'City' and 'Category' and summing up the other columns
grouped = df.groupby(['City', 'Category']).sum()

print(grouped)

# Grouping by 'City' and 'Category' and calculating sum, mean, and max for Sales and Quantity
grouped = df.groupby(['City', 'Category']).agg({
    'Sales': ['sum', 'mean', 'max'],
    'Quantity': ['sum', 'mean', 'max']
})

print(grouped)

                         Sales  Quantity
City        Category                    
Chicago     Clothing       220         8
            Electronics    300         6
Los Angeles Clothing       450        12
            Electronics    270         9
New York    Clothing       180         4
            Electronics    250         5
                        Sales             Quantity         
                          sum   mean  max      sum mean max
City        Category                                       
Chicago     Clothing      220  220.0  220        8  8.0   8
            Electronics   300  300.0  300        6  6.0   6
Los Angeles Clothing      450  225.0  250       12  6.0   7
            Electronics   270  270.0  270        9  9.0   9
New York    Clothing      180  180.0  180        4  4.0   4
            Electronics   250  125.0  150        5  2.5   3


# Multi-Indexing

In [2]:
import pandas as pd
import numpy as np

# Creating a more complex dataframe with multi-index
index = pd.MultiIndex.from_tuples([('New York', 'Q1', 'one'), ('New York', 'Q1', 'two'),
                                   ('Los Angeles', 'Q1', 'one'), ('Los Angeles', 'Q1', 'two'),
                                   ('Chicago', 'Q2', 'one'), ('Chicago', 'Q2', 'two'),
                                   ('New York', 'Q2', 'one'), ('New York', 'Q2', 'two')],
                                  names=['City', 'Quarter', 'ID'])

# Creating random data for Sales, Profit, and Quantity
sales = np.round(np.random.uniform(100, 500, 8), 2)  # Sales between 100 and 500
profit = np.round(np.random.uniform(0, 0.35, 8), 2)     # Profit between 0 and 0.35
quantity = np.random.randint(1, 20, 8)               # Quantity as integers

# Creating the dataframe
df_multi = pd.DataFrame({'Sales': sales, 'Profit(%)': profit, 'Quantity': quantity}, index=index)

print(df_multi)

                          Sales  Profit(%)  Quantity
City        Quarter ID                              
New York    Q1      one  489.06       0.29        15
                    two  336.44       0.33         2
Los Angeles Q1      one  292.79       0.32         6
                    two  116.29       0.03         7
Chicago     Q2      one  219.42       0.15         3
                    two  479.45       0.22         8
New York    Q2      one  111.90       0.26        18
                    two  341.27       0.08         9


# Pivot Tables

In [3]:
import numpy as np
import pandas as pd

# Create a sample DataFrame
data = {
    'City': ['New York', 'Los Angeles', 'New York', 'Los Angeles', 'New York', 'Los Angeles', 'New York', 'Los Angeles'],
    'Quarter': ['Q1', 'Q1', 'Q2', 'Q2', 'Q1', 'Q1', 'Q2', 'Q2'],
    'NumOfHouses': [10, 20, 30, 40, 50, 60, 70, 80],
    'TotalValue': [100, 200, 300, 400, 500, 600, 700, 800]
}

df = pd.DataFrame(data)

# Create a pivot table, grouped by columns A and B, and sum C and D
pivot = pd.pivot_table(df, values=['NumOfHouses', 'TotalValue'], index=['City', 'Quarter'], aggfunc=np.sum)

# Output the result
print(pivot)

                     NumOfHouses  TotalValue
City        Quarter                         
Los Angeles Q1                80         800
            Q2               120        1200
New York    Q1                60         600
            Q2               100        1000


  pivot = pd.pivot_table(df, values=['NumOfHouses', 'TotalValue'], index=['City', 'Quarter'], aggfunc=np.sum)


# Merging, Joining, and Concatenating

In [4]:
import pandas as pd

# Create the first DataFrame
data1 = {'Name': ['Alice', 'Bob', 'Charlie'],
         'Age': [25, 30, 35]}
df1 = pd.DataFrame(data1)

# Create the second DataFrame
data2 = {'Name': ['David', 'Eva', 'Frank'],
         'Age': [40, 45, 50]}
df2 = pd.DataFrame(data2)

# Use pandas.concat() to concatenate the two DataFrames
df_combined = pd.concat([df1, df2], ignore_index=True)

# Output the result
print(df_combined)

      Name  Age
0    Alice   25
1      Bob   30
2  Charlie   35
3    David   40
4      Eva   45
5    Frank   50


In [5]:
import pandas as pd

# Create two sample DataFrames
data1 = {
    'EmployeeID': [1, 2, 3, 4],
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Department': ['HR', 'IT', 'Finance', 'IT']
}

data2 = {
    'EmployeeID': [2, 4, 3, 5],
    'Salary': [70000, 80000, 60000, 90000]
}

# Convert dictionaries to DataFrames
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)

# Display the original DataFrames
print("DataFrame 1:")
print(df1)
print("\nDataFrame 2:")
print(df2)

# Merge the DataFrames on 'EmployeeID' using an inner join
merged_df = pd.merge(df1, df2, on='EmployeeID', how='inner')

# Display the merged DataFrame
print("\nMerged DataFrame (inner join):")
print(merged_df)

# Merge the DataFrames on 'EmployeeID' using a left join
left_merged_df = pd.merge(df1, df2, on='EmployeeID', how='left')

# Display the left-merged DataFrame
print("\nMerged DataFrame (left join):")
print(left_merged_df)

DataFrame 1:
   EmployeeID     Name Department
0           1    Alice         HR
1           2      Bob         IT
2           3  Charlie    Finance
3           4    David         IT

DataFrame 2:
   EmployeeID  Salary
0           2   70000
1           4   80000
2           3   60000
3           5   90000

Merged DataFrame (inner join):
   EmployeeID     Name Department  Salary
0           2      Bob         IT   70000
1           3  Charlie    Finance   60000
2           4    David         IT   80000

Merged DataFrame (left join):
   EmployeeID     Name Department   Salary
0           1    Alice         HR      NaN
1           2      Bob         IT  70000.0
2           3  Charlie    Finance  60000.0
3           4    David         IT  80000.0


In [6]:
import pandas as pd

# Create two sample DataFrames
data1 = {
    'EmployeeID': [1, 2, 3, 4],
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Department': ['HR', 'IT', 'Finance', 'IT']
}

data2 = {
    'EmployeeID': [2, 4, 3, 5],
    'Salary': [70000, 80000, 60000, 90000]
}

# Convert dictionaries to DataFrames
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)

# Display the original DataFrames
print("DataFrame 1:")
print(df1)
print("\nDataFrame 2:")
print(df2)

# Set 'EmployeeID' as the index for both DataFrames to facilitate join
df1.set_index('EmployeeID', inplace=True)
df2.set_index('EmployeeID', inplace=True)

# Perform an inner join on the two DataFrames
joined_df_inner = df1.join(df2, how='inner')

# Display the result of the inner join
print("\nJoined DataFrame (inner join):")
print(joined_df_inner)

# Perform a left join on the two DataFrames
joined_df_left = df1.join(df2, how='left')

# Display the result of the left join
print("\nJoined DataFrame (left join):")
print(joined_df_left)

# Perform a right join on the two DataFrames
joined_df_right = df1.join(df2, how='right')

# Display the result of the right join
print("\nJoined DataFrame (right join):")
print(joined_df_right)

# Perform an outer join on the two DataFrames
joined_df_outer = df1.join(df2, how='outer')

# Display the result of the outer join
print("\nJoined DataFrame (outer join):")
print(joined_df_outer)

DataFrame 1:
   EmployeeID     Name Department
0           1    Alice         HR
1           2      Bob         IT
2           3  Charlie    Finance
3           4    David         IT

DataFrame 2:
   EmployeeID  Salary
0           2   70000
1           4   80000
2           3   60000
3           5   90000

Joined DataFrame (inner join):
               Name Department  Salary
EmployeeID                            
2               Bob         IT   70000
3           Charlie    Finance   60000
4             David         IT   80000

Joined DataFrame (left join):
               Name Department   Salary
EmployeeID                             
1             Alice         HR      NaN
2               Bob         IT  70000.0
3           Charlie    Finance  60000.0
4             David         IT  80000.0

Joined DataFrame (right join):
               Name Department  Salary
EmployeeID                            
2               Bob         IT   70000
4             David         IT   80000
3      

# Using `apply()` and `map()` for Data Transformation

In [7]:
import pandas as pd

# Create a sample DataFrame
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'Salary': [50000, 60000, 70000]
}

df = pd.DataFrame(data)

# Define a function to calculate tax (for example, 10% tax)
def calculate_tax(salary):
    return salary * 0.10

# Apply the function to the 'Salary' column
df['Tax'] = df['Salary'].apply(calculate_tax)

# Display the updated DataFrame
print(df)

      Name  Age  Salary     Tax
0    Alice   25   50000  5000.0
1      Bob   30   60000  6000.0
2  Charlie   35   70000  7000.0


In [8]:
import pandas as pd

# Create a sample DataFrame
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'Salary': [50000, 60000, 70000]
}

df = pd.DataFrame(data)

# Define a function to calculate tax (for example, 10% tax)
def calculate_tax(salary):
    return salary * 0.10

# Apply the function to the 'Salary' column
df['Tax'] = df['Salary'].apply(calculate_tax)

# Display the updated DataFrame
print(df)

      Name  Age  Salary     Tax
0    Alice   25   50000  5000.0
1      Bob   30   60000  6000.0
2  Charlie   35   70000  7000.0


# Query Function

In [9]:
import pandas as pd

# Sample DataFrame
df = pd.DataFrame({
    'X': range(5, 11),
    'Y': range(20, 121, 20),
    'City': ['New York', 'Los Angeles', 'New York', 'Chicago', 'Los Angeles', 'Chicago']
})

# Using query() to filter rows
filtered_df = df.query('(X < 8) & (City == "New York")')

print(filtered_df)

   X   Y      City
0  5  20  New York
2  7  60  New York
