[Reference](https://towardsdatascience.com/5-useful-tips-for-exploratory-data-analysis-using-pandas-in-python-7c05808c9408)

In [1]:
def missing_pct(df):
    # Calculate percentage of missing for each column
    s_missing = df.isnull().sum() * 100 / df.shape[0]
    # Convert the series back to data frame
    df_missing = pd.DataFrame(s_missing).round(2)
    # Reset and rename the index
    df_missing = df_missing.reset_index().rename(
                    columns={
                            'index':'Column',
                            0:'Missing_Percentage (%)'
                    }
                )
    # Sort the data frame
    df_missing = df_missing.sort_values('Missing_Percentage (%)', ascending=False)
    return df_missing

In [7]:
missing_pct(df)

Unnamed: 0,Column,Missing_Percentage (%)
0,Category,0.0
1,Value,0.0
2,rank,0.0


In [2]:
def find_max_in_group(df, group_col, val_col, tie_for_first=False):
    # Decide ranking method
    if tie_for_first:
        rank_method = 'min'
    else:
        rank_method = 'first'
    # Add rank number for each group
    df["rank"] = df.groupby(group_col)[val_col].rank(method=rank_method, ascending=False)
    # Only return rank == 1
    return df[df['rank'] == 1].drop(['rank'], axis=1)

In [4]:
import pandas as pd
df = pd.DataFrame({'Category':['a','a','b','b','b','c'], 'Value':[3,2,5,8,8,6]})
df

Unnamed: 0,Category,Value
0,a,3
1,a,2
2,b,5
3,b,8
4,b,8
5,c,6


In [5]:
find_max_in_group(df, 'Category', 'Value', True)

Unnamed: 0,Category,Value
0,a,3
3,b,8
4,b,8
5,c,6


In [6]:
find_max_in_group(df, 'Category', 'Value', False)

Unnamed: 0,Category,Value
0,a,3
3,b,8
5,c,6


In [8]:
df = pd.DataFrame({'cate':['a','b','c'],'values':[[10,20], [20,30], [30,40]]})

In [9]:
df['values'].apply(pd.Series)

Unnamed: 0,0,1
0,10,20
1,20,30
2,30,40


In [10]:
def expand_collection_type_column(df, target_col, col_names=None):
    # Expand the list into multiple columns
    df_expand = df[target_col].apply(pd.Series)
    # Concatenate the expanded data frame with the original one
    df = pd.concat([df, df_expand], axis=1).drop(['values'], axis=1)
    # Rename columns if passed in
    if col_names:
        df = df.rename(columns={index:name for index, name in enumerate(col_names)})
    return df

In [12]:
import numpy as np

In [13]:
np.random.seed(0)
assessment = pd.DataFrame(
    np.random.normal(loc=70, scale=10, size=5*4).reshape(5, -1),
    columns=['Term 1 Math', 'Term 2 Math', 'Term 1 English', 'Term 2 English'],
    index=['Alice', 'Bob', 'Chris', 'David', 'Emily']
).astype(int)

assessment

Unnamed: 0,Term 1 Math,Term 2 Math,Term 1 English,Term 2 English
Alice,87,74,79,92
Bob,88,60,79,68
Chris,68,74,71,84
David,77,71,74,73
Emily,84,67,73,61


In [14]:
mapping = {
    'Term 1 Math':'Math', 
    'Term 2 Math':'Math', 
    'Term 1 English':'English',
    'Term 2 English':'English'
}
agg_results = assessment.groupby(mapping, axis=1)
agg_results.mean()

Unnamed: 0,English,Math
Alice,85.5,80.5
Bob,73.5,74.0
Chris,77.5,71.0
David,73.5,74.0
Emily,67.0,75.5
