### libs

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

### data

In [2]:
import seaborn as sns

# Load the Titanic sample and keep only the columns the demos below rely on.
columns_to_keep = ['survived', 'who', 'age', 'fare']
df = sns.load_dataset('titanic')[columns_to_keep]
print(f"{df.shape = }")
print(df.head().to_string())

df.shape = (891, 4)
   survived    who   age     fare
0         0    man  22.0   7.2500
1         1  woman  38.0  71.2833
2         1  woman  26.0   7.9250
3         1  woman  35.0  53.1000
4         0    man  35.0   8.0500


### Different return types from `groupby().apply()`


In [7]:
print("Scalar return:")
# The callable yields one scalar per group, so apply() assembles a Series
# with one entry per `who` value.
result1 = (
    df.groupby('who')
    .apply(lambda group: len(group), include_groups=False)
)
print(f"Type: {type(result1)}, Shape: {result1.shape}")
result1

Scalar return:
Type: <class 'pandas.core.series.Series'>, Shape: (3,)


who
child     83
man      537
woman    271
dtype: int64

In [8]:
print("\nSeries return (your case):")
# Each group returns a Series that keeps the group's original row labels, so
# apply() stacks them into one Series indexed by (who, original row label).
result2 = (
    df.groupby('who')
    .apply(
        lambda group: pd.Series(np.arange(len(group)), index=group.index),
        include_groups=False,
    )
)
print(f"Type: {type(result2)}, Index levels: {result2.index.nlevels}")
result2


Series return (your case):
Type: <class 'pandas.core.series.Series'>, Index levels: 2


who       
child  7        0
       9        1
       10       2
       14       3
       16       4
             ... 
woman  880    266
       882    267
       885    268
       887    269
       888    270
Length: 891, dtype: int64

In [9]:
print("\nDataFrame return:")
# Each group returns its first two rows of age/fare; apply() concatenates
# them into a DataFrame indexed by (who, original row label).
result3 = (
    df.groupby('who')
    .apply(lambda group: group[['age', 'fare']].head(2), include_groups=False)
)
print(f"Type: {type(result3)}, Index levels: {result3.index.nlevels}")
result3


DataFrame return:
Type: <class 'pandas.core.frame.DataFrame'>, Index levels: 2


Unnamed: 0_level_0,Unnamed: 1_level_0,age,fare
who,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
child,7,2.0,21.075
child,9,14.0,30.0708
man,0,22.0,7.25
man,4,35.0,8.05
woman,1,38.0,71.2833
woman,2,26.0,7.925


In [3]:
import pandas as pd

# Sample file-size labels: a number followed by a K/M/G unit suffix
file_sizes = ["1G", "500M", "750M", "100K", "50K"]

# Wrap them in a single-column frame. NOTE: this rebinds `df`, so the Titanic
# frame loaded earlier is no longer reachable under that name.
df = pd.DataFrame(file_sizes, columns=["size"])

# Plain-text dump of the frame
print(df)

   size
0    1G
1  500M
2  750M
3  100K
4   50K


In [4]:
def parse_size(size_str):
    """Convert a size label such as ``"500M"`` into a plain number.

    The last character selects a decimal multiplier (``K`` = 1e3, ``M`` = 1e6,
    ``G`` = 1e9); everything before it is parsed with ``float``.

    Args:
        size_str: Label made of a numeric part followed by one unit character.

    Returns:
        The numeric part multiplied by the unit's factor, as a float.

    Raises:
        ValueError: If the part before the last character is not a valid
            float literal (this includes the empty string).
        KeyError: If the last character is not one of ``K``, ``M``, ``G``.
        TypeError: If ``size_str`` does not support slicing (e.g. a float).
    """
    multipliers = {"G": 1e9, "M": 1e6, "K": 1e3}
    # Parse the number first so a bad numeric part is reported before a bad unit.
    magnitude = float(size_str[:-1])
    return magnitude * multipliers[size_str[-1]]


def handle_size(size_str):
    """Best-effort wrapper around ``parse_size``.

    Args:
        size_str: Value to parse; normally a label such as ``"500M"``.

    Returns:
        Whatever ``parse_size(size_str)`` returns, or ``None`` when the input
        cannot be parsed (invalid number, unknown unit suffix, empty string,
        or a non-string value such as ``None`` or a float).

    Only exceptions that signal bad input are caught. A bare ``except:`` would
    also swallow ``KeyboardInterrupt``, ``SystemExit`` and genuine programming
    errors (for example a ``NameError``), hiding them behind a ``None`` result.
    """
    try:
        return parse_size(size_str)
    except (ValueError, KeyError, IndexError, TypeError):
        return None


# Parse the size labels into numbers, then derive megabytes from them.
# Only values that are still strings are parsed. An unconditional apply makes
# this cell non-idempotent: on a second run the already-parsed floats reach
# handle_size, which returns None for them and silently wipes the column.
# Non-string values pass through unchanged, so re-running the cell is safe.
df["size"] = df["size"].apply(
    lambda value: handle_size(value) if isinstance(value, str) else value
)
df["size_mb"] = df["size"] / 1e6

In [5]:
# Show the frame after parsing: `size` holds the parsed values, `size_mb` is `size / 1e6`.
df

Unnamed: 0,size,size_mb
0,1000000000.0,1000.0
1,500000000.0,500.0
2,750000000.0,750.0
3,100000.0,0.1
4,50000.0,0.05
