In [1]:
# Show the result of every top-level expression in a cell, not just the last one
# (several cells below display a DataFrame both before and after modifying it)

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Import the required libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
# Wide-format sample frame: one row per id, with numbered `prem*` / `disc*` columns.
# Missing entries use `np.nan`; the `np.NaN` alias was removed in NumPy 2.0 and
# raises AttributeError there, while `np.nan` works on every NumPy version.
df1 = pd.DataFrame({
    'id': [1, 2],
    'name': ['a', 'b'],
    'prem1': [100, 280],
    'prem2': [np.nan, 180],
    'prem3': [300, np.nan],
    'disc1': [20, 40],
    'disc2': [np.nan, 30],
    'disc3': [50, np.nan],
})
df1

Unnamed: 0,id,name,prem1,prem2,prem3,disc1,disc2,disc3
0,1,a,100,,300.0,20,,50.0
1,2,b,280,180.0,,40,30.0,


In [4]:
# Reshape wide -> long: the numbered prem1..3 / disc1..3 columns collapse into
# single `prem` and `disc` columns, and the numeric suffix becomes the `month` level.
df1_melted = pd.wide_to_long(
    df1,
    stubnames=['prem', 'disc'],
    i=['id', 'name'],
    j='month',
)

# Move the (id, name, month) index levels back into ordinary columns.
df_long = df1_melted.reset_index()

In [5]:
# Show the reshaped long-format frame: one row per (id, name, month) combination
df_long

Unnamed: 0,id,name,month,prem,disc
0,1,a,1,100.0,20.0
1,1,a,2,,
2,1,a,3,300.0,50.0
3,2,b,1,280.0,40.0
4,2,b,2,180.0,30.0
5,2,b,3,,


In [6]:
# Per-id minimum of every other column; missing values are skipped.
# Each column is reduced independently, so a result row need not match any single
# input row (id 2 shows month 1 next to prem 180.0, which comes from month 2).
df_long.groupby('id').min()

Unnamed: 0_level_0,name,month,prem,disc
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,a,1,100.0,20.0
2,b,1,180.0,30.0


In [7]:
# Per-id maximum of every other column; missing values are skipped.
# Columns are reduced independently (id 2 shows month 3 next to prem 280.0,
# which comes from month 1).
df_long.groupby('id').max()

Unnamed: 0_level_0,name,month,prem,disc
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,a,3,300.0,50.0
2,b,3,280.0,40.0


In [8]:
# First value of each column within each id, taken in the frame's current row order;
# the result is indexed by id (one row per group).
df_long.groupby('id').first()

Unnamed: 0_level_0,name,month,prem,disc
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,a,1,100.0,20.0
2,b,1,280.0,40.0


In [9]:
# Last non-missing value of each column within each id.
# id 2 reports month 3 but prem 180.0 / disc 30.0: its month-3 values are NaN,
# so those two columns fall back to the month-2 entries.
df_long.groupby('id').last()

Unnamed: 0_level_0,name,month,prem,disc
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,a,3,300.0,50.0
2,b,3,180.0,30.0


In [10]:
# First two rows of each id, returned as whole rows with their original index.
# Missing values are kept as-is (row 1 appears with NaN prem/disc).
df_long.groupby('id').head(2)

Unnamed: 0,id,name,month,prem,disc
0,1,a,1,100.0,20.0
1,1,a,2,,
3,2,b,1,280.0,40.0
4,2,b,2,180.0,30.0


In [11]:
# Last row of each id, returned as a whole row with its original index.
# Unlike last() above, NaN is not skipped: id 2's row has missing prem/disc.
df_long.groupby('id').tail(1)

Unnamed: 0,id,name,month,prem,disc
2,1,a,3,300.0,50.0
5,2,b,3,,


In [12]:
# Order rows by id, then by ascending prem. Rows with a missing prem end up last
# within each id, which drives the head()/tail() results in the next two cells.
df_long2 = df_long.sort_values(['id','prem'])

In [13]:
# After the sort, the first two rows per id are the two smallest prem values of that id
df_long2.groupby('id').head(2)

Unnamed: 0,id,name,month,prem,disc
0,1,a,1,100.0,20.0
2,1,a,3,300.0,50.0
4,2,b,2,180.0,30.0
3,2,b,1,280.0,40.0


In [14]:
# Last row per id after the sort. Because rows with missing prem sort last, this
# returns the NaN rows (index 1 and 5), not the row holding each id's largest prem.
df_long2.groupby('id').tail(1)

Unnamed: 0,id,name,month,prem,disc
1,1,a,2,,
5,2,b,3,,


## Flagging each group's maximum with `transform`

In [15]:
# Flag, for every id, the row(s) whose prem equals that id's maximum prem.
# transform('max') broadcasts each group's maximum back onto the group's rows, so the
# comparison is one vectorised operation instead of a Python lambda run per group.
# Rows with a missing prem never compare equal and are therefore flagged False.
df_long['flag'] = df_long['prem'].eq(df_long.groupby('id')['prem'].transform('max'))
df_long

Unnamed: 0,id,name,month,prem,disc,flag
0,1,a,1,100.0,20.0,False
1,1,a,2,,,False
2,1,a,3,300.0,50.0,True
3,2,b,1,280.0,40.0,True
4,2,b,2,180.0,30.0,False
5,2,b,3,,,False


In [16]:
# Display df_long again; it now carries the boolean `flag` column added in the previous cell
df_long

Unnamed: 0,id,name,month,prem,disc,flag
0,1,a,1,100.0,20.0,False
1,1,a,2,,,False
2,1,a,3,300.0,50.0,True
3,2,b,1,280.0,40.0,True
4,2,b,2,180.0,30.0,False
5,2,b,3,,,False


In [17]:
# Summarise dtypes and non-null counts: prem and disc each hold 4 non-null values
# out of 6 rows, and the new flag column is stored as bool
df_long.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 6 columns):
id       6 non-null int64
name     6 non-null object
month    6 non-null int64
prem     4 non-null float64
disc     4 non-null float64
flag     6 non-null bool
dtypes: bool(1), float64(2), int64(2), object(1)
memory usage: 374.0+ bytes


Reference on the `transform` function: <https://www.analyticsvidhya.com/blog/2020/03/understanding-transform-function-python/>

## Creating running totals with the `cumsum` function

In [20]:
# Sample sales log: one row per sale, with the sold amount stored in "item".
# NOTE(review): the saved output of this cell shows an extra `running_total_by_` column
# that nothing here creates, and the execution counter jumps from 17 to 20 —
# restart the kernel and re-run so the stored output matches the code.
d = {
    "salesperson": ["Nico", "Carlos", "Juan", "Nico", "Nico", "Juan", "Maria", "Carlos"],
    "item": [10, 120, 130, 200, 300, 550, 12.3, 200],
}
df = pd.DataFrame(d)
df

amounts = df["item"]
# Cumulative sum over all rows, in row order
df["running_total"] = amounts.cumsum()
# Cumulative sum restarted for every salesperson, still aligned to the original rows
df["running_total_by_person"] = amounts.groupby(df["salesperson"]).cumsum()
df

Unnamed: 0,salesperson,item
0,Nico,10.0
1,Carlos,120.0
2,Juan,130.0
3,Nico,200.0
4,Nico,300.0
5,Juan,550.0
6,Maria,12.3
7,Carlos,200.0


Unnamed: 0,salesperson,item,running_total,running_total_by_person,running_total_by_
0,Nico,10.0,10.0,10.0,10.0
1,Carlos,120.0,130.0,120.0,120.0
2,Juan,130.0,260.0,130.0,130.0
3,Nico,200.0,460.0,210.0,200.0
4,Nico,300.0,760.0,510.0,300.0
5,Juan,550.0,1310.0,680.0,550.0
6,Maria,12.3,1322.3,12.3,12.3
7,Carlos,200.0,1522.3,320.0,200.0


## Calculating a running count within groups using `cumcount() + 1`

In [21]:
# Sample sales log; note the inconsistent casing in the "item" column ("cAr")
d = {
    "salesperson": ["Nico", "Carlos", "Juan", "Nico", "Nico", "Juan", "Maria", "Carlos"],
    "item": ["Car", "Truck", "Car", "Truck", "cAr", "Car", "Truck", "Moto"],
}
df = pd.DataFrame(d)
df

# Normalise both text columns to Title Case so "cAr" and "Car" land in the same group
for column in ("salesperson", "item"):
    df[column] = df[column].str.title()

# cumcount() numbers the rows of each group from 0 in row order;
# adding 1 turns that into a 1-based running count
df["count_by_person"] = df.groupby("salesperson").cumcount().add(1)
df["count_by_item"] = df.groupby("item").cumcount().add(1)
df["count_by_both"] = df.groupby(["salesperson", "item"]).cumcount().add(1)
df

Unnamed: 0,salesperson,item
0,Nico,Car
1,Carlos,Truck
2,Juan,Car
3,Nico,Truck
4,Nico,cAr
5,Juan,Car
6,Maria,Truck
7,Carlos,Moto


Unnamed: 0,salesperson,item,count_by_person,count_by_item,count_by_both
0,Nico,Car,1,1,1
1,Carlos,Truck,1,1,1
2,Juan,Car,1,2,1
3,Nico,Truck,2,2,1
4,Nico,Car,3,3,2
5,Juan,Car,2,4,2
6,Maria,Truck,1,3,1
7,Carlos,Moto,2,1,1
