### Prepared by Abhishek Kumar
### https://www.linkedin.com/in/abhishek-kumar-442337b2/


In [1]:
# To get multiple outputs in the same cell:
# display the result of every top-level expression in a cell, not only the last one.

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Import the required libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Wide to Long DataFrame
One record becomes many records, keyed on an ID column.

```py
1. df.melt(id_vars=[ ], value_vars=[ ], var_name=[ ], value_name=[ ])
2. pd.wide_to_long(df, i=[ ], j=[ ], stubnames=[ ], sep="_")
# stubnames provides the flexibility to add multiple sets of series of variables
```
    Apply reset_index() to flatten out the indices and make the result more usable.


## df.melt()

In [3]:
# Wide sample frame: one row per id, one column per premium period.
df = pd.DataFrame(
    dict(
        id=[1, 2],
        name=['a', 'b'],
        prem1=[100, 280],
        prem2=[200, 180],
        prem3=[300, 80],
    )
)
df

Unnamed: 0,id,name,prem1,prem2,prem3
0,1,a,100,200,300
1,2,b,280,180,80


In [4]:
# Unpivot every column except the identifiers; with no names given, the new
# columns are called 'variable' and 'value'. Sorting by id groups each id's rows.
df_melted = pd.melt(df, id_vars=['id', 'name'])
df_melted = df_melted.sort_values(by='id')
df_melted

Unnamed: 0,id,name,variable,value
0,1,a,prem1,100
2,1,a,prem2,200
4,1,a,prem3,300
1,2,b,prem1,280
3,2,b,prem2,180
5,2,b,prem3,80


In [5]:
# Same layout as the first sample, but with missing premiums.
# np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0.
df2 = pd.DataFrame({'id': [1, 2],
                    'name': ['a', 'b'],
                    'prem1': [100, 280],
                    'prem2': [np.nan, 180],
                    'prem3': [300, np.nan]})
df2

Unnamed: 0,id,name,prem1,prem2,prem3
0,1,a,100,,300.0
1,2,b,280,180.0,


In [6]:
# Same unpivot as before, with explicit names for the label and value columns.
df2_melted = (
    pd.melt(df2, id_vars=['id', 'name'], var_name='month', value_name='premiums')
    .sort_values(by='id')
)
df2_melted

Unnamed: 0,id,name,month,premiums
0,1,a,prem1,100.0
2,1,a,prem2,
4,1,a,prem3,300.0
1,2,b,prem1,280.0
3,2,b,prem2,180.0
5,2,b,prem3,


In [7]:
# TODO(review): unfinished experiment - the .loc[] selector below was never filled in.
# df2_melted = df2_melted.loc[]

In [8]:
df3 = df2.copy()

# Only 'id' is kept as an identifier and the columns to unpivot are listed
# explicitly, so 'name' does not appear in the long frame.
df3_melted = pd.melt(
    df3,
    id_vars=['id'],
    value_vars=['prem1', 'prem2', 'prem3'],
    var_name='month',
    value_name='premiums',
)
df3_melted = df3_melted.sort_values(by='id')
df3_melted

Unnamed: 0,id,month,premiums
0,1,prem1,100.0
2,1,prem2,
4,1,prem3,300.0
1,2,prem1,280.0
3,2,prem2,180.0
5,2,prem3,


### Example 2

In [9]:
# Setup : DataFrame creation
# One list per employee, in the order given by columns_name below.
# Missing DOB / Salary values use np.nan (the np.NaN alias was removed in NumPy 2.0).

salary = [['1','Abhishek Kumar','AIML', 'Machine Learning Engineer','M', 'Y', '04051990', 1121000],
          ['2','Arjun Kumar','DM', 'Tech Lead','M', 'Y', '09031992', 109000],
          ['3','Vivek Raj','DM', 'Devops Engineer','M', 'N', np.nan , 827000],
          ['4','Mika Singh','DM', 'Data Analyst','F', 'Y', '15101991',  np.nan],
          ['5','Anusha Yenduri','AIML', 'Data Scientist','F', 'Y', '01011989',  921000],
          ['6','Ritesh Srivastava','AIML', 'Data Engineer','M', 'Y', np.nan, 785000]]

columns_name=['Emp_Id','Emp_Name','Department','Role','Gender', 'WFH Status', 'DOB', 'Salary']

emp_df = pd.DataFrame(salary,columns=columns_name)
emp_df

Unnamed: 0,Emp_Id,Emp_Name,Department,Role,Gender,WFH Status,DOB,Salary
0,1,Abhishek Kumar,AIML,Machine Learning Engineer,M,Y,4051990.0,1121000.0
1,2,Arjun Kumar,DM,Tech Lead,M,Y,9031992.0,109000.0
2,3,Vivek Raj,DM,Devops Engineer,M,N,,827000.0
3,4,Mika Singh,DM,Data Analyst,F,Y,15101991.0,
4,5,Anusha Yenduri,AIML,Data Scientist,F,Y,1011989.0,921000.0
5,6,Ritesh Srivastava,AIML,Data Engineer,M,Y,,785000.0


In [10]:
# Sample data set-up
# Work on a copy of emp_df and derive three bonus columns as fixed
# fractions of Salary (5%, 7.5% and 10%).

emp_df_1 = emp_df.copy().assign(
    Holi_Bonus=lambda frame: frame['Salary'] * 0.05,
    Diwali_Bonus=lambda frame: frame['Salary'] * 0.075,
    Yearly_Bonus=lambda frame: frame['Salary'] * 0.10,
)
emp_df_1

Unnamed: 0,Emp_Id,Emp_Name,Department,Role,Gender,WFH Status,DOB,Salary,Holi_Bonus,Diwali_Bonus,Yearly_Bonus
0,1,Abhishek Kumar,AIML,Machine Learning Engineer,M,Y,4051990.0,1121000.0,56050.0,84075.0,112100.0
1,2,Arjun Kumar,DM,Tech Lead,M,Y,9031992.0,109000.0,5450.0,8175.0,10900.0
2,3,Vivek Raj,DM,Devops Engineer,M,N,,827000.0,41350.0,62025.0,82700.0
3,4,Mika Singh,DM,Data Analyst,F,Y,15101991.0,,,,
4,5,Anusha Yenduri,AIML,Data Scientist,F,Y,1011989.0,921000.0,46050.0,69075.0,92100.0
5,6,Ritesh Srivastava,AIML,Data Engineer,M,Y,,785000.0,39250.0,58875.0,78500.0


In [11]:
# Unpivot the three bonus columns: one row per (employee, event) pair.
bonus_columns = ['Holi_Bonus', 'Diwali_Bonus', 'Yearly_Bonus']
emp_df_1_long = pd.melt(
    emp_df_1,
    id_vars=['Emp_Id', 'Emp_Name'],
    value_vars=bonus_columns,
    var_name='Event',
    value_name='Bonus',
)
emp_df_1_long

Unnamed: 0,Emp_Id,Emp_Name,Event,Bonus
0,1,Abhishek Kumar,Holi_Bonus,56050.0
1,2,Arjun Kumar,Holi_Bonus,5450.0
2,3,Vivek Raj,Holi_Bonus,41350.0
3,4,Mika Singh,Holi_Bonus,
4,5,Anusha Yenduri,Holi_Bonus,46050.0
5,6,Ritesh Srivastava,Holi_Bonus,39250.0
6,1,Abhishek Kumar,Diwali_Bonus,84075.0
7,2,Arjun Kumar,Diwali_Bonus,8175.0
8,3,Vivek Raj,Diwali_Bonus,62025.0
9,4,Mika Singh,Diwali_Bonus,


## pd.wide_to_long()

In [12]:
# Two parallel sets of numbered columns (prem1-3 and disc1-3) with missing values.
# np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0.
df4 = pd.DataFrame({'id': [1, 2],
                    'name': ['a', 'b'],
                    'prem1': [100, 280],
                    'prem2': [np.nan, 180],
                    'prem3': [300, np.nan],
                    'disc1': [20, 40],
                    'disc2': [np.nan, 30],
                    'disc3': [50, np.nan]})
df4

Unnamed: 0,id,name,prem1,prem2,prem3,disc1,disc2,disc3
0,1,a,100,,300.0,20,,50.0
1,2,b,280,180.0,,40,30.0,


In [13]:
# melt() alone does not produce the desired layout for this frame.
# There are 2 sets of sequential columns (prem1-3 and disc1-3) and melt() puts both
# sets into the same value column instead of one column per set.
# The attempt is kept commented out for reference:

# df4_melted = df4.melt(id_vars=['id','name'], value_vars=['prem1','prem2','prem3','disc1','disc2','disc3'], var_name = 'month', value_name = 'values').sort_values('id').reset_index(drop='index')
# df4_melted

#### Another way to transform is to use the wide_to_long() panel data convenience function. It is less flexible than melt(), but more user-friendly.

In [14]:
# Each stub ('prem', 'disc') gathers its numbered columns into a single column;
# the numeric suffix becomes the 'month' level of the resulting row index.
df4_melted1 = pd.wide_to_long(
    df4,
    stubnames=['prem', 'disc'],
    i=['id', 'name'],
    j='month',
)
df4_melted1

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,prem,disc
id,name,month,Unnamed: 3_level_1,Unnamed: 4_level_1
1,a,1,100.0,20.0
1,a,2,,
1,a,3,300.0,50.0
2,b,1,280.0,40.0
2,b,2,180.0,30.0
2,b,3,,


In [15]:
# Flatten the (id, name, month) row MultiIndex into ordinary columns.
# Rebinding avoids mutating the frame in place, and the guard keeps the cell
# safe to re-run: resetting an already-flat frame would add a stray 'index' column.
if df4_melted1.index.nlevels > 1:
    df4_melted1 = df4_melted1.reset_index()
df4_melted1

Unnamed: 0,id,name,month,prem,disc
0,1,a,1,100.0,20.0
1,1,a,2,,
2,1,a,3,300.0,50.0
3,2,b,1,280.0,40.0
4,2,b,2,180.0,30.0
5,2,b,3,,


In [16]:
# Trying to see the usage of the suffix= parameter. Not completed yet.
# NOTE(review): per the pandas docs, `suffix` is a regular expression describing the part of the
# column name after the stub - confirm before finishing this experiment.
# df4_melted2 = pd.wide_to_long(df4, i=['id','name'], j='month', stubnames=['prem','disc'])#, suffix='1')
# df4_melted2

## df.stack()

In [17]:
# Single set of numbered columns (prem1-3) with missing values, used for stack().
# np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0.
df5 = pd.DataFrame({'id': [1, 2],
                    'name': ['a', 'b'],
                    'prem1': [100, 280],
                    'prem2': [np.nan, 180],
                    'prem3': [300, np.nan]})
df5

Unnamed: 0,id,name,prem1,prem2,prem3
0,1,a,100,,300.0
1,2,b,280,180.0,


In [18]:
# Identifiers go to the index first, so stack() only moves the premium columns into rows.
(
    df5
    .set_index(['id', 'name'])
    .stack()
    .reset_index()
)

Unnamed: 0,id,name,level_2,0
0,1,a,prem1,100.0
1,1,a,prem3,300.0
2,2,b,prem1,280.0
3,2,b,prem2,180.0


    > 1. Important thing to note - there is a single series of variables (prem1 - prem3), which is transposed here.
    > 2. The index is set before the process of stacking.
    > 3. If there are multiple sets of series of variables, then this would not work as expected.
    > 4. By default, dropna = True, and hence it drops the NaN values.

In [19]:
# Same stacking as above, but dropna=False keeps the rows whose premium is NaN.
(
    df5
    .set_index(['id', 'name'])
    .stack(dropna=False)
    .reset_index()
)

Unnamed: 0,id,name,level_2,0
0,1,a,prem1,100.0
1,1,a,prem2,
2,1,a,prem3,300.0
3,2,b,prem1,280.0
4,2,b,prem2,180.0
5,2,b,prem3,


In [20]:
# Two parallel sets of numbered columns (prem1-3 and disc1-3), used to show the limits of stack().
# np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0.
df6 = pd.DataFrame({'id': [1, 2],
                    'name': ['a', 'b'],
                    'prem1': [100, 280],
                    'prem2': [np.nan, 180],
                    'prem3': [300, np.nan],
                    'disc1': [20, 40],
                    'disc2': [np.nan, 30],
                    'disc3': [50, np.nan]})
df6

Unnamed: 0,id,name,prem1,prem2,prem3,disc1,disc2,disc3
0,1,a,100,,300.0,20,,50.0
1,2,b,280,180.0,,40,30.0,


In [21]:
# Stack every non-identifier column; prem* and disc* values end up in the same column.
df6_stacked = (
    df6
    .set_index(['id', 'name'])
    .stack()
    .reset_index()
)
df6_stacked

Unnamed: 0,id,name,level_2,0
0,1,a,prem1,100.0
1,1,a,prem3,300.0
2,1,a,disc1,20.0
3,1,a,disc3,50.0
4,2,b,prem1,280.0
5,2,b,prem2,180.0
6,2,b,disc1,40.0
7,2,b,disc2,30.0


In [22]:
# stack() alone does not produce the desired layout for this frame.
# There are 2 sets of sequential columns (prem1-3 and disc1-3) and stack() puts both
# sets into the same value column instead of one column per set.

# Long to Wide DataFrame
Multiple records per ID become a single record for each ID.

```python
1. pd.pivot()
2. pd.pivot_table()
3. Use df.set_index([id_vars columns and var_name columns]) and chain it with .unstack(level=2 (here))
```

### pd.pivot() - Accepts a list for `index` only from pandas 1.1.0 onwards; on older versions it does not work for multiple index columns, so it is not used in this case

### pd.pivot_table() - Although it is for aggregation, it worked to change LONG to WIDE Data

In [23]:
# Long-format input for the pivot examples (the flattened wide_to_long() result).
df4_melted1

Unnamed: 0,id,name,month,prem,disc
0,1,a,1,100.0,20.0
1,1,a,2,,
2,1,a,3,300.0,50.0
3,2,b,1,280.0,40.0
4,2,b,2,180.0,30.0
5,2,b,3,,


In [24]:
# One row per (id, name); each month becomes a column under its measure (disc / prem).
df_wide = df4_melted1.pivot_table(
    index=['id', 'name'],
    columns='month',
    values=['prem', 'disc'],
)
df_wide

Unnamed: 0_level_0,Unnamed: 1_level_0,disc,disc,disc,prem,prem,prem
Unnamed: 0_level_1,month,1,2,3,1,2,3
id,name,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
1,a,20.0,,50.0,100.0,,300.0
2,b,40.0,30.0,,280.0,180.0,


In [25]:
# Inspect the column labels: pivot_table() produced a two-level (measure, month) MultiIndex.
df_wide.columns

MultiIndex([('disc', 1),
            ('disc', 2),
            ('disc', 3),
            ('prem', 1),
            ('prem', 2),
            ('prem', 3)],
           names=[None, 'month'])

In [26]:
# Commented-out attempt at the same reshape with DataFrame.pivot() (prem values only).
# NOTE(review): passing a list to `index` appears to need a newer pandas - confirm before re-enabling.
# df_wide = df4_melted1.pivot(index=['id','name'], columns='month', values=['prem'])
# df_wide

In [27]:
# Collapse the two-level (measure, month) column labels into flat names such as 'disc_1'.
# Guarded so the cell can be re-run: on already-flat labels each `tup` would be a plain
# string and the join would split it into characters ('disc_1' -> 'd_i_s_c___1').
if df_wide.columns.nlevels > 1:
    df_wide.columns = ['_'.join(map(str, tup)) for tup in df_wide.columns]
df_wide.reset_index()

Unnamed: 0,id,name,disc_1,disc_2,disc_3,prem_1,prem_2,prem_3
0,1,a,20.0,,50.0,100.0,,300.0
1,2,b,40.0,30.0,,280.0,180.0,


### df.unstack() - 
#### Use df.set_index([id_vars columns and var_name columns]) and chain it with .unstack(level=2 (here))

In [28]:
# Put id, name and month into the index, then move the month level (level 2) into the columns.
wide_df = (
    df4_melted1
    .set_index(['id', 'name', 'month'])
    .unstack(level=2)
)
wide_df

Unnamed: 0_level_0,Unnamed: 1_level_0,prem,prem,prem,disc,disc,disc
Unnamed: 0_level_1,month,1,2,3,1,2,3
id,name,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
1,a,100.0,,300.0,20.0,,50.0
2,b,280.0,180.0,,40.0,30.0,


Index levels in this example: id: level = 0; name: level = 1; month: level = 2 (the level that unstack moves into the columns).

In [29]:
# Inspect the column labels: unstack() produced a two-level (measure, month) MultiIndex.
wide_df.columns

MultiIndex([('prem', 1),
            ('prem', 2),
            ('prem', 3),
            ('disc', 1),
            ('disc', 2),
            ('disc', 3)],
           names=[None, 'month'])

In [30]:
# Flatten the two-level column labels by joining each (measure, month) tuple with '_'.
# The result has the same values as the original wide frame, with names like 'prem_1'
# instead of 'prem1' and id/name still in the row index.
# Guarded so the cell can be re-run: on already-flat labels each `tup` would be a plain
# string and the join would split it into characters ('prem_1' -> 'p_r_e_m___1').

if wide_df.columns.nlevels > 1:
    wide_df.columns = ['_'.join(map(str, tup)) for tup in wide_df.columns]

In [31]:
# Confirm the column labels are now flat strings.
wide_df.columns

Index(['prem_1', 'prem_2', 'prem_3', 'disc_1', 'disc_2', 'disc_3'], dtype='object')

In [32]:
# The id/name pair is still the row index at this point.
wide_df

Unnamed: 0_level_0,Unnamed: 1_level_0,prem_1,prem_2,prem_3,disc_1,disc_2,disc_3
id,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,a,100.0,,300.0,20.0,,50.0
2,b,280.0,180.0,,40.0,30.0,


In [33]:
# Move id and name out of the index into regular columns.
# The result is only displayed; it is not assigned back to wide_df.
wide_df.reset_index()

Unnamed: 0,id,name,prem_1,prem_2,prem_3,disc_1,disc_2,disc_3
0,1,a,100.0,,300.0,20.0,,50.0
1,2,b,280.0,180.0,,40.0,30.0,


### Example 2

In [34]:
# Long-format bonus data (one row per employee and event); input for the pivot_table() examples.
emp_df_1_long

Unnamed: 0,Emp_Id,Emp_Name,Event,Bonus
0,1,Abhishek Kumar,Holi_Bonus,56050.0
1,2,Arjun Kumar,Holi_Bonus,5450.0
2,3,Vivek Raj,Holi_Bonus,41350.0
3,4,Mika Singh,Holi_Bonus,
4,5,Anusha Yenduri,Holi_Bonus,46050.0
5,6,Ritesh Srivastava,Holi_Bonus,39250.0
6,1,Abhishek Kumar,Diwali_Bonus,84075.0
7,2,Arjun Kumar,Diwali_Bonus,8175.0
8,3,Vivek Raj,Diwali_Bonus,62025.0
9,4,Mika Singh,Diwali_Bonus,


In [35]:
# Back to wide: one row per employee, one column per bonus event.
# No aggfunc is passed, so pivot_table() uses its default.
emp_df_1_wide_1 = pd.pivot_table(
    emp_df_1_long,
    index=['Emp_Id', 'Emp_Name'],
    columns='Event',
    values='Bonus',
).reset_index()
emp_df_1_wide_1

Event,Emp_Id,Emp_Name,Diwali_Bonus,Holi_Bonus,Yearly_Bonus
0,1,Abhishek Kumar,84075.0,56050.0,112100.0
1,2,Arjun Kumar,8175.0,5450.0,10900.0
2,3,Vivek Raj,62025.0,41350.0,82700.0
3,5,Anusha Yenduri,69075.0,46050.0,92100.0
4,6,Ritesh Srivastava,58875.0,39250.0,78500.0


In [36]:
# Same pivot with margins=True, which appends an 'All' row and an 'All' column.
# default aggfunc = 'mean'
emp_df_1_wide_2 = pd.pivot_table(
    emp_df_1_long,
    index=['Emp_Id', 'Emp_Name'],
    columns='Event',
    values='Bonus',
    margins=True,
).reset_index()
emp_df_1_wide_2

Event,Emp_Id,Emp_Name,Diwali_Bonus,Holi_Bonus,Yearly_Bonus,All
0,1,Abhishek Kumar,84075.0,56050.0,112100.0,84075.0
1,2,Arjun Kumar,8175.0,5450.0,10900.0,8175.0
2,3,Vivek Raj,62025.0,41350.0,82700.0,62025.0
3,5,Anusha Yenduri,69075.0,46050.0,92100.0,69075.0
4,6,Ritesh Srivastava,58875.0,39250.0,78500.0,58875.0
5,All,,56445.0,37630.0,75260.0,56445.0


In [37]:
# Margins again, but aggregating with 'sum' so the 'All' entries are totals.
emp_df_1_wide_3 = pd.pivot_table(
    emp_df_1_long,
    index=['Emp_Id', 'Emp_Name'],
    columns='Event',
    values='Bonus',
    aggfunc='sum',
    margins=True,
).reset_index()
emp_df_1_wide_3

Event,Emp_Id,Emp_Name,Diwali_Bonus,Holi_Bonus,Yearly_Bonus,All
0,1,Abhishek Kumar,84075.0,56050.0,112100.0,252225.0
1,2,Arjun Kumar,8175.0,5450.0,10900.0,24525.0
2,3,Vivek Raj,62025.0,41350.0,82700.0,186075.0
3,4,Mika Singh,0.0,0.0,0.0,
4,5,Anusha Yenduri,69075.0,46050.0,92100.0,207225.0
5,6,Ritesh Srivastava,58875.0,39250.0,78500.0,176625.0
6,All,,282225.0,188150.0,376300.0,846675.0


In [38]:
# Only row-wise aggregation: no `columns` argument, so each employee gets one
# aggregated Bonus value (default aggfunc = 'mean').
# `values` is given explicitly so the text column 'Event' is never passed to the
# aggregation; pandas >= 2.0 raises instead of silently dropping such columns.

emp_df_1_wide_4 = emp_df_1_long.pivot_table(index=['Emp_Id', 'Emp_Name'],
                                            values='Bonus')
emp_df_1_wide_4

Unnamed: 0_level_0,Unnamed: 1_level_0,Bonus
Emp_Id,Emp_Name,Unnamed: 2_level_1
1,Abhishek Kumar,84075.0
2,Arjun Kumar,8175.0
3,Vivek Raj,62025.0
5,Anusha Yenduri,69075.0
6,Ritesh Srivastava,58875.0


In [39]:
# Pivot with fill_value=1000 as the replacement for missing cells in the result.
# NOTE(review): the displayed output has no row for Emp_Id 4, so the fill value
# does not appear to be used for this data - confirm.
emp_df_1_wide_4 = pd.pivot_table(
    emp_df_1_long,
    index=['Emp_Id', 'Emp_Name'],
    columns='Event',
    values='Bonus',
    fill_value=1000,
)
emp_df_1_wide_4

Unnamed: 0_level_0,Event,Diwali_Bonus,Holi_Bonus,Yearly_Bonus
Emp_Id,Emp_Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Abhishek Kumar,84075,56050,112100
2,Arjun Kumar,8175,5450,10900
3,Vivek Raj,62025,41350,82700
5,Anusha Yenduri,69075,46050,92100
6,Ritesh Srivastava,58875,39250,78500


### There are other techniques that enable re-shaping of DataFrames.

    i. pivot()
    ii. stack() & unstack()
    iii. wide_to_long()
    iv. crosstab()
    v. cut()