In [1]:
# To get multiple outputs in the same cell

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Import the required libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Wide to Long DataFrame
One record to many records based on a ID column

```py
1. df.melt(id_vars=[ ], value_vars=[ ], var_name=[ ], value_name=[ ])
2. pd.wide_to_long(df, i=[ ], j=[ ], stubnames=[ ], sep="_") 
# stubnames provides the flexibility to add the multiple sets of series of variables```
    apply reset_index() to flatten out the indices and make the it more usable.


### df.melt()

In [3]:
df = pd.DataFrame({'id': [1,2],
                   'name': ['a','b'],
                   'prem1' : [100,280],
                   'prem2' : [200,180],
                   'prem3' : [300,80],})
df

Unnamed: 0,id,name,prem1,prem2,prem3
0,1,a,100,200,300
1,2,b,280,180,80


In [4]:
df_melted = df.melt(id_vars=['id','name']).sort_values('id')
df_melted

Unnamed: 0,id,name,variable,value
0,1,a,prem1,100
2,1,a,prem2,200
4,1,a,prem3,300
1,2,b,prem1,280
3,2,b,prem2,180
5,2,b,prem3,80


In [5]:
df2 = pd.DataFrame({'id': [1,2],
                   'name': ['a','b'],
                   'prem1' : [100,280],
                   'prem2' : [np.NaN,180],
                   'prem3' : [300,np.NaN],})
df2

Unnamed: 0,id,name,prem1,prem2,prem3
0,1,a,100,,300.0
1,2,b,280,180.0,


In [6]:
df2_melted = df2.melt(id_vars=['id','name'], var_name = 'month', value_name = 'premiums').sort_values('id')
df2_melted

Unnamed: 0,id,name,month,premiums
0,1,a,prem1,100.0
2,1,a,prem2,
4,1,a,prem3,300.0
1,2,b,prem1,280.0
3,2,b,prem2,180.0
5,2,b,prem3,


In [7]:
# df2_melted = df2_melted.loc[]

In [8]:
df3 = df2.copy()

df3_melted = df3.melt(id_vars=['id'], value_vars=['prem1','prem2','prem3'], var_name = 'month', value_name = 'premiums').sort_values('id')
df3_melted

Unnamed: 0,id,month,premiums
0,1,prem1,100.0
2,1,prem2,
4,1,prem3,300.0
1,2,prem1,280.0
3,2,prem2,180.0
5,2,prem3,


### pd.wide_to_long()

In [9]:
df4 = pd.DataFrame({'id': [1,2],
                   'name': ['a','b'],
                   'prem1' : [100,280],
                   'prem2' : [np.NaN,180],
                   'prem3' : [300,np.NaN],
                   'disc1' : [20,40],
                   'disc2' : [np.NaN,30],
                   'disc3' : [50,np.NaN],})
df4

Unnamed: 0,id,name,prem1,prem2,prem3,disc1,disc2,disc3
0,1,a,100,,300.0,20,,50.0
1,2,b,280,180.0,,40,30.0,


In [25]:
# melt is not working as expected.
# There are 2 sets of sequential columns and both the sets are transposed to the same column
# NOT Working as EXPECTED

# df4_melted = df4.melt(id_vars=['id','name'], value_vars=['prem1','prem2','prem3','disc1','disc2','disc3'], var_name = 'month', value_name = 'values').sort_values('id').reset_index(drop='index')
# df4_melted

#### Another way to transform is to use the wide_to_long() panel data convenience function. It is less flexible than melt(), but more user-friendly.

In [11]:
df4_melted1 = pd.wide_to_long(df4, i=['id','name'], j='month', stubnames=['prem','disc'])
df4_melted1

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,prem,disc
id,name,month,Unnamed: 3_level_1,Unnamed: 4_level_1
1,a,1,100.0,20.0
1,a,2,,
1,a,3,300.0,50.0
2,b,1,280.0,40.0
2,b,2,180.0,30.0
2,b,3,,


In [12]:
df4_melted1.reset_index(inplace=True)
df4_melted1

Unnamed: 0,id,name,month,prem,disc
0,1,a,1,100.0,20.0
1,1,a,2,,
2,1,a,3,300.0,50.0
3,2,b,1,280.0,40.0
4,2,b,2,180.0,30.0
5,2,b,3,,


In [13]:
# Trying to see the usage of suffix= parameter. Not completed yet.
# df4_melted2 = pd.wide_to_long(df4, i=['id','name'], j='month', stubnames=['prem','disc'])#, suffix='1')
# df4_melted2

### df.stack()

In [14]:
df5 = pd.DataFrame({'id': [1,2],
                   'name': ['a','b'],
                   'prem1' : [100,280],
                   'prem2' : [np.NaN,180],
                   'prem3' : [300,np.NaN]})
df5

Unnamed: 0,id,name,prem1,prem2,prem3
0,1,a,100,,300.0
1,2,b,280,180.0,


In [24]:
df5.set_index(['id','name']).stack().reset_index()

Unnamed: 0,id,name,level_2,0
0,1,a,prem1,100.0
1,1,a,prem3,300.0
2,2,b,prem1,280.0
3,2,b,prem2,180.0


    > 1. Important thing to note - there is single series of variable (perm1 - perm3), which is transposed here.
    > 2. The index is set before the process of stacking.
    > 3. If there is multile sets of series of variables, then this would not work as expected.
    > 4. By default, dropna = True, and hence it drops the NaN values

In [15]:
df5.set_index(['id','name']).stack(dropna=False).reset_index()

Unnamed: 0,id,name,level_2,0
0,1,a,prem1,100.0
1,1,a,prem2,
2,1,a,prem3,300.0
3,2,b,prem1,280.0
4,2,b,prem2,180.0
5,2,b,prem3,


In [16]:
df6 = pd.DataFrame({'id': [1,2],
                   'name': ['a','b'],
                   'prem1' : [100,280],
                   'prem2' : [np.NaN,180],
                   'prem3' : [300,np.NaN],
                   'disc1' : [20,40],
                   'disc2' : [np.NaN,30],
                   'disc3' : [50,np.NaN]})
df6

Unnamed: 0,id,name,prem1,prem2,prem3,disc1,disc2,disc3
0,1,a,100,,300.0,20,,50.0
1,2,b,280,180.0,,40,30.0,


In [30]:
df6_stacked = df6.set_index(['id','name']).stack().reset_index()
df6_stacked

Unnamed: 0,id,name,level_2,0
0,1,a,prem1,100.0
1,1,a,prem3,300.0
2,1,a,disc1,20.0
3,1,a,disc3,50.0
4,2,b,prem1,280.0
5,2,b,prem2,180.0
6,2,b,disc1,40.0
7,2,b,disc2,30.0


In [27]:
# stack is not working as expected.
# There are 2 sets of sequential columns and both the sets are transposed to the same column
# NOT Working as EXPECTED

# Long to Wide DataFrame
Multiple records per ID to a single(one) record of each ID.

```python
1. pd.pivot()
2. Use df.set_index([id_vars columns and var_name columns]) and chain it with .unstack(level=2 (here))```

### pd.pivot()

In [45]:
df4_melted1

Unnamed: 0,id,name,month,prem,disc
0,1,a,1,100.0,20.0
1,1,a,2,,
2,1,a,3,300.0,50.0
3,2,b,1,280.0,40.0
4,2,b,2,180.0,30.0
5,2,b,3,,


In [44]:
df_wide = df4_melted1.set_index(['id','name','month'])
df_wide.columns

Index(['prem', 'disc'], dtype='object')

In [46]:
df_wide = df4_melted1.pivot(index=['id','name'], columns='month', values=['prem'])
df_wide

TypeError: MultiIndex.name must be a hashable type

### df.unstack() - 
#### Use df.set_index([id_vars columns and var_name columns]) and chain it with .unstack(level=2 (here))

In [18]:
wide_df = df4_melted1.set_index(['id','name','month']).unstack(level=2)
wide_df

Unnamed: 0_level_0,Unnamed: 1_level_0,prem,prem,prem,disc,disc,disc
Unnamed: 0_level_1,month,1,2,3,1,2,3
id,name,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
1,a,100.0,,300.0,20.0,,50.0
2,b,280.0,180.0,,40.0,30.0,


ID: level = 0; RegionVariable: level = 1; 'EXP': level = 2; 'ModelID': level = 3;

In [19]:
wide_df.columns

MultiIndex([('prem', 1),
            ('prem', 2),
            ('prem', 3),
            ('disc', 1),
            ('disc', 2),
            ('disc', 3)],
           names=[None, 'month'])

In [20]:
# Code to flatten the list and at the same time concatenating it.

wide_df.columns = ['_'.join(map(str, tup)) for tup in wide_df.columns] # Everything is back to the first dataframe

In [21]:
wide_df.columns

Index(['prem_1', 'prem_2', 'prem_3', 'disc_1', 'disc_2', 'disc_3'], dtype='object')

In [22]:
wide_df

Unnamed: 0_level_0,Unnamed: 1_level_0,prem_1,prem_2,prem_3,disc_1,disc_2,disc_3
id,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,a,100.0,,300.0,20.0,,50.0
2,b,280.0,180.0,,40.0,30.0,


In [23]:
wide_df.reset_index()

Unnamed: 0,id,name,prem_1,prem_2,prem_3,disc_1,disc_2,disc_3
0,1,a,100.0,,300.0,20.0,,50.0
1,2,b,280.0,180.0,,40.0,30.0,
