<img src="https://docs.google.com/uc?export=download&id=1EiHSYfHYk8nKMEWd6A74CMFVak5Lf4ab">
# Data manipulation: Data wrangling, aggregation, and group operations.

# 1- Hierarchical Indexing

* **Data wrangling** is the process of **cleaning** and **unifying** **messy** and **complex** data sets for **easy access** and **analysis**. (from: https://www.datawatch.com/what-is-data-wrangling/)
* **Hierarchical indexing** is the use of **multiple** indexes at different **levels**.

In [0]:
# importing necessary libraries
import pandas as pd
from pandas import Series as S, DataFrame as DF
import numpy as np


In [250]:
# Build a Series with a hierarchical (two-level) index:
# an outer letter label and an inner "i<n>" label for each value.
outer_labels = list("AAABBBCCCD")
inner_labels = ["i1", "i2", "i3"] * 3 + ["i1"]
hind = [outer_labels, inner_labels]
ser1 = S(range(10), index=hind)
ser1


A  i1    0
   i2    1
   i3    2
B  i1    3
   i2    4
   i3    5
C  i1    6
   i2    7
   i3    8
D  i1    9
dtype: int64

In [251]:
# the raw nested lists passed as `index=` when ser1 was created
# (one inner list per index level)
hind

[['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C', 'D'],
 ['i1', 'i2', 'i3', 'i1', 'i2', 'i3', 'i1', 'i2', 'i3', 'i1']]

In [252]:
# the index object pandas built from the nested lists in `hind`
ser1.index

MultiIndex(levels=[['A', 'B', 'C', 'D'], ['i1', 'i2', 'i3']],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 2, 3], [0, 1, 2, 0, 1, 2, 0, 1, 2, 0]])

## Reordering and sorting

* Reordering enables **interchanging the index levels** using the **swaplevel** method.
* Sorting enables **sorting** the **data** by the values of **one index level**, using the **sort_index** method.

In [253]:
# Give the two index levels names, then display the Series with the
# levels interchanged (the values keep their original order).
level_names = ["the_level0", "the_level1"]
ser1.index.names = level_names
ser1.swaplevel(*level_names)

the_level1  the_level0
i1          A             0
i2          A             1
i3          A             2
i1          B             3
i2          B             4
i3          B             5
i1          C             6
i2          C             7
i3          C             8
i1          D             9
dtype: int64

In [254]:
# Order the rows by the inner index level (position 1, named "the_level1").
ser1.sort_index(level="the_level1")

the_level0  the_level1
A           i1            0
B           i1            3
C           i1            6
D           i1            9
A           i2            1
B           i2            4
C           i2            7
A           i3            2
B           i3            5
C           i3            8
dtype: int64

## operations by level


In [255]:
# Two-level row index: the code family ("hex" / "rgb") and a number within it.
hind2 = [["hex"] * 3 + ["rgb"] * 3, [1, 2, 3] * 2]

# Sample data for the DataFrame.
# NOTE(review): the values are only illustrative; some codes do not match the
# color name they are paired with (e.g. "green" is paired with "#FF0000").
color_names = ["green", "blue", "red"] * 2
color_codes = ["#FF0000", "#0000FF", "#FF0000", (0, 0, 255), (0, 255, 0), (255, 0, 0)]

df1 = DF(
    {"colors": color_names, "codes": color_codes},
    index=hind2,
    columns=["colors", "codes"],
)
df1.index.names = ["Code_type", "number"]
df1

Unnamed: 0_level_0,Unnamed: 1_level_0,colors,codes
Code_type,number,Unnamed: 2_level_1,Unnamed: 3_level_1
hex,1,green,#FF0000
hex,2,blue,#0000FF
hex,3,red,#FF0000
rgb,1,green,"(0, 0, 255)"
rgb,2,blue,"(0, 255, 0)"
rgb,3,red,"(255, 0, 0)"


In [256]:
# Summarising the column values per "Code_type" index level.
# `DataFrame.sum(level=...)` was deprecated in pandas 1.3 and removed in 2.0;
# grouping on the index level and summing is the supported equivalent.
# Both columns hold strings/tuples, so "sum" concatenates them.
df1.groupby(level="Code_type").sum()

Unnamed: 0_level_0,colors,codes
Code_type,Unnamed: 1_level_1,Unnamed: 2_level_1
hex,greenbluered,#FF0000#0000FF#FF0000
rgb,greenbluered,"(0, 0, 255, 0, 255, 0, 255, 0, 0)"


## indexing

In [257]:
# Add a numeric "value" column (assigned positionally, in the current row order),
# then sort the rows by the second index level ("number").
df1 = df1.assign(value=[5, 7, 8, 10, 3, 1]).sort_index(level=1)
df1

Unnamed: 0_level_0,Unnamed: 1_level_0,colors,codes,value
Code_type,number,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hex,1,green,#FF0000,5
rgb,1,green,"(0, 0, 255)",10
hex,2,blue,#0000FF,7
rgb,2,blue,"(0, 255, 0)",3
hex,3,red,#FF0000,8
rgb,3,red,"(255, 0, 0)",1


In [258]:
# Build a new DataFrame from df1 whose row index is made of the
# "colors" and "codes" columns; only "value" remains as a column.
new_index_columns = ["colors", "codes"]
df2 = df1.set_index(new_index_columns)
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,value
colors,codes,Unnamed: 2_level_1
green,#FF0000,5
green,"(0, 0, 255)",10
blue,#0000FF,7
blue,"(0, 255, 0)",3
red,#FF0000,8
red,"(255, 0, 0)",1


In [259]:
# the index levels ("colors", "codes") are converted back into ordinary
# columns and the rows get a default 0..n-1 index
df2.reset_index()

Unnamed: 0,colors,codes,value
0,green,#FF0000,5
1,green,"(0, 0, 255)",10
2,blue,#0000FF,7
3,blue,"(0, 255, 0)",3
4,red,#FF0000,8
5,red,"(255, 0, 0)",1


# 2- Combining and merging Data Sets

## merge

In [260]:
# df1 is the left-hand table in the merge examples below
df1

Unnamed: 0_level_0,Unnamed: 1_level_0,colors,codes,value
Code_type,number,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hex,1,green,#FF0000,5
rgb,1,green,"(0, 0, 255)",10
hex,2,blue,#0000FF,7
rgb,2,blue,"(0, 255, 0)",3
hex,3,red,#FF0000,8
rgb,3,red,"(255, 0, 0)",1


In [261]:
# Small lookup table: a color name and its one-letter code.
df3 = DF(
    {
        "mycolors": ["green", "blue"],
        "codes": ["G", "B"],
    }
)
df3

Unnamed: 0,codes,mycolors
0,G,green
1,B,blue


In [262]:
# Merge df1 with df3 by matching df1["colors"] against df3["mycolors"].
# Only rows with a match on both sides are kept; the "codes" column that
# exists in both frames gets the default "_x" / "_y" suffixes.
df1.merge(df3, left_on="colors", right_on="mycolors")

Unnamed: 0,colors,codes_x,value,codes_y,mycolors
0,green,#FF0000,5,G,green
1,green,"(0, 0, 255)",10,G,green
2,blue,#0000FF,7,B,blue
3,blue,"(0, 255, 0)",3,B,blue


In [263]:
# Same merge on "colors" / "mycolors", but as a left join: every df1 row is
# kept, and rows without a partner in df3 get missing values in df3's columns.
# Custom suffixes tell the two "codes" columns apart.
df1.merge(
    df3,
    how="left",
    left_on="colors",
    right_on="mycolors",
    suffixes=("_df1", "_df3"),
)

Unnamed: 0,colors,codes_df1,value,codes_df3,mycolors
0,green,#FF0000,5,G,green
1,green,"(0, 0, 255)",10,G,green
2,blue,#0000FF,7,B,blue
3,blue,"(0, 255, 0)",3,B,blue
4,red,#FF0000,8,,
5,red,"(255, 0, 0)",1,,


In [264]:
# Flatten df1: its index levels (Code_type, number) become ordinary columns.
df11 = df1.reset_index()
# Merge on df11["number"] against df3's row labels (its index) rather than
# against one of df3's columns.
df11.merge(df3, left_on="number", right_index=True)


Unnamed: 0,Code_type,number,colors,codes_x,value,codes_y,mycolors
0,hex,1,green,#FF0000,5,B,blue
1,rgb,1,green,"(0, 0, 255)",10,B,blue


## join

In [265]:
# df11 (df1 with a flat index) is the left-hand table for the join below
df11

Unnamed: 0,Code_type,number,colors,codes,value
0,hex,1,green,#FF0000,5
1,rgb,1,green,"(0, 0, 255)",10
2,hex,2,blue,#0000FF,7
3,rgb,2,blue,"(0, 255, 0)",3
4,hex,3,red,#FF0000,8
5,rgb,3,red,"(255, 0, 0)",1


In [266]:
# combining df11 and df3 on their row labels (index values);
# the suffixes disambiguate the "codes" column present in both frames
df11.join(df3,lsuffix="_df11",rsuffix="_df3")


Unnamed: 0,Code_type,number,colors,codes_df11,value,codes_df3,mycolors
0,hex,1,green,#FF0000,5,G,green
1,rgb,1,green,"(0, 0, 255)",10,B,blue
2,hex,2,blue,#0000FF,7,,
3,rgb,2,blue,"(0, 255, 0)",3,,
4,hex,3,red,#FF0000,8,,
5,rgb,3,red,"(255, 0, 0)",1,,


## concat

In [267]:
# First Series to concatenate: values 0..1 labelled "a", "b".
ser11 = S(range(2), index=["a", "b"])
ser11

a    0
b    1
dtype: int64

In [268]:
# Second Series to concatenate: values 7..9 labelled "c", "d", "e".
ser2 = S(range(7, 10), index=["c", "d", "e"])
ser2

c    7
d    8
e    9
dtype: int64

In [269]:
# concatenating ser11 and ser2: the values (and their index labels)
# of ser2 are appended after those of ser11
pd.concat([ser11,ser2])

a    0
b    1
c    7
d    8
e    9
dtype: int64

## combine first

In [270]:
# Caller Series for combine_first: values 0..2 labelled "a", "b", "c".
ser3 = S(range(3), index=["a", "b", "c"])
ser3

a    0
b    1
c    2
dtype: int64

In [271]:
# Fallback Series for combine_first: shares "a" and "b" with ser3, adds "d".
ser4 = S(range(7, 10), index=["a", "b", "d"])
ser4

a    7
b    8
d    9
dtype: int64

In [272]:
# for labels present in both Series, the ser3 value is kept;
# labels that exist only in ser4 (here "d") are taken from ser4
ser3.combine_first(ser4)

a    0.0
b    1.0
c    2.0
d    9.0
dtype: float64

In [273]:
# replace the value at label "a" with a missing value (NaN);
# the Series is displayed as float64 afterwards
ser3["a"]=np.nan
ser3

a    NaN
b    1.0
c    2.0
dtype: float64

In [274]:
# if a value in ser3 is NaN, combine_first fills it with
# the corresponding value from ser4 (here label "a" -> 7.0)
ser3.combine_first(ser4)

a    7.0
b    1.0
c    2.0
d    9.0
dtype: float64

# 3- Reshaping and pivoting

## stack and unstack
* **stack**: pivots **column** labels into **row** index labels
* **unstack**: pivots **row** index labels into **column** labels

In [275]:
# df11 is the input for the stack / unstack examples below
df11

Unnamed: 0,Code_type,number,colors,codes,value
0,hex,1,green,#FF0000,5
1,rgb,1,green,"(0, 0, 255)",10
2,hex,2,blue,#0000FF,7
3,rgb,2,blue,"(0, 255, 0)",3
4,hex,3,red,#FF0000,8
5,rgb,3,red,"(255, 0, 0)",1


In [276]:
# the column labels become an inner row-index level, giving one long Series
# with an entry per (row, column) pair
df11.stack()

0  Code_type            hex
   number                 1
   colors             green
   codes            #FF0000
   value                  5
1  Code_type            rgb
   number                 1
   colors             green
   codes        (0, 0, 255)
   value                 10
2  Code_type            hex
   number                 2
   colors              blue
   codes            #0000FF
   value                  7
3  Code_type            rgb
   number                 2
   colors              blue
   codes        (0, 255, 0)
   value                  3
4  Code_type            hex
   number                 3
   colors               red
   codes            #FF0000
   value                  8
5  Code_type            rgb
   number                 3
   colors               red
   codes        (255, 0, 0)
   value                  1
dtype: object

In [277]:
# unstack reverses stack: the inner index level goes back to column labels
df11.stack().unstack()

Unnamed: 0,Code_type,number,colors,codes,value
0,hex,1,green,#FF0000,5
1,rgb,1,green,"(0, 0, 255)",10
2,hex,2,blue,#0000FF,7
3,rgb,2,blue,"(0, 255, 0)",3
4,hex,3,red,#FF0000,8
5,rgb,3,red,"(255, 0, 0)",1


In [278]:
# ser1 (two-level index) is the input for the unstack example below
ser1

the_level0  the_level1
A           i1            0
            i2            1
            i3            2
B           i1            3
            i2            4
            i3            5
C           i1            6
            i2            7
            i3            8
D           i1            9
dtype: int64

In [279]:
# the inner level (the_level1) becomes the columns; "D" has only an "i1"
# entry, so its i2 / i3 cells are missing (NaN)
ser1.unstack()

the_level1,i1,i2,i3
the_level0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.0,1.0,2.0
B,3.0,4.0,5.0
C,6.0,7.0,8.0
D,9.0,,


## pivot (long to wide)

In [280]:
# df11 in long format: one row per (Code_type, number) pair
df11

Unnamed: 0,Code_type,number,colors,codes,value
0,hex,1,green,#FF0000,5
1,rgb,1,green,"(0, 0, 255)",10
2,hex,2,blue,#0000FF,7
3,rgb,2,blue,"(0, 255, 0)",3
4,hex,3,red,#FF0000,8
5,rgb,3,red,"(255, 0, 0)",1


In [281]:
# long -> wide: one row per Code_type, one column per number,
# cells filled from the "value" column
df11.pivot(index="Code_type",columns="number",values="value")

number,1,2,3
Code_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
hex,5,7,8
rgb,10,3,1


## melt (wide to long)

In [282]:
# A small wide-format frame for the melt example.
# Note: this rebinds df3, replacing the color lookup table used in the merge section.
wide_rows = [[1, 2], [3, 4]]
df3 = DF(wide_rows, columns=["col1", "col2"])
df3

Unnamed: 0,col1,col2
0,1,2
1,3,4


In [283]:
# Wide -> long: all columns are melted into two columns,
# "variable" (the former column name) and "value" (the cell content).
df3.melt()

Unnamed: 0,variable,value
0,col1,1
1,col1,3
2,col2,2
3,col2,4


# 4- Group by Mechanics

## groupby

In [284]:
# df11 is the input for the groupby examples below
df11

Unnamed: 0,Code_type,number,colors,codes,value
0,hex,1,green,#FF0000,5
1,rgb,1,green,"(0, 0, 255)",10
2,hex,2,blue,#0000FF,7
3,rgb,2,blue,"(0, 255, 0)",3
4,hex,3,red,#FF0000,8
5,rgb,3,red,"(255, 0, 0)",1


In [287]:
# Group the rows that share the same Code_type and
# add up their "value" entries.
df11.groupby("Code_type")["value"].sum()

Code_type
hex    20
rgb    14
Name: value, dtype: int64

In [331]:
# Iterating over a grouped object yields (group key, sub-Series) pairs.
value_by_type = df11.groupby("Code_type")["value"]
for code_type, values in value_by_type:
    print(code_type)
    print(values)


hex
0    5
2    7
4    8
Name: value, dtype: int64
rgb
1    10
3     3
5     1
Name: value, dtype: int64


In [323]:
# Add two more numeric columns so that columns can be grouped below:
# value2 counts 10..15 and value3 is 1 on every row.
df11["value2"] = list(range(10, 16))
df11["value3"] = [1] * 6
df11

Unnamed: 0,Code_type,number,colors,codes,value,value2,value3
0,hex,1,green,#FF0000,5,10,1
1,rgb,1,green,"(0, 0, 255)",10,11,1
2,hex,2,blue,#0000FF,7,12,1
3,rgb,2,blue,"(0, 255, 0)",3,13,1
4,hex,3,red,#FF0000,8,14,1
5,rgb,3,red,"(255, 0, 0)",1,15,1


In [324]:
# Column name -> group label mapping used to group columns.
myDict = {
    **dict.fromkeys(["number", "value"], "Gr1"),
    **dict.fromkeys(["value2", "value3"], "Gr2"),
}
myDict

{'number': 'Gr1', 'value': 'Gr1', 'value2': 'Gr2', 'value3': 'Gr2'}

In [325]:
# Group columns using myDict: "number" and "value" form Gr1,
# "value2" and "value3" form Gr2; then sum the grouped columns row by row.
# Columns that are not keys of myDict (Code_type, colors, codes) are left out.
# `groupby(..., axis=1)` is deprecated since pandas 2.1, so the mapped
# columns are selected, transposed, grouped on the index, and transposed back.
df11[list(myDict)].T.groupby(myDict).sum().T



Unnamed: 0,Gr1,Gr2
0,6,11
1,11,12
2,9,13
3,5,14
4,11,15
5,4,16


# 5- Data aggregation

## agg

In [326]:
# df11 is the input for the aggregation examples below
df11

Unnamed: 0,Code_type,number,colors,codes,value,value2,value3
0,hex,1,green,#FF0000,5,10,1
1,rgb,1,green,"(0, 0, 255)",10,11,1
2,hex,2,blue,#0000FF,7,12,1
3,rgb,2,blue,"(0, 255, 0)",3,13,1
4,hex,3,red,#FF0000,8,14,1
5,rgb,3,red,"(255, 0, 0)",1,15,1


In [0]:
# The "value" column grouped by Code_type.
my_group = df11.groupby("Code_type")["value"]
# The whole frame grouped by the Code_type Series; as_index=False asks for the
# group keys not to be used as the index of aggregated results.
# NOTE(review): my_group2 is not used in the cells that follow.
my_group2 = df11.groupby(df11["Code_type"], as_index=False)


In [361]:
# applying multiple aggregation operations (sum and product) on
# the grouped values; each operation becomes a column of the result
my_group.agg(["sum","prod"])

Unnamed: 0_level_0,sum,prod
Code_type,Unnamed: 1_level_1,Unnamed: 2_level_1
hex,20,280
rgb,14,30


In [348]:
# NOTE(review): same aggregation as the previous cell (sum and product per Code_type)
my_group.agg(["sum","prod"])

Unnamed: 0_level_0,sum,prod
Code_type,Unnamed: 1_level_1,Unnamed: 2_level_1
hex,20,280
rgb,14,30


## Columns kept columns

#  6- Other aggregation operations 

## cut & qcut

In [381]:
# overwrite value3 with unevenly spread values (2 ... 1000)
# to use as input for the cut / qcut binning examples below
df11["value3"]=[2,31,62,156,230,1000]
df11

Unnamed: 0,Code_type,number,colors,codes,value,value2,value3
0,hex,1,green,#FF0000,5,10,2
1,rgb,1,green,"(0, 0, 255)",10,11,31
2,hex,2,blue,#0000FF,7,12,62
3,rgb,2,blue,"(0, 255, 0)",3,13,156
4,hex,3,red,#FF0000,8,14,230
5,rgb,3,red,"(255, 0, 0)",1,15,1000


In [383]:
# Bin value3 into 3 intervals of the same length (equal width),
# then count how many values fall into each interval.
intervals = pd.cut(df11["value3"], 3)
# observed=False keeps intervals that contain no value in the result.
# It is passed explicitly because grouping on a categorical key without it
# is deprecated (the default is changing to True, which would drop empty bins).
my_group4 = df11["value3"].groupby(intervals, observed=False)
my_group4.count()

value3
(1.002, 334.667]      5
(334.667, 667.333]    0
(667.333, 1000.0]     1
Name: value3, dtype: int64

In [386]:
# Bin value3 into 3 quantile-based intervals, i.e. intervals chosen so that
# each one holds the same number of values, then count the values per interval.
intervals = pd.qcut(df11["value3"], 3)
# observed is passed explicitly because grouping on a categorical key without
# it is deprecated (the default is changing from False to True).
my_group5 = df11["value3"].groupby(intervals, observed=False)
my_group5.count()

value3
(1.999, 51.667]      2
(51.667, 180.667]    2
(180.667, 1000.0]    2
Name: value3, dtype: int64

## crosstab

In [387]:
# df11 is the input for the crosstab example below
df11

Unnamed: 0,Code_type,number,colors,codes,value,value2,value3
0,hex,1,green,#FF0000,5,10,2
1,rgb,1,green,"(0, 0, 255)",10,11,31
2,hex,2,blue,#0000FF,7,12,62
3,rgb,2,blue,"(0, 255, 0)",3,13,156
4,hex,3,red,#FF0000,8,14,230
5,rgb,3,red,"(255, 0, 0)",1,15,1000


In [396]:
# Frequency table: one row per distinct "number", one column per distinct
# "colors" value; each cell counts the rows of df11 with that combination.
pd.crosstab(index=df11["number"], columns=df11["colors"])

colors,blue,green,red
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,2,0
2,2,0,0
3,0,0,2


# References
* Datawatch. What is data wrangling? On-line at https://www.datawatch.com/what-is-data-wrangling/. Accessed on 31-10-2018.
* Wes McKinney. Python for data analysis: Data wrangling with Pandas, NumPy, and IPython. O’Reilly Media, Inc, 2018.
* Pandas documentation. On-line at https://pandas.pydata.org/. Accessed on 19-10-2018.