In [9]:
#資料結構運用
##二維資料(2 levels)
import numpy as np
import pandas as pd

# Seed NumPy's global RNG so that "Restart Kernel -> Run All" reproduces the
# same random values (and therefore the same tables) on every run.
np.random.seed(42)

# Two key columns plus two numeric columns. Note that the (key1, key2) pair
# ('a', 'one') appears twice (rows 0 and 4), i.e. the keys are not unique.
df1 = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randn(5),  # randn(): samples from the standard normal distribution
    'data2': np.random.randn(5),
})
df1

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.741093,-0.520964
1,a,two,0.740655,1.14921
2,b,one,-0.273284,2.613342
3,b,two,-0.547246,-0.390548
4,a,one,1.191453,-0.75797


In [10]:
##將欄位設為index使其成為三維資料(3 levels)，並設定column level的名稱
'''可透過DataFrame物件名稱.set_index(keys)此method來將欄位改為index，會回傳DataFrame: 
Set the DataFrame index using existing columns.
Set the DataFrame index (row labels) using one or more existing columns or arrays (of the correct length). 
The index can replace the existing index or expand on it.
其中參數keys可指定要改為index的columns，可為label or array-like or list of labels/arrays。
注意：系統會自動將原欄位名稱設定為index level名稱。'''

# Move the two key columns into the row index (a two-level MultiIndex) and
# show the frame while the column axis is still unnamed.
index_columns = ['key1', 'key2']
df2 = df1.set_index(index_columns)
display(df2)

# Give the column axis the name 'data', then show the frame again so the
# before/after difference is visible.
df2 = df2.rename_axis('data', axis='columns')
display(df2)

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.741093,-0.520964
a,two,0.740655,1.14921
b,one,-0.273284,2.613342
b,two,-0.547246,-0.390548
a,one,1.191453,-0.75797


Unnamed: 0_level_0,data,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.741093,-0.520964
a,two,0.740655,1.14921
b,one,-0.273284,2.613342
b,two,-0.547246,-0.390548
a,one,1.191453,-0.75797


In [12]:
##相同資料處理
###相同資料做unstack會顯示錯誤
'''注意：在做unstack時，若系統偵測到有相同資料則會顯示錯誤: ValueError: Index contains duplicate entries, cannot reshape。
以此處的df2做說明，先將上方的df2做stack(將唯一的column level轉為index level，產生一新3 levels Series)不會出錯，
而此時再將df2(或該Series)做unstack，則會顯示錯誤，原因是其index中有重複的資料: key1為a且key2為one的資料有兩筆。
而要解決此問題的方法即是將相同資料的筆數整合成為一筆資料。'''

# Stacking works even though the (key1, key2) index has a duplicated entry:
# the single column level simply becomes the innermost index level of a Series.
display(df2.stack())

# Unstacking requires unique index entries. df2 contains ('a', 'one') twice, so
# pandas raises ValueError here. The error is the point of this demo, so catch
# it and print it instead of letting it abort "Restart Kernel -> Run All".
try:
    display(df2.unstack())
except ValueError as err:
    print(f"{type(err).__name__}: {err}")


key1  key2  data 
a     one   data1    0.741093
            data2   -0.520964
      two   data1    0.740655
            data2    1.149210
b     one   data1   -0.273284
            data2    2.613342
      two   data1   -0.547246
            data2   -0.390548
a     one   data1    1.191453
            data2   -0.757970
dtype: float64

ValueError: Index contains duplicate entries, cannot reshape

In [13]:
###相同資料整合
'''可透過DataFrame/Series物件名稱.groupby()此method將相同資料的筆數整合為一筆資料，會回傳DataFrameGroupBy物件：
Group DataFrame using a mapper or by a Series of columns.
A groupby operation involves some combination of splitting the object, applying a function, and combining the results. 
This can be used to group large amounts of data and compute operations on these groups.
可加入參數by來指定進行分組的規則，可為mapping, function, label, pd.Grouper or list of such:
Used to determine the groups for the groupby；
參數level來指定要依照哪個level進行分組，可為int, level name, or sequence of such,預設為None: 
If the axis is a MultiIndex (hierarchical), group by a particular level or levels. Do not specify both by and level.
說明：
此method分組過程為先依據規則進行拆解(split) --> 再把每個分組運行計算規則處理(apply) --> 最後再將每個分組的計算結果進行合併(combine)，
如此處範例中的df2.groupby(level = ["key1","key2"]).mean()即是將資料依據key1和key2 level(兩個level一起)的規則進行拆解，
此時的顯示結果為<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fe5c0f1cd90>，
接著進行.mean()此method的語法，會回傳DataFrame(Compute mean of groups, excluding missing values.)，
即將剛剛依據規則進行拆解的每個組進行平均值計算，也就是此時有相同資料的筆數會被視為同一組進行運算，最後再將這些分組合併在一起。
注意：實際操作資料時一定會有許多重複的資料，故groupby會是很重要的一個步驟。'''

# Collapse rows that share the same (key1, key2) index pair into one row by
# averaging them, so every index entry becomes unique.
group_levels = ["key1", "key2"]
df3 = df2.groupby(level=group_levels).mean()
display(df3)

# With no duplicated index entries left, the key1 level can be pivoted into
# the columns without raising an error.
key1_as_columns = df3.unstack(level="key1")
display(key1_as_columns)

Unnamed: 0_level_0,data,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.966273,-0.639467
a,two,0.740655,1.14921
b,one,-0.273284,2.613342
b,two,-0.547246,-0.390548


data,data1,data1,data2,data2
key1,a,b,a,b
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,0.966273,-0.273284,-0.639467,2.613342
two,0.740655,-0.547246,1.14921,-0.390548


In [17]:
##groupby運用
'''另外，若是想將columns改為index，除了可透過set_index()外，也可透過groupby()來執行。
注意：groupby()需搭配聚合函式(如此處的sum())，重複的key組合會被合併為一筆資料，故結果與set_index()不完全相同。'''

# Show the original flat frame for comparison.
display(df1)

# Grouping by the key columns moves them into the row index; rows sharing the
# same (key1, key2) pair are summed into a single row.
key_columns = ["key1", "key2"]
grouped_by_keys = df1.groupby(key_columns)
df4 = grouped_by_keys.sum()
display(df4)

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.741093,-0.520964
1,a,two,0.740655,1.14921
2,b,one,-0.273284,2.613342
3,b,two,-0.547246,-0.390548
4,a,one,1.191453,-0.75797


Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,1.932547,-1.278935
a,two,0.740655,1.14921
b,one,-0.273284,2.613342
b,two,-0.547246,-0.390548
