# DataFrame

### Construction
pandas的DataFrame可透過以下方法建立:
1. 單一Series物件  
2. dict的list物件
3. Series物件的dict
4. 2d-np array 
5. np結構陣列

In [1]:
import pandas as pd
import numpy as np

In [None]:
# 單一Series物件

In [2]:
x = pd.Series([1,2,3,4])
pd.DataFrame(x , columns=['A'])

Unnamed: 0,A
0,1
1,2
2,3
3,4


In [5]:
# dict的list物件
# 注意pd會自動將key轉成col name
x = [{'a':i,'b':2*i} for i in range(3)]
pd.DataFrame(x)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [7]:
# Series物件的dict
x = pd.Series([1,2,3,4])
y = pd.Series([7,8,9,10])
pd.DataFrame({'A':x , 'B':y})

Unnamed: 0,A,B
0,1,7
1,2,8
2,3,9
3,4,10


In [10]:
# 2d-np array
# 使用二維np array並指定col與index name
pd.DataFrame(np.random.rand(3,2) , columns=['MM','NN'] , index= [58,59,60])

Unnamed: 0,MM,NN
58,0.506048,0.510478
59,0.403463,0.553143
60,0.208655,0.095105


In [11]:
# np結構陣列
A = np.zeros(3,dtype=[('A','i8') , ('B','f8')])
A

array([(0, 0.), (0, 0.), (0, 0.)], dtype=[('A', '<i8'), ('B', '<f8')])

In [12]:
pd.DataFrame(A)

Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0


### Pandas Object
如果把series看作為一維陣列，那麼datafrrame就是series的組合，視為二維陣列。(Excel csv file)  
每一column視為一個series，需為同樣資料型別;不同column間可以為不同資料型別(類似R的date.frame)  
同一column內如果出現不同資料型別，則該column的型別會顯示object!

In [35]:
# 建立dict物件，故意將column state內的第4個元素改成int
x = {'state': ['Ohio', 'Ohio', 'Ohio', 50 , 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'popu': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
x , type(x)

({'state': ['Ohio', 'Ohio', 'Ohio', 50, 'Nevada', 'Nevada'],
  'year': [2000, 2001, 2002, 2001, 2002, 2003],
  'popu': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]},
 dict)

In [36]:
# 建立pd Dataframe物件
X = pd.DataFrame(x)
type(X)

pandas.core.frame.DataFrame

In [37]:
# 注意state column消失，因為裡面有數字又有文字
X.describe()

Unnamed: 0,year,popu
count,6.0,6.0
mean,2001.5,2.55
std,1.048809,0.836062
min,2000.0,1.5
25%,2001.0,1.875
50%,2001.5,2.65
75%,2002.0,3.125
max,2003.0,3.6


In [39]:
# 可以看到state column的Dtype為object，其餘都是純i64 , f64
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   state   6 non-null      object 
 1   year    6 non-null      int64  
 2   popu    6 non-null      float64
dtypes: float64(1), int64(1), object(1)
memory usage: 272.0+ bytes


In [40]:
X

Unnamed: 0,state,year,popu
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,50,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [29]:
# 每一column視為pd Series
type(X.iloc[:,1])

pandas.core.series.Series

In [41]:
# 不同於R的dataframe或matrix，元素並不會強制轉型，索引後仍保持int型別
X.iloc[3,0] , type(X.iloc[3,0])

(50, int)

In [42]:
# 將dict轉成d.f.，這次使用正常資料
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'popu': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
print(f"data :\n{data}\n\n"
      f"data['state'] : {data['state']}\n\n"
      f"{pd.DataFrame(data)}")

data :
{'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'], 'year': [2000, 2001, 2002, 2001, 2002, 2003], 'popu': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

data['state'] : ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada']

    state  year  popu
0    Ohio  2000   1.5
1    Ohio  2001   1.7
2    Ohio  2002   3.6
3  Nevada  2001   2.4
4  Nevada  2002   2.9
5  Nevada  2003   3.2


In [6]:
# d.f.視為二維陣列，轉置操作同數學矩陣 (類似R的matrix型別)
# 須注意transpose後 rowname換到colname!
df = pd.DataFrame(data, index=['d','f','g','h','j','k'])
print(f'{df}\n\n{df.T}')

    state  year  popu
d    Ohio  2000   1.5
f    Ohio  2001   1.7
g    Ohio  2002   3.6
h  Nevada  2001   2.4
j  Nevada  2002   2.9
k  Nevada  2003   3.2

          d     f     g       h       j       k
state  Ohio  Ohio  Ohio  Nevada  Nevada  Nevada
year   2000  2001  2002    2001    2002    2003
popu    1.5   1.7   3.6     2.4     2.9     3.2


In [43]:
X = pd.Series(x)
X

state    [Ohio, Ohio, Ohio, 50, Nevada, Nevada]
year       [2000, 2001, 2002, 2001, 2002, 2003]
popu             [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]
dtype: object

In [44]:
X.shape

(3,)

In [45]:
X.size

3

In [47]:
# 每一column視為list
X[0] , type(X[0])

(['Ohio', 'Ohio', 'Ohio', 50, 'Nevada', 'Nevada'], list)

In [106]:
# d.f.可透過Series組合
area = pd.Series({'California': 423967, 'Texas': 695662,
                  'New York': 141297, 'Florida': 170312,
                  'Illinois': 149995})
popu = pd.Series({'California': 38332521, 'Texas': 26448193,
                 'New York': 19651127, 'Florida': 19552860,
                 'Illinois': 12882135})
data = pd.DataFrame({'area':area, 'popu':popu})

print(f'data :\n{data}\n\n'
      f'data.area :\n{data.area}\n\n'
      f"data['area'] :\n{data['area']}\n\n"
      f'data.popu :\n{data.popu}\n\n'
      f"data['popu'] :\n{data['popu']}")

data :
              area      popu
California  423967  38332521
Texas       695662  26448193
New York    141297  19651127
Florida     170312  19552860
Illinois    149995  12882135

data.area :
California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

data['area'] :
California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

data.popu :
California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
Name: popu, dtype: int64

data['popu'] :
California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
Name: popu, dtype: int64


In [108]:
# 新增欄位(類似R的data.frame新增欄位的語法: df$z = df$x + df$y)
data['density'] = data['popu'] / data['area']
data.density = data.popu / data.area     # same
data

Unnamed: 0,area,popu,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [111]:
# 取出指定欄位可透過 df['name'] 或者 df.colname來讀取
# 須注意取出來的欄位為Series物件
print(f"data['area'] :\n{data['area']}\n\n"
      f"data.area :\n{data.area}\n\n"
      f"data.area.ndim\t: {data.area.ndim}\n\n"
      f"type(data.area)\t: {type(data.area)}")

data['area'] :
California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

data.area :
California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

data.area.ndim	: 1

type(data.area)	: <class 'pandas.core.series.Series'>


In [113]:
# 使用 df[['colname']] 取出來的為dataframe!
print(f"{data[['area']]}\n\n"
      f"data[['area']].ndim : {data[['area']].ndim}\n\n"
      f"{type(data[['area']])}")

              area
California  423967
Texas       695662
New York    141297
Florida     170312
Illinois    149995

data[['area']].ndim : 2

<class 'pandas.core.frame.DataFrame'>


### DataFrame Attributes
d.f.具備以下基本屬性:
>  index : 顯示rownames  
> columns : 顯示colnames  
> ndim : 顯示維度數(d.f.為2維)  
> size : 顯示d.f.所有元素個數  
> shape : 顯示d.f.是幾列幾行的資料框  


In [48]:
df = pd.DataFrame(data, index=['d','f','g','h','j','k'])

In [53]:
# d.f.讀取屬性
print(f'df :\n{df}\n\n'
      f'df.index\t= {df.index}\n'
      f'df.columns\t= {df.columns}\n\n'
      f'df.values =\n{df.values}\n\n'
      f'type(df.values)\t= {type(df.values)}\n'
      f'df.ndim\t\t= {df.ndim}\n'
      f'df.size\t\t= {df.size}\n'
      f'df.shape\t= {df.shape}')

df :
    state  year  popu
d    Ohio  2000   1.5
f    Ohio  2001   1.7
g    Ohio  2002   3.6
h  Nevada  2001   2.4
j  Nevada  2002   2.9
k  Nevada  2003   3.2

df.index	= Index(['d', 'f', 'g', 'h', 'j', 'k'], dtype='object')
df.columns	= Index(['state', 'year', 'popu'], dtype='object')

df.values =
[['Ohio' 2000 1.5]
 ['Ohio' 2001 1.7]
 ['Ohio' 2002 3.6]
 ['Nevada' 2001 2.4]
 ['Nevada' 2002 2.9]
 ['Nevada' 2003 3.2]]

type(df.values)	= <class 'numpy.ndarray'>
df.ndim		= 2
df.size		= 18
df.shape	= (6, 3)


In [51]:
# 注意index和columns屬性回傳的是pd的Index型別物件
type(df.index) , type(df.columns)

(pandas.core.indexes.base.Index, pandas.core.indexes.base.Index)

### DataFrame Inspection
d.f.可透過以下方法進行資料檢視:
> describe() : 顯示d.f.內Index,Column個數,Column是否包含null空值,欄位資料型態,占用記憶體...etc  
> info() : 顯示各欄位(資料型態必須一致)的總數,平均,標準差,最大最小值與25%/50%/75%百分位數  
> head(value)/tail(value) : 顯示頭/尾幾筆資料，不給value值的話預設5筆  

In [55]:
# 顯示d.f.部分資料與基礎資訊
print(f'df :\n{df}\n\n'
      f'df.describe() :\n{df.describe()} end\n\n'
      f'df.info :\n{df.info()}\n\n'
      f'df.head() :\n{df.head()}\n\n'
      f'df.tail(6) :\n{df.tail(6)}\n')

<class 'pandas.core.frame.DataFrame'>
Index: 6 entries, d to k
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   state   6 non-null      object 
 1   year    6 non-null      int64  
 2   popu    6 non-null      float64
dtypes: float64(1), int64(1), object(1)
memory usage: 192.0+ bytes
df :
    state  year  popu
d    Ohio  2000   1.5
f    Ohio  2001   1.7
g    Ohio  2002   3.6
h  Nevada  2001   2.4
j  Nevada  2002   2.9
k  Nevada  2003   3.2

df.describe() :
              year      popu
count     6.000000  6.000000
mean   2001.500000  2.550000
std       1.048809  0.836062
min    2000.000000  1.500000
25%    2001.000000  1.875000
50%    2001.500000  2.650000
75%    2002.000000  3.125000
max    2003.000000  3.600000 end

df.info :
None

df.head() :
    state  year  popu
d    Ohio  2000   1.5
f    Ohio  2001   1.7
g    Ohio  2002   3.6
h  Nevada  2001   2.4
j  Nevada  2002   2.9

df.tail(6) :
    state  year  popu
d    Ohio  2000

### Column & Row 
透過rename()、set_axis()方法或columns屬性，達到全修改與部分修改column name 的功能  
根據Pandas API文件 https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rename.html  
> `.rename(index=None, columns=None, axis=None, inplace=False)`  

其中:
1. index可設定rowname，使用dict做mapping  
2. columns可設定欄位名稱，使用dict做mapping  
3. axis預設為0(row維度)  
4. inplace預設為False，代表是否要置換原資料  

根據Pandas API文件 https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.set_axis.html?highlight=set_axis#pandas.DataFrame.set_axis  
> `.set_axis(labels, axis=0, inplace=False)`  

其中:
1. labels為list-like 或者 Index物件  
2. axis預設為0，可設定1或者columns,index  
3. inplace為布林值，可決定是否要更改原資料 

In [59]:
# 定義初始d.f.
df = pd.DataFrame({'team':['A', 'A', 'A', 'A', 'B', 'B', 'B', 'B'],
                   'points': [25, 12, 15, 14, 19, 23, 25, 29],
                   'assists': [5, 7, 7, 9, 12, 9, 9, 4],
                   'rebounds': [11, 8, 10, 6, 6, 5, 9, 12]})
df

Unnamed: 0,team,points,assists,rebounds
0,A,25,5,11
1,A,12,7,8
2,A,15,7,10
3,A,14,9,6
4,B,19,12,6
5,B,23,9,5
6,B,25,9,9
7,B,29,4,12


##### 顯示column name 方法1

In [60]:
# 注意結果回傳list
list(df)

['team', 'points', 'assists', 'rebounds']

##### 顯示column name 方法2

In [62]:
# 注意結果回傳Index型別物件
df.columns

Index(['team', 'points', 'assists', 'rebounds'], dtype='object')

##### 重新命名column name 方法1

In [93]:
# 注意格式為dict，使用key:value做前後對應，例如: 變更前:變更後
# 注意這個方法可以選擇部分column變更
df.rename(columns = {'team':'team_name', 'points':'points_scored'}, inplace = True)
df

Unnamed: 0,BBAteam,BBApoints,BBAassists,BBArebounds
0,A,25,5,11
1,A,12,7,8
2,A,15,7,10
3,A,14,9,6
4,B,19,12,6
5,B,23,9,5
6,B,25,9,9
7,B,29,4,12


##### 重新命名column name 方法2

In [66]:
# 使用讀取屬性+賦值的方法變更欄位值
df.columns = ['new_col1', 'new_col2', 'new_col3', 'new_col4']
df

Unnamed: 0,new_col1,new_col2,new_col3,new_col4
0,A,25,5,11
1,A,12,7,8
2,A,15,7,10
3,A,14,9,6
4,B,19,12,6
5,B,23,9,5
6,B,25,9,9
7,B,29,4,12


##### 重新命名column name 方法3

In [92]:
# 注意這個方法只挑選部分欄位做改變
# 如果沒有對應的欄位名稱，不會報錯
df.columns.str

<pandas.core.strings.accessor.StringMethods at 0x273bf8ddab0>

In [70]:
df.columns = df.columns.str.replace('new_col1', 'XXXX')
df

Unnamed: 0,XXXX,new_col2,new_col3,new_col4
0,A,25,5,11
1,A,12,7,8
2,A,15,7,10
3,A,14,9,6
4,B,19,12,6
5,B,23,9,5
6,B,25,9,9
7,B,29,4,12


##### 重新命名column name 方法4

In [91]:
# 注意原資料的column具備相同模式的開頭符號
df = pd.DataFrame({'BBAteam':['A', 'A', 'A', 'A', 'B', 'B', 'B', 'B'],
                   'BBApoints': [25, 12, 15, 14, 19, 23, 25, 29],
                   'BBAassists': [5, 7, 7, 9, 12, 9, 9, 4],
                   'BBArebounds': [11, 8, 10, 6, 6, 5, 9, 12]})
list(df)

['BBAteam', 'BBApoints', 'BBAassists', 'BBArebounds']

In [90]:
# 針對欄位名稱內的特定字串做置換
df.columns = df.columns.str.replace('BBA', '000')
list(df)

['000team', '000points', '000assists', '000rebounds']

  df.columns = df.columns.str.replace('?', '000')


['000team', '000points', '000assists', '000rebounds']

##### 重新命名column name 方法5

In [101]:
df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

In [96]:
# 指定axis為設定index
df.set_axis(['a', 'b', 'c'], axis='index')

Unnamed: 0,A,B
a,1,4
b,2,5
c,3,6


In [102]:
# 指定axis為設定column name
df.set_axis(['I', 'II'], axis='columns')

Unnamed: 0,I,II
0,1,4
1,2,5
2,3,6


In [103]:
df

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


In [105]:
# 利用inplace變更原資料
df.set_axis(['005', '007'], axis='columns', inplace=True)
df

Unnamed: 0,005,007
0,1,4
1,2,5
2,3,6
