# Pandas

Pandas 具有兩個主要的資料結構：

- Series
- DataFrame

## Series 基礎
### 索引默認為數字，可自行定義


In [1]:
import numpy as np
import pandas as pd

In [2]:
# Build a Series from a plain list; with no index given, the labels default to 0..3.
quarter_steps = [0.25, 0.5, 0.75, 1.0]
data = pd.Series(quarter_steps)
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [6]:
# With the default integer index, the label 0 selects the first element.
data = pd.Series(data=[0.25, 0.5, 0.75, 1.0])
data[0]

0.25

In [7]:
# Same values, but labelled with custom string index entries instead of 0..3.
data = pd.Series(
    data=[0.25, 0.5, 0.75, 1.0],
    index=['a', 'b', 'c', 'd'],
)
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [8]:
# Look up a value by its custom label ('a'..'d' were assigned as the index in the previous cell).
data['a']

0.25

In [15]:
# A Series built from a list gets the default integer index ...
list1 = [0.25, 0.5, 0.75, 1.0]
data1 = pd.Series(list1)

# ... while a Series built from a dict uses the dict keys as its index labels.
dic1 = {'high': 180, 'Age': 18}
data2 = pd.Series(dic1)

In [16]:
# Show the type of the list-backed Series, then display its contents.
print(type(data1))
data1

<class 'pandas.core.series.Series'>


0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [17]:
# Show the type of the dict-backed Series, then display its contents.
print(type(data2))
data2

<class 'pandas.core.series.Series'>


high    180
Age      18
dtype: int64

In [20]:
# Slicing by label returns another Series, and the end label ('Age') is included in the result.
print(type(data2['high':'Age']))
data2['high':'Age']                 # supports [:]-style slicing, which a plain dict does not

<class 'pandas.core.series.Series'>


high    180
Age      18
dtype: int64

In [21]:
# A scalar value is broadcast to every label in the index.
labels = [100, 200, 300]
data = pd.Series(5, index=labels)
data

100    5
200    5
300    5
dtype: int64

In [23]:
# Built from a dict, the keys become the index labels; the index argument
# (used in the following cells) can then select or reorder them.
key_to_value = {1: 1, 2: 2, 5: 5, 4: 4}
data = pd.Series(key_to_value)
data

1    1
2    2
5    5
4    4
dtype: int64

In [24]:
# An explicit index reorders the dict entries to follow the given label order.
data = pd.Series(
    {1: 1, 2: 2, 5: 5, 4: 4},
    index=[2, 5, 1, 4],
)
data

2    2
5    5
1    1
4    4
dtype: int64

In [25]:
# An index that lists only some keys keeps just those entries.
wanted_labels = [2, 5]
data = pd.Series({1: 1, 2: 2, 5: 5, 4: 4}, index=wanted_labels)
data

2    2
5    5
dtype: int64

In [27]:
# A label that is not a key of the dict (3) is kept in the index and filled with NaN,
# which also turns the values into floats.
data = pd.Series(
    data={1: 1, 2: 2, 5: 5, 4: 4},
    index=[2, 3],
)
data

2    2.0
3    NaN
dtype: float64

In [None]:
# 先依 index 決定標籤，再從字典填入對應的值；字典中沒有的標籤則填入 NaN

### Attributes（values、index）

In [31]:
# .values exposes the underlying data as a NumPy array.
print(type(data.values))
data.values

<class 'numpy.ndarray'>


array([ 2., nan])

In [33]:
# .index exposes the labels as a pandas Index object.
print(type(data.index))
data.index 

<class 'pandas.core.indexes.numeric.Int64Index'>


Int64Index([2, 3], dtype='int64')

## DataFrame
可視為多個 Series 的聚集

In [35]:
# Two dicts keyed by the same state names; each one is turned into a Series
# whose index is the state name.
area_dict = {
    'California': 12345,
    'Florida': 44444,
    'Illinois': 65432,
    'New_York': 191919,
}
area = pd.Series(area_dict)

popu_dict = {
    'California': 3333,
    'Florida': 5555,
    'Illinois': 2222,
    'New_York': 4444,
}
popu = pd.Series(popu_dict)

In [40]:
# Combine Series into a DataFrame: each dict key becomes a column name and
# the rows are aligned on the Series' shared index.
state = pd.DataFrame(data={'AreA': area, 'PopU': popu})
state

Unnamed: 0,AreA,PopU
California,12345,3333
Florida,44444,5555
Illinois,65432,2222
New_York,191919,4444


In [43]:
# Build the same table directly from the plain dicts (a dict of dicts):
# outer keys become columns, inner keys become row labels.
state2 = pd.DataFrame(data={'AreA': area_dict, 'PopU': popu_dict})
state2

Unnamed: 0,AreA,PopU
California,12345,3333
Florida,44444,5555
Illinois,65432,2222
New_York,191919,4444


In [51]:
# Build a one-row DataFrame from a single dict: the dict keys become the
# columns and the index argument supplies the row label.
state3 = pd.DataFrame(data=area_dict, index=['area'])
state3

Unnamed: 0,California,Florida,Illinois,New_York
area,12345,44444,65432,191919


In [60]:
# Build a DataFrame from a list of dicts: each dict is one row (keys become
# columns) and the index argument labels the rows in order.
row_dicts = [area_dict, popu_dict]
state3 = pd.DataFrame(row_dicts, index=['area', 'popu'])
state3

Unnamed: 0,California,Florida,Illinois,New_York
area,12345,44444,65432,191919
popu,3333,5555,2222,4444


In [62]:
# Keys missing from one of the row dicts are filled with NaN automatically.
popu_dict2 = {'California': 3333, 'Florida': 5555}
state3 = pd.DataFrame(
    data=[area_dict, popu_dict2],
    index=['area', 'popu'],
)
state3

Unnamed: 0,California,Florida,Illinois,New_York
area,12345,44444,65432.0,191919.0
popu,3333,5555,,


In [57]:
# Build a single-column DataFrame from one Series: the Series index supplies
# the row labels and columns= names the column.
state4 = pd.DataFrame(data=area, columns=['area'])
state4

Unnamed: 0,area
California,12345
Florida,44444
Illinois,65432
New_York,191919


In [66]:
# Build a DataFrame from a 2-D NumPy array (3 rows x 2 columns of random values);
# with no names given, rows and columns get default integer labels.
pd.DataFrame(np.random.rand(3,2))

Unnamed: 0,0,1
0,0.146333,0.903642
1,0.645514,0.739608
2,0.329751,0.727149


In [67]:
# The columns= and index= arguments attach names to the columns and the rows.
pd.DataFrame(np.random.rand(3,2),
            columns=["AaA",'BbB'],
            index=['a','b','c'])

Unnamed: 0,AaA,BbB
a,0.774564,0.927512
b,0.540727,0.066432
c,0.012142,0.12952


In [69]:
# Structured NumPy array: three zero-filled records, each with an 8-byte
# integer field 'A' and an 8-byte float field 'b'.
record_dtype = [('A', 'i8'), ('b', 'f8')]
A = np.zeros(3, dtype=record_dtype)
A

array([(0, 0.), (0, 0.), (0, 0.)], dtype=[('A', '<i8'), ('b', '<f8')])

In [70]:
pd.DataFrame(A)       #  (the structured array's field names become the column names)

Unnamed: 0,A,b
0,0,0.0
1,0,0.0
2,0,0.0


### 檢視index的方法

In [52]:
# Row labels of the DataFrame, returned as an Index object.
state.index

Index(['California', 'Florida', 'Illinois', 'New_York'], dtype='object')

In [53]:
# Column labels of the DataFrame, also returned as an Index object.
state.columns

Index(['AreA', 'PopU'], dtype='object')

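### Index
可視為有序且不可修改的陣列（immutable array）<br>
具有 size、shape、ndim、dtype 等屬性<br>
也支援集合運算：intersection()、union()、symmetric_difference()、difference()<br>
（舊版 pandas 可用 &、|、^ 運算子進行集合運算；pandas 2.0 起這些運算子改為逐元素的邏輯運算，請改用上述方法）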

In [87]:
# An Index may hold mixed types; strings plus an int give dtype 'object'.
mixed_labels = ['California', 'Florida', 'Illinois', 'New_York', 2]
ind1 = pd.Index(mixed_labels)
ind1

Index(['California', 'Florida', 'Illinois', 'New_York', 2], dtype='object')

In [88]:
# An all-integer Index; note that duplicate labels (4 appears twice) are allowed.
int_labels = [2, 3, 4, 4, 5, 6]
ind2 = pd.Index(int_labels)
ind2

Int64Index([2, 3, 4, 4, 5, 6], dtype='int64')

In [89]:
# An Index exposes the same descriptive attributes as a NumPy array;
# print each one next to its label.
index_facts = (
    ('size = ', ind2.size),
    ('shape = ', ind2.shape),
    ('ndim = ', ind2.ndim),
    ('dtype = ', ind2.dtype),
)
for label, value in index_facts:
    print(label, value)

size =  6
shape =  (6,)
ndim =  1
dtype =  int64


In [90]:
# Set intersection of the two indexes. The `&` operator is not used here:
# it was deprecated as a set operation on Index and, from pandas 2.0 on,
# performs an element-wise logical AND instead, so call the method.
ind1.intersection(ind2)

Index([2], dtype='object')

In [91]:
# Labels present in both ind1 and ind2.
ind1.intersection(ind2)

Index([2], dtype='object')

In [92]:
# Set union of the two indexes. The `|` operator is not used here:
# it was deprecated as a set operation on Index and, from pandas 2.0 on,
# performs an element-wise logical OR instead, so call the method.
ind1.union(ind2)

  return self.union(other)


Index(['California', 'Florida', 'Illinois', 'New_York', 2, 3, 4, 4, 5, 6], dtype='object')

In [94]:
# Labels present in either ind1 or ind2.
ind1.union(ind2)

  """Entry point for launching an IPython kernel.


Index(['California', 'Florida', 'Illinois', 'New_York', 2, 3, 4, 4, 5, 6], dtype='object')

In [93]:
# Symmetric difference: labels found in exactly one of the two indexes.
# The `^` operator is not used here: it was deprecated as a set operation
# on Index and, from pandas 2.0 on, performs an element-wise logical XOR
# instead, so call the method.
ind1.symmetric_difference(ind2)

Index([3, 4, 5, 6, 'California', 'Florida', 'Illinois', 'New_York'], dtype='object')

In [96]:
ind1.difference(ind2)  # labels in ind1 but not in ind2; there is no `ind1 - ind2` set-difference operator

Index(['California', 'Florida', 'Illinois', 'New_York'], dtype='object')