In [1]:
import numpy as np
import pandas as pd

# DataFrameの作成

## リストからDataFrameを作成する

In [2]:
# Sample data: row n holds n, n**2 and n**3.
lst = [
    [1, 1, 1],
    [2, 4, 8],
    [3, 9, 27],
]
# Print one row per line.
print(*lst, sep='\n')

[1, 1, 1]
[2, 4, 8]
[3, 9, 27]


In [3]:
# Build a DataFrame from the nested list, naming both the rows and the columns.
df = pd.DataFrame(
    lst,
    index=['d1', 'd2', 'd3'],
    columns=['linear', 'square', 'cubic'],
)
df

Unnamed: 0,linear,square,cubic
d1,1,1,1
d2,2,4,8
d3,3,9,27


In [4]:
# Same data with column names only; the rows get the default labels 0, 1, 2.
df = pd.DataFrame(
    data=lst,
    columns=['linear', 'square', 'cubic'],
)
df

Unnamed: 0,linear,square,cubic
0,1,1,1
1,2,4,8
2,3,9,27


In [5]:
# Same data with row labels only; the columns get the default labels 0, 1, 2.
df = pd.DataFrame(
    data=lst,
    index=['d1', 'd2', 'd3'],
)
df

Unnamed: 0,0,1,2
d1,1,1,1
d2,2,4,8
d3,3,9,27


## NumPyの配列からDataFrameを作成する

In [6]:
# Convert the nested list into a 2-D NumPy array.
# numpy is imported as np in the notebook's first (imports) cell.
ar = np.array(lst)
ar

array([[ 1,  1,  1],
       [ 2,  4,  8],
       [ 3,  9, 27]])

In [7]:
# A 2-D NumPy array is accepted the same way as the nested list was.
df = pd.DataFrame(
    data=ar,
    columns=['linear', 'square', 'cubic'],
)
df

Unnamed: 0,linear,square,cubic
0,1,1,1
1,2,4,8
2,3,9,27


## 辞書からDataFrameを作成する

In [8]:
# Column-oriented data: each key is a column name, each value that column's entries.
dc = dict(
    linear=[1, 2, 3],
    square=[1, 4, 9],
    cubic=[1, 8, 27],
)
dc

{'linear': [1, 2, 3], 'square': [1, 4, 9], 'cubic': [1, 8, 27]}

In [9]:
# Build the frame from the dict: the keys become the column names and the rows
# get the default labels 0, 1, 2.
df = pd.DataFrame(dc)
df

Unnamed: 0,linear,square,cubic
0,1,1,1
1,2,4,8
2,3,9,27


# DataFrameの要素へのアクセス

In [10]:
# Start from an empty DataFrame (no rows, no columns yet).
df2 = pd.DataFrame()
df2

## atによる要素へのアクセス

In [11]:
# Assign a single cell by (row label, column label). Neither label exists yet,
# so the frame is enlarged to hold the new row 'd1' and column 'linear'.
df2.at['d1', 'linear'] = 1

In [12]:
# Add a second cell in a new row and a new column. The cells that were never
# assigned show up as missing values, and the stored numbers display as floats.
df2.at['d2', 'square'] = 4
df2

Unnamed: 0,linear,square
d1,1.0,
d2,,4.0


In [13]:
# Third cell on the diagonal; everything off the diagonal is still missing.
df2.at['d3', 'cubic'] = 27
df2

Unnamed: 0,linear,square,cubic
d1,1.0,,
d2,,4.0,
d3,,,27.0


### NaNを一括して別の値に置き換える方法

In [14]:
# Rebuild the same sparse frame under a new name (df3): three cells are set
# one by one with .at, and every other cell is left missing for the fillna demo.
df3 = pd.DataFrame()
df3.at['d1', 'linear'] = 1
df3.at['d2', 'square'] = 4
df3.at['d3', 'cubic'] = 27
df3


Unnamed: 0,linear,square,cubic
d1,1.0,,
d2,,4.0,
d3,,,27.0


In [15]:
# Replace every missing value with 0 in one call; the result is bound to a new
# name (df31).
df31 = df3.fillna(0)
df31

Unnamed: 0,linear,square,cubic
d1,1.0,0.0,0.0
d2,0.0,4.0,0.0
d3,0.0,0.0,27.0


## iatによる要素へのアクセス

In [16]:
# Work on an independent copy: a plain `df2 = df` would only create a second name for
# the same object, so the in-place edits made to df2 in later cells (relabelling the
# index, adding a column/row, del) would silently modify df as well.
df2 = df.copy()
df2

Unnamed: 0,linear,square,cubic
0,1,1,1
1,2,4,8
2,3,9,27


In [17]:
# Read one cell by integer position: row position 2, column position 1
# (both zero-based), i.e. the third row of 'square'.
df2.iat[2,1]

9

## locによるアクセス

In [18]:
# Show df2 as it stands before the row labels are changed.
df2

Unnamed: 0,linear,square,cubic
0,1,1,1
1,2,4,8
2,3,9,27


In [19]:
# Replace the default row labels 0, 1, 2 with string labels (modifies df2 in place).
df2.index=['d1', 'd2', 'd3']

In [20]:
# Confirm that the rows are now labelled d1, d2, d3.
df2

Unnamed: 0,linear,square,cubic
d1,1,1,1
d2,2,4,8
d3,3,9,27


In [21]:
# Label-based slicing on both axes. Unlike positional slices, both endpoints
# ('d2' and 'square') are included in the result.
df2.loc['d1':'d2', 'linear':'square']

Unnamed: 0,linear,square
d1,1,1
d2,2,4


In [22]:
# Slice rows by label and keep every column (a bare `:` selects the whole axis).
df2.loc['d1':'d2', :]

Unnamed: 0,linear,square,cubic
d1,1,1,1
d2,2,4,8


In [23]:
# Keep every row and slice the columns by label.
df2.loc[:, 'linear':'square']

Unnamed: 0,linear,square
d1,1,1
d2,2,4
d3,3,9


In [24]:
# Lists of labels pick out specific, not necessarily adjacent, rows and columns.
df2.loc[['d1','d3'], ['linear', 'cubic']]

Unnamed: 0,linear,cubic
d1,1,1
d3,3,27


## ilocによるアクセス

In [25]:
# Position-based slicing: like ordinary Python slices the stop position is
# excluded, so this returns the first two rows and the first two columns.
df2.iloc[0:2, 0:2]

Unnamed: 0,linear,square
d1,1,1
d2,2,4


In [26]:
# Lists of integer positions pick out specific rows and columns.
df2.iloc[[0,2], [0,2]]

Unnamed: 0,linear,cubic
d1,1,1
d3,3,27


# 特定の列(カラム)にアクセスする方法

## [](角括弧)でカラム名を指定する方法

In [27]:
# Show the frame used in the column-access examples below.
df2

Unnamed: 0,linear,square,cubic
d1,1,1,1
d2,2,4,8
d3,3,9,27


In [28]:
# Indexing with a single column name returns that column as a Series
# (row labels kept as its index, column name kept as its Name).
df2['linear']

d1    1
d2    2
d3    3
Name: linear, dtype: int64

In [29]:
# A Series of names keyed by the same row labels that df2 uses.
c = pd.Series({
    'd1': 'Taro',
    'd2': 'Jiro',
    'd3': 'Hanako',
})
c

d1      Taro
d2      Jiro
d3    Hanako
dtype: object

In [30]:
# Assigning to a column name that does not exist yet appends a new column
# (modifies df2 in place); c carries the same row labels d1..d3 as df2.
df2['name'] = c
df2

Unnamed: 0,linear,square,cubic,name
d1,1,1,1,Taro
d2,2,4,8,Jiro
d3,3,9,27,Hanako


### 複数のカラムを取り出す方法

In [31]:
# Passing a list of column names (note the double brackets) returns a DataFrame
# containing just those columns.
df2[['linear', 'cubic']]

Unnamed: 0,linear,cubic
d1,1,1
d2,2,8
d3,3,27


### 数値のカラム名も使用可能

In [32]:
# No column names are given, so the columns are labelled with the integers 0 and 1.
# Note: this rebinds df3 to a new, unrelated frame.
df3 = pd.DataFrame(
    [
        [1, 2],
        [3, 4],
    ]
)
df3

Unnamed: 0,0,1
0,1,2
1,3,4


In [33]:
# Here 0 is a column label, not a row position: this returns the first column.
df3[0]

0    1
1    3
Name: 0, dtype: int64

## ドットを用いる方法

In [34]:
# Small person table used to demonstrate attribute-style (dot) column access.
df4 = pd.DataFrame(
    [
        ['中村', '大阪府', 55],
        ['田中', '東京都', 33],
    ],
    columns=['name', 'address', 'age'],
)
df4

Unnamed: 0,name,address,age
0,中村,大阪府,55
1,田中,東京都,33


In [35]:
# Attribute-style access: returns the 'name' column as a Series.
df4.name

0    中村
1    田中
Name: name, dtype: object

### 新規カラムの追加はドット表記では出来ない(抽出のみ)

In [36]:
# Intentionally left commented out: dot notation is shown here only as the form that
# does NOT add a column (see the heading above); use bracket assignment instead.
# df4.gender = ['男', '女']

In [37]:
# Bracket assignment is the form that actually appends the new column.
df4['gender'] = ['男', '女']
df4

Unnamed: 0,name,address,age,gender
0,中村,大阪府,55,男
1,田中,東京都,33,女


# 行の取り出し

In [38]:
# Show the frame used in the row-access examples below.
df2

Unnamed: 0,linear,square,cubic,name
d1,1,1,1,Taro
d2,2,4,8,Jiro
d3,3,9,27,Hanako


In [39]:
# Select one row by its label; the row comes back as a Series whose index is
# the frame's column names.
df2.loc['d2']

linear       2
square       4
cubic        8
name      Jiro
Name: d2, dtype: object

In [40]:
# Select the same row by integer position (1 = second row).
df2.iloc[1]

linear       2
square       4
cubic        8
name      Jiro
Name: d2, dtype: object

### 結論)Seriesの形で得られ、元のDataFrameのカラムがインデックスになる

# 行の追加

In [41]:
# One new record, keyed by df2's column names.
lin = pd.Series({
    'linear': 4,
    'square': 16,
    'cubic': 64,
    'name': 'Junko',
})
lin

linear        4
square       16
cubic        64
name      Junko
dtype: object

In [42]:
# Assigning to a row label that does not exist yet appends the Series as a new
# row 'd4' (modifies df2 in place).
df2.loc['d4'] = lin
df2

Unnamed: 0,linear,square,cubic,name
d1,1,1,1,Taro
d2,2,4,8,Jiro
d3,3,9,27,Hanako
d4,4,16,64,Junko


# DataFrameの内容をNumPyの配列に変換する方法

In [43]:
# Contents of the frame as a NumPy array (object dtype here, because the columns
# mix numbers and strings).
df2.values  # the older pandas way

array([[1, 1, 1, 'Taro'],
       [2, 4, 8, 'Jiro'],
       [3, 9, 27, 'Hanako'],
       [4, 16, 64, 'Junko']], dtype=object)

In [44]:
# Same conversion through the method call; the resulting array is identical.
df2.to_numpy()  # the newer pandas way (recommended)

array([[1, 1, 1, 'Taro'],
       [2, 4, 8, 'Jiro'],
       [3, 9, 27, 'Hanako'],
       [4, 16, 64, 'Junko']], dtype=object)

## インデックス、カラムを明示的に取り出す方法

In [45]:
# The row labels, as a pandas Index object.
df2.index

Index(['d1', 'd2', 'd3', 'd4'], dtype='object')

In [46]:
# The column labels, also as a pandas Index object.
df2.columns

Index(['linear', 'square', 'cubic', 'name'], dtype='object')

In [47]:
# An Index can be converted to a plain NumPy array as well.
df2.index.to_numpy()

array(['d1', 'd2', 'd3', 'd4'], dtype=object)

## 指定したIndex、Columnsのラベルの位置(番号)を調べる方法

In [48]:
# Zero-based position of the row label 'd3' within the index.
df2.index.get_loc('d3')

2

In [49]:
# Zero-based position of the column label 'square' within the columns.
df2.columns.get_loc('square')

1

# 整列(ソート)

## 値に沿った整列

In [50]:
# Show the frame before sorting.
df2

Unnamed: 0,linear,square,cubic,name
d1,1,1,1,Taro
d2,2,4,8,Jiro
d3,3,9,27,Hanako
d4,4,16,64,Junko


In [51]:
# Sort the rows by the 'name' column (ascending) into a new frame; df2 keeps
# its original row order.
df21 = df2.sort_values(by='name')
df21

Unnamed: 0,linear,square,cubic,name
d3,3,9,27,Hanako
d2,2,4,8,Jiro
d4,4,16,64,Junko
d1,1,1,1,Taro


In [52]:
# Same sort key, in descending order this time (result is only displayed).
df2.sort_values(by='name', ascending=False)

Unnamed: 0,linear,square,cubic,name
d1,1,1,1,Taro
d4,4,16,64,Junko
d2,2,4,8,Jiro
d3,3,9,27,Hanako


In [53]:
# Show the name-sorted frame again; its row labels are out of order.
df21

Unnamed: 0,linear,square,cubic,name
d3,3,9,27,Hanako
d2,2,4,8,Jiro
d4,4,16,64,Junko
d1,1,1,1,Taro


In [54]:
# Sorting by the row labels puts the rows back in d1..d4 order.
df21.sort_index()

Unnamed: 0,linear,square,cubic,name
d1,1,1,1,Taro
d2,2,4,8,Jiro
d3,3,9,27,Hanako
d4,4,16,64,Junko


# 行、列の削除

In [55]:
# Show the frame before rows/columns are dropped.
df2

Unnamed: 0,linear,square,cubic,name
d1,1,1,1,Taro
d2,2,4,8,Jiro
d3,3,9,27,Hanako
d4,4,16,64,Junko


In [56]:
# Drop the row labelled 'd4'. The result is a new frame (df22); df2 itself is
# left as it was.
df22 = df2.drop(index='d4')
df22

Unnamed: 0,linear,square,cubic,name
d1,1,1,1,Taro
d2,2,4,8,Jiro
d3,3,9,27,Hanako


In [57]:
# Drop the 'name' column instead (df22 is rebound to the new result).
df22 = df2.drop(columns='name')
df22

Unnamed: 0,linear,square,cubic
d1,1,1,1
d2,2,4,8
d3,3,9,27
d4,4,16,64


In [58]:
# df2 still has all four rows and the 'name' column after the drops above.
df2

Unnamed: 0,linear,square,cubic,name
d1,1,1,1,Taro
d2,2,4,8,Jiro
d3,3,9,27,Hanako
d4,4,16,64,Junko


In [59]:
# A row and a column can be dropped in the same call.
df22 = df2.drop(index='d4', columns='name')
df22

Unnamed: 0,linear,square,cubic
d1,1,1,1
d2,2,4,8
d3,3,9,27


In [60]:
# Lists of labels drop several rows and several columns at once.
df22 = df2.drop(index=['d1', 'd4'], columns=['linear', 'name'])
df22

Unnamed: 0,square,cubic
d2,4,8
d3,9,27


## カラムの抹消

In [61]:
# Unlike drop() above, del removes the column from df2 itself (in place).
del df2['name']

In [62]:
# Confirm that the 'name' column is gone from df2.
df2

Unnamed: 0,linear,square,cubic
d1,1,1,1
d2,2,4,8
d3,3,9,27
d4,4,16,64


# DataFrameの連結
## 行方向に単純に連結する

In [63]:
# Two frames with the same columns (a, b) but different row labels.
dfA = pd.DataFrame(
    [
        ['1a', '1b'],
        ['2a', '2b'],
    ],
    index=['d1', 'd2'],
    columns=['a', 'b'],
)
dfB = pd.DataFrame(
    [
        ['3a', '3b'],
        ['4a', '4b'],
    ],
    index=['d3', 'd4'],
    columns=['a', 'b'],
)
dfB

Unnamed: 0,a,b
d3,3a,3b
d4,4a,4b


In [64]:
# Render both frames, one after the other, from a single cell.
display(dfA, dfB)

Unnamed: 0,a,b
d1,1a,1b
d2,2a,2b


Unnamed: 0,a,b
d3,3a,3b
d4,4a,4b


In [65]:
# Stack the rows of dfB underneath the rows of dfA.
dfAB = pd.concat([dfA, dfB])
dfAB

Unnamed: 0,a,b
d1,1a,1b
d2,2a,2b
d3,3a,3b
d4,4a,4b


In [66]:
# The rows follow the order of the list: dfB's rows first, then dfA's.
dfBA = pd.concat([dfB, dfA])
dfBA

Unnamed: 0,a,b
d3,3a,3b
d4,4a,4b
d1,1a,1b
d2,2a,2b


In [67]:
# Case where the frames have no columns in common: dfC uses columns c and d.
dfC = pd.DataFrame(
    [
        ['3c', '3d'],
        ['4c', '4d'],
    ],
    index=['d3', 'd4'],
    columns=['c', 'd'],
)
dfC

Unnamed: 0,c,d
d3,3c,3d
d4,4c,4d


In [68]:
# The result has the union of the columns (a, b, c, d); cells that a source
# frame did not provide come out as missing values.
dfAC = pd.concat([dfA, dfC])
dfAC

Unnamed: 0,a,b,c,d
d1,1a,1b,,
d2,2a,2b,,
d3,,,3c,3d
d4,,,4c,4d


## 横方向(カラム方向)の連結

In [69]:
# Three rows (d1..d3) with new columns e and f, for the column-wise concat demo.
dfD = pd.DataFrame(
    [
        ['1e', '1f'],
        ['2e', '2f'],
        ['3e', '3f'],
    ],
    index=['d1', 'd2', 'd3'],
    columns=['e', 'f'],
)
dfD

Unnamed: 0,e,f
d1,1e,1f
d2,2e,2f
d3,3e,3f


In [73]:
# axis=1 places the frames side by side, matching rows by index label.
# dfA has no row 'd3', so its columns a and b are missing on that row.
dfAD = pd.concat([dfA, dfD], axis=1)
dfAD

Unnamed: 0,a,b,e,f
d1,1a,1b,1e,1f
d2,2a,2b,2e,2f
d3,,,3e,3f


### 結論)共通するIndexが対応する

# 条件指定によるデータの抽出

In [74]:
# Person table for the filtering examples, written column by column.
# Note: this rebinds df3 to a new, unrelated frame.
df3 = pd.DataFrame({
    'name': ['taro', 'hanako', 'jiro', 'junko'],
    'gender': ['male', 'female', 'male', 'female'],
    'age': [35, 31, 23, 21],
})
df3

Unnamed: 0,name,gender,age
0,taro,male,35
1,hanako,female,31
2,jiro,male,23
3,junko,female,21


## マスクの用意

In [75]:
# Hand-written boolean mask with one flag per row of df3; only the second row is True.
cond = [False, True, False, False]

In [78]:
# Pass the boolean mask inside the brackets: only rows whose flag is True are kept.
df31 = df3[cond]
df31

Unnamed: 0,name,gender,age
1,hanako,female,31


In [81]:
# The comparison produces the boolean mask directly, so the condition can be
# written inline instead of by hand.
df3[df3['gender'] == 'female']

Unnamed: 0,name,gender,age
1,hanako,female,31
3,junko,female,21


## 複雑な条件式を使用

In [83]:
# Join two conditions with & ("and"); each comparison is wrapped in its own
# parentheses because & binds more tightly than == and <.
df3[(df3['gender'] == 'female') & (df3['age'] < 30)]

Unnamed: 0,name,gender,age
3,junko,female,21


In [84]:
# ~ inverts the mask: keep the rows whose gender is NOT 'male'.
df3[~(df3['gender'] == 'male')]

Unnamed: 0,name,gender,age
1,hanako,female,31
3,junko,female,21


# DataFrameに関する情報の取得

## 要約統計量

In [85]:
# Show the all-numeric frame that is summarised below.
df2

Unnamed: 0,linear,square,cubic
d1,1,1,1
d2,2,4,8
d3,3,9,27
d4,4,16,64


In [87]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df2.describe()

Unnamed: 0,linear,square,cubic
count,4.0,4.0,4.0
mean,2.5,7.5,25.0
std,1.290994,6.557439,28.225284
min,1.0,1.0,1.0
25%,1.75,3.25,6.25
50%,2.5,6.5,17.5
75%,3.25,10.75,36.25
max,4.0,16.0,64.0


## データ構造に関する情報の取得

In [88]:
# Print structural information: index range, per-column non-null counts and
# dtypes, and memory usage.
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, d1 to d4
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   linear  4 non-null      int64
 1   square  4 non-null      int64
 2   cubic   4 non-null      int64
dtypes: int64(3)
memory usage: 300.0+ bytes
