## DataFrame基础

### 1. 创建， 取索引，取值， 转置

#### 1.1 创建 DataFrame

In [1]:
import pandas as pd
import numpy as np

In [3]:
# 创建 DataFrame
data = {'state':['Ohio','Ohio','Ohio','Nevada','Nevada','Nevada'],
       'year':[2000,2001,2002,2001,2002,2003],
       'pop':[1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)

In [4]:
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [5]:
frame.head()
# 对于大数据，使用 head 方法只显示钱前 5 行

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [6]:
# 创建 DataFrame（带索引）
frame2 = pd.DataFrame(data,columns=['year', 'state', 'pop', 'debt'],
                     index=['one', 'two', 'three', 'four', 'five','six'])

In [7]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


#### 1.2 基本取值（取索引，取值）

In [8]:
# 取索引
frame2.index

Index(['one', 'two', 'three', 'four', 'five', 'six'], dtype='object')

In [9]:
# 取索引的值
frame2.index[0]

'one'

In [10]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [11]:
# 取值
frame2.values

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, nan],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, nan],
       [2002, 'Nevada', 2.9, nan],
       [2003, 'Nevada', 3.2, nan]], dtype=object)

In [12]:
# 取列

# 选择表格中的'w'列，使用类字典属性,返回的是Series类型
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [13]:
frame2.year
# 选择表格中的'w'列，使用点属性,返回的是Series类型
# 可以是中文

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [14]:
# 取行
frame2.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [24]:
# 取特定值
frame2[frame2.state == 'Ohio']  # 取 state = Ohio 的值，注意使用“ == ”

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,


#### 1.3 精确取值

In [25]:
# 选取某一列
# data['列名']
# data.loc[:, '列名']
# data.iloc[:, 列数]

# 选取某一行
# data.loc['行名']
# data.iloc[行数, :]

# 选取某一精确值
# data.loc['行名', '列名']
# data,iloc[行数， 列数]

# data[data.column_name == 'value']

In [16]:
obj = pd.Series(np.arange(4.),index=['a','b', 'c', 'd'])

In [17]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [18]:
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [19]:
obj[[1,3]]

b    1.0
d    3.0
dtype: float64

In [20]:
obj[obj<2]

a    0.0
b    1.0
dtype: float64

In [26]:
data[:2]

TypeError: unhashable type: 'slice'

In [27]:
data < 5

TypeError: '<' not supported between instances of 'dict' and 'int'

In [None]:
data [data < 5] = 0

In [None]:
data

In [None]:
# 精确取行列值
data.loc['Colorado',['two', 'three']]

In [None]:
data.iloc[2,[3, 0, 1]]

#### 1.4 转置

In [None]:
pop = {'Nevada':{2001:2.4,2002:2.9},
      'Ohio':{2000:1.5, 2001:1.7, 2002:3.6}}

In [None]:
frame3 = pd.DataFrame(pop)

In [None]:
frame3

In [None]:
frame3.T
# 转置

### 2. 索引对象， 重新索引

#### 2.1 索引对象

In [None]:
obj = pd.Series(range(3),index=['a', 'b', 'c'])

In [None]:
index = obj.index

In [None]:
index

In [None]:
index[1]

In [None]:
index[1:]

In [None]:
obj

#### 2.2 重命名索引

#### 2.3.  重新索引 reindex

In [32]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6],index=['d','b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [72]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

参考资料[pandas中DataFrame重置索引的几种方法](https://www.jb51.net/article/213110.htm)

In [93]:
obj3 = pd.Series(['blue', 'purple', 'yellow'],index=[0, 2, 4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [35]:
obj3.reindex(range(6),method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [36]:
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
                          index=['a', 'c', 'd'],
                          columns=['Ohio', 'Texas', 'California'])

In [37]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [38]:
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [39]:
states = ['Texas', 'Utah', 'California']

In [40]:
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [41]:
data =pd.DataFrame(np.arange(16).reshape((4, 4)),
                  index=['Ohio', 'Colorado', 'Utah', 'New York'],
                  columns=['one', 'two', 'three', 'four'])

In [42]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


#### 2.4 重置索引

In [90]:
ser = pd.Series([1, 2, 3, 4], index = [1, 3, 6, 9])
ser

1    1
3    2
6    3
9    4
dtype: int64

In [94]:
# 直接使用赋值重置索引
ser.index = range(len(ser))
ser

0    1
1    2
2    3
3    4
dtype: int64

### 3. 删除行或者列

In [None]:
data.drop(['Colorado', 'Ohio'])
# 删除行

In [None]:
data.drop('two', axis=1)
# 删除列

In [None]:
data.drop(['two', 'four'],axis='columns')

### 4. 运算

#### 4.1 数值运算

In [None]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5],index=['a', 'c', 'd', 'e'])

In [None]:
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1],
              index=['a', 'c', 'e', 'f', 'g'])

In [None]:
s1

In [None]:
s2

In [None]:
s1+s2

In [None]:
list('abcd')

In [None]:
df1 = pd.DataFrame(np.arange(12.).reshape((3,4)),
                   columns=list('abcd'))


In [None]:
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),
                  columns=list('abcde'))

In [None]:
df1

In [None]:
df2

In [None]:
df1+df2

In [None]:
df1.add(df2, fill_value=0)

In [None]:
1 /df1

In [None]:
df1.rdiv(1)

In [None]:
a=1
b=1.0
a==b

In [None]:
1/df1==df1.rdiv(1)

In [None]:
df1.reindex(columns=df2.columns, fill_value=0)

In [None]:
df3=pd.DataFrame(np.arange(16.).reshape(4,4),columns=list('abcd'))

In [None]:
df3

In [None]:
df3.pow(2)

In [None]:
df3.rpow(2)

#### 4.2 DataFrame 运算

In [97]:
arr = np.arange(12.).reshape((3, 4))
arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [98]:
arr[0]

array([0., 1., 2., 3.])

In [99]:
arr-arr[0]

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

In [None]:
frame = pd.DataFrame(np.arange(12.).reshape((4,3)),
                     columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregen'])

In [None]:
frame

In [None]:
series = frame.iloc[0]

In [None]:
series

In [None]:
frame-series
# 索引相同,每个都减，广播下去

In [None]:
series2 = pd.Series(range(3),index=list('bef'))

In [28]:
series2

NameError: name 'series2' is not defined

In [None]:
frame + series2

In [None]:
series3 = frame['d']

In [None]:
series3

In [None]:
frame

In [None]:
frame.sub(series3,axis='index')
# 每列都减

### 5.排序

sort.index()根据索引进行排序

sort.value()根据值进行排序

sort.value(by = 'x') 根据 x 列的值进行排序

sort.index/value(ascending = False) 从大到小排序，默认为从小到大

无论采取哪种方法，缺失值都会排在最后

In [None]:
frame = pd.DataFrame({'b':[4, 7, -3, 2],'a':[0, 1, 0, 1]})

In [None]:
frame

In [None]:
frame.sort_values(by='b')

In [None]:
ssort = pd.Series(range(5), index = [list('badec')])
ssort

In [None]:
ssort.sort_index()

In [None]:
dfdata = {'Name':['Zhang San', 'Li Si', 'Wang Laowu', 'Zhao Liu', 'Qian Qi', 'Sun Ba'],
               'Subject':['Literature', 'History', 'English', 'Maths','Physics', 'Chemics'],
               'Score':[98, 97, 84, 70, 93, 83]}
scoresheet = pd.DataFrame(dfdata)
scoresheet

In [None]:
scoresheet.columns
# 查看列索引

使用嵌套字典创建dataframe，外部键——列，内部键——行

In [2]:
dfdata2 = {'Name':{101:'Zhang San', 102:'Li Si', 103:'Wang Laowu', 104:'Zhao Liu', 105:'Qian Qi', 106:'Sun Ba'},
               'Subject':{101:'Literature', 102:'History', 103:'English', 104:'Maths',105:'Physics', 106:'Chemics'},
               'Score':{101:98, 102:97, 103:84, 104:70, 105:93, 106:83}}
scoresheet2 = pd.DataFrame(dfdata2)
scoresheet2

Unnamed: 0,Name,Subject,Score
101,Zhang San,Literature,98
102,Li Si,History,97
103,Wang Laowu,English,84
104,Zhao Liu,Maths,70
105,Qian Qi,Physics,93
106,Sun Ba,Chemics,83


In [3]:
scoresheet2.Score

101    98
102    97
103    84
104    70
105    93
106    83
Name: Score, dtype: int64

In [4]:
# 按 index 排列
scoresheet2.index = [102, 101, 106, 104, 103, 105]
scoresheet2

Unnamed: 0,Name,Subject,Score
102,Zhang San,Literature,98
101,Li Si,History,97
106,Wang Laowu,English,84
104,Zhao Liu,Maths,70
103,Qian Qi,Physics,93
105,Sun Ba,Chemics,83


In [5]:
scoresheet2.sort_index()

Unnamed: 0,Name,Subject,Score
101,Li Si,History,97
102,Zhang San,Literature,98
103,Qian Qi,Physics,93
104,Zhao Liu,Maths,70
105,Sun Ba,Chemics,83
106,Wang Laowu,English,84


In [6]:
scoresheet2.sort_index(axis = 0, ascending=False)
# ascending = False  从大到小拍排

Unnamed: 0,Name,Subject,Score
106,Wang Laowu,English,84
105,Sun Ba,Chemics,83
104,Zhao Liu,Maths,70
103,Qian Qi,Physics,93
102,Zhang San,Literature,98
101,Li Si,History,97


In [7]:
按值排列
scoresheet2.sort_values(by = 'Score', ascending=False)
# 按指定列排

Unnamed: 0,Name,Subject,Score
102,Zhang San,Literature,98
101,Li Si,History,97
103,Qian Qi,Physics,93
106,Wang Laowu,English,84
105,Sun Ba,Chemics,83
104,Zhao Liu,Maths,70


### 6. 描述和统计
汇总、计算、描述性统计

In [8]:
df = pd.DataFrame([[1.4,np.nan],[7.1, -4.5],
                 [np.nan, np.nan],[0.75, -1.3]],
                 index = [list('abcd')],
                 columns = ['one', 'two'])

In [9]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [10]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [11]:
df.mean()

one    3.083333
two   -2.900000
dtype: float64

In [None]:
df.sum(axis= 'columns')

In [None]:
df.mean(axis=0)

In [None]:
df.mean(axis='columns')

In [None]:
df.mean(axis='columns',skipna=False)

In [None]:
df.idxmax()

In [12]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


- mean: 平均数
- std : 标准差
- 50% : 中位数

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 4 entries, ('a',) to ('d',)
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   one     3 non-null      float64
 1   two     2 non-null      float64
dtypes: float64(2)
memory usage: 288.0+ bytes


In [14]:
df.info

<bound method DataFrame.info of     one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3>

In [None]:
obj = pd.Series(['a', 'a', 'b', 'c'] * 4)

In [None]:
obj

### 7. 函数和映射

In [None]:
frame = pd.DataFrame(np.random.randn(4,3),columns = list('bde'),
                    index=['Utah', 'Ohio', 'Texas', 'Oregen'])

In [None]:
frame

In [None]:
np.abs(frame)

In [None]:
f = lambda x:x.max() - x.min()

In [None]:
frame.apply(f)
# 行

In [None]:
frame.apply(f, axis='columns')

In [None]:
frame.apply(f, axis='index')

In [None]:
def f(x):
    return pd.Series([x.min(),x.max()],index=['min','max'])
frame.apply(f)

In [None]:
format = lambda x: '%.2f' % x
frame.applymap(format)

In [None]:
frame['e'].map(format)

### 8.相关和协方差

In [None]:
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

In [None]:
uniques = obj.unique()

In [None]:
uniques

In [None]:
obj.value_counts()
# 统计频率

In [None]:
obj

In [None]:
mask = obj.isin(['b', 'c'])
# 判断是否在

In [None]:
mask

In [None]:
obj[mask]