# 对官方十分钟入门pandas的学习
[10 Minutes to pandas](https://pandas.pydata.org/docs/user_guide/10min.html)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Show the installed pandas version
pd.__version__

'0.22.0'

In [3]:
# Series built from a list; np.nan marks a missing value
s = pd.Series([1,3,5,np.nan,6,8])

In [4]:
# Display the Series
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [5]:
# Six consecutive daily timestamps starting 2018-02-22; used as the row index of df
dates = pd.date_range('20180222',periods=6)
dates

DatetimeIndex(['2018-02-22', '2018-02-23', '2018-02-24', '2018-02-25',
               '2018-02-26', '2018-02-27'],
              dtype='datetime64[ns]', freq='D')

In [6]:
# DataFrame of random standard-normal values, indexed by dates, with columns A-D
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2018-02-22,-0.829921,-1.530877,0.591746,1.150491
2018-02-23,-0.52842,-0.198417,0.729501,-0.393763
2018-02-24,0.374025,2.083839,-0.866058,-0.366813
2018-02-25,1.085715,0.216089,-1.908359,-1.161613
2018-02-26,-0.579622,-0.590239,0.178399,-0.037379
2018-02-27,-0.777208,2.810331,0.943975,-0.02558


In [7]:
# Selecting values from a DataFrame
## Selecting columns
### Select a column by label when the DataFrame has named columns
df['A']

2018-02-22   -0.829921
2018-02-23   -0.528420
2018-02-24    0.374025
2018-02-25    1.085715
2018-02-26   -0.579622
2018-02-27   -0.777208
Freq: D, Name: A, dtype: float64

In [8]:
### Select the first column by position
df.iloc[:,0]

2018-02-22   -0.829921
2018-02-23   -0.528420
2018-02-24    0.374025
2018-02-25    1.085715
2018-02-26   -0.579622
2018-02-27   -0.777208
Freq: D, Name: A, dtype: float64

In [9]:
### Select the first through third columns (positions 0-2; the slice end is exclusive)
df.iloc[:,0:3]

Unnamed: 0,A,B,C
2018-02-22,-0.829921,-1.530877,0.591746
2018-02-23,-0.52842,-0.198417,0.729501
2018-02-24,0.374025,2.083839,-0.866058
2018-02-25,1.085715,0.216089,-1.908359
2018-02-26,-0.579622,-0.590239,0.178399
2018-02-27,-0.777208,2.810331,0.943975


In [10]:
### Select the first and fourth columns by passing a list of positions
df.iloc[:,[0,3]]

Unnamed: 0,A,D
2018-02-22,-0.829921,1.150491
2018-02-23,-0.52842,-0.393763
2018-02-24,0.374025,-0.366813
2018-02-25,1.085715,-1.161613
2018-02-26,-0.579622,-0.037379
2018-02-27,-0.777208,-0.02558


In [11]:
## Selecting rows
### Select the third row by position (returned as a Series)
df.iloc[2]

A    0.374025
B    2.083839
C   -0.866058
D   -0.366813
Name: 2018-02-24 00:00:00, dtype: float64

In [12]:
### Select the first through third rows (positions 0-2; the slice end is exclusive)
df.iloc[0:3,:]

Unnamed: 0,A,B,C,D
2018-02-22,-0.829921,-1.530877,0.591746,1.150491
2018-02-23,-0.52842,-0.198417,0.729501,-0.393763
2018-02-24,0.374025,2.083839,-0.866058,-0.366813


In [13]:
### Select the first and third rows by passing a list of positions
df.iloc[[0,2],:]

Unnamed: 0,A,B,C,D
2018-02-22,-0.829921,-1.530877,0.591746,1.150491
2018-02-24,0.374025,2.083839,-0.866058,-0.366813


In [14]:
## Selecting a single value
### Get the scalar at the second row, second column
df.iat[1,1]

-0.19841725603170124

In [15]:
## Filtering
### Numeric condition: keep rows where column A is greater than -0.5 and less than 1
df[(df.A > -0.5) & (df.A < 1)]

Unnamed: 0,A,B,C,D
2018-02-24,0.374025,2.083839,-0.866058,-0.366813


In [16]:
### Filtering on string values
### Using isin
### Keep the rows whose E value is 'two' or 'four'
# assign returns a new frame with column E added, leaving df untouched
df2 = df.assign(E=['One', 'one','two','three','four','three'])
is_two_or_four = df2['E'].isin(['two','four'])
df2[is_two_or_four]

Unnamed: 0,A,B,C,D,E
2018-02-24,0.374025,2.083839,-0.866058,-0.366813,two
2018-02-26,-0.579622,-0.590239,0.178399,-0.037379,four


## 缺失值的处理

In [17]:
# Reindex df to the first four dates and add a new, empty column E
df1 = df.reindex(index=dates[0:4],columns=list(df.columns) + ['E'])
# Fill E for the first three rows only, so the fourth row keeps a missing value
df1.iloc[0:3,4] = 1
df1

Unnamed: 0,A,B,C,D,E
2018-02-22,-0.829921,-1.530877,0.591746,1.150491,1.0
2018-02-23,-0.52842,-0.198417,0.729501,-0.393763,1.0
2018-02-24,0.374025,2.083839,-0.866058,-0.366813,1.0
2018-02-25,1.085715,0.216089,-1.908359,-1.161613,


In [18]:
### Drop every row that contains at least one missing value
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,E
2018-02-22,-0.829921,-1.530877,0.591746,1.150491,1.0
2018-02-23,-0.52842,-0.198417,0.729501,-0.393763,1.0
2018-02-24,0.374025,2.083839,-0.866058,-0.366813,1.0


In [19]:
### Fill missing values with a constant
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,E
2018-02-22,-0.829921,-1.530877,0.591746,1.150491,1.0
2018-02-23,-0.52842,-0.198417,0.729501,-0.393763,1.0
2018-02-24,0.374025,2.083839,-0.866058,-0.366813,1.0
2018-02-25,1.085715,0.216089,-1.908359,-1.161613,5.0


In [20]:
### Get the boolean mask that is True where a value is NaN
pd.isna(df1)

Unnamed: 0,A,B,C,D,E
2018-02-22,False,False,False,False,False
2018-02-23,False,False,False,False,False
2018-02-24,False,False,False,False,False
2018-02-25,False,False,False,False,True


In [37]:
### Mean of each column
df1.mean()

A    0.025350
B    0.142658
C   -0.363293
D   -0.192925
E    1.000000
dtype: float64

In [38]:
### Mean of each row (axis 1)
df1.mean(1)

2018-02-22    0.076288
2018-02-23    0.121780
2018-02-24    0.444999
2018-02-25   -0.442042
Freq: D, dtype: float64

In [41]:
### Apply a function to each column: here the range (max minus min)
df1.apply(lambda x:x.max() - x.min())


A    1.915636
B    3.614716
C    2.637860
D    2.312104
E    0.000000
dtype: float64

In [42]:
# Series of strings for the value-count example (rebinds the earlier name s)
s = pd.Series(['a','a','a','b','c','c'])

In [43]:
### Count how many times each distinct value occurs
s.value_counts()

a    3
c    2
b    1
dtype: int64

In [44]:
# 10x4 frame of random standard-normal values with default integer labels
df3 = pd.DataFrame(np.random.randn(10, 4))
df3

Unnamed: 0,0,1,2,3
0,0.90481,1.567343,-0.204436,0.645745
1,-0.646392,0.191708,0.34479,-0.282842
2,-0.030094,0.641869,-1.407287,-0.675787
3,-0.469255,-2.916481,-0.492129,0.953452
4,1.094815,0.333453,0.152898,1.38462
5,0.164347,-0.602686,-0.484619,0.545653
6,1.15196,0.035896,0.519152,0.075863
7,-0.182312,-1.404318,1.298998,-0.02168
8,0.515814,0.325971,0.480431,-0.507223
9,2.390475,-0.336878,-0.24565,-0.29239


In [45]:
### Split the DataFrame into three consecutive row-wise pieces
a, b, c = df3[:3], df3[3:6], df3[6:]
a

Unnamed: 0,0,1,2,3
0,0.90481,1.567343,-0.204436,0.645745
1,-0.646392,0.191708,0.34479,-0.282842
2,-0.030094,0.641869,-1.407287,-0.675787


In [48]:
### Concatenate the pieces back into a single DataFrame
pd.concat([a,b,c])

Unnamed: 0,0,1,2,3
0,0.90481,1.567343,-0.204436,0.645745
1,-0.646392,0.191708,0.34479,-0.282842
2,-0.030094,0.641869,-1.407287,-0.675787
3,-0.469255,-2.916481,-0.492129,0.953452
4,1.094815,0.333453,0.152898,1.38462
5,0.164347,-0.602686,-0.484619,0.545653
6,1.15196,0.035896,0.519152,0.075863
7,-0.182312,-1.404318,1.298998,-0.02168
8,0.515814,0.325971,0.480431,-0.507223
9,2.390475,-0.336878,-0.24565,-0.29239


In [55]:
# 8x4 frame of random standard-normal values with named columns A-D
df4 = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])
# Trailing expression so the cell displays the frame it just built
df4


Unnamed: 0,A,B,C,D
0,-1.419114,0.492097,0.649176,-0.516502
1,-1.281209,-2.793202,-1.407697,0.080658
2,-0.607695,0.452532,-0.399474,0.063068
3,-0.066549,0.253413,-1.249015,1.054669
4,0.650785,-0.576049,1.087394,-1.349712
5,0.544507,0.243758,0.347719,0.103431
6,-0.335328,0.165773,1.207649,-0.32925
7,0.524231,-0.268242,-1.61639,-1.906382


In [57]:
### Append one record to the end of the DataFrame
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the row
# (a Series) is turned into a one-row frame and concatenated instead;
# ignore_index=True renumbers the rows 0..n in the result.
s = df4.iloc[3]
pd.concat([df4, s.to_frame().T], ignore_index=True)

Unnamed: 0,A,B,C,D
0,-1.419114,0.492097,0.649176,-0.516502
1,-1.281209,-2.793202,-1.407697,0.080658
2,-0.607695,0.452532,-0.399474,0.063068
3,-0.066549,0.253413,-1.249015,1.054669
4,0.650785,-0.576049,1.087394,-1.349712
5,0.544507,0.243758,0.347719,0.103431
6,-0.335328,0.165773,1.207649,-0.32925
7,0.524231,-0.268242,-1.61639,-1.906382
8,-0.066549,0.253413,-1.249015,1.054669


In [74]:
### Changing the data type of a DataFrame column
df5 = pd.DataFrame({'ID':['001','002','003','004'],'raw_grade':['a','a','b','c']})

In [75]:
# Store raw_grade converted to the categorical dtype in a new column, then display it
df5['grade'] = df5['raw_grade'].astype("category")
df5['grade']

0    a
1    a
2    b
3    c
Name: grade, dtype: category
Categories (3, object): [a, b, c]

## Reading data

In [102]:
# Read the file with read_csv's default settings
csv_df = pd.read_csv('./shan.csv')

In [77]:
csv_df.head()
# By default the first line of the file is used as the column names

Unnamed: 0,Chr9,G,1006,CHH,CT,0.00,0,1
0,Chr9,G,1008,CHH,CA,0.0,0,1
1,Chr9,G,1011,CHH,CA,0.0,0,1
2,Chr9,G,1012,CHH,CC,0.0,0,1
3,Chr9,G,1016,CHH,CA,0.0,0,1
4,Chr9,G,1031,CHH,CT,0.0,0,1


In [103]:
csv_df = pd.read_csv('./shan.csv',header=None)
csv_df.head()
# If the file has no header row, header must be set to None here;
# the first line is then kept as data and the columns are labelled with integers


Unnamed: 0,0,1,2,3,4,5,6,7
0,Chr9,G,1006,CHH,CT,0.0,0,1
1,Chr9,G,1008,CHH,CA,0.0,0,1
2,Chr9,G,1011,CHH,CA,0.0,0,1
3,Chr9,G,1012,CHH,CC,0.0,0,1
4,Chr9,G,1016,CHH,CA,0.0,0,1


In [110]:
# To set the column names, pass them through the names parameter
columns = ['Chrom','strand','pos','context','detail','lev','meth','total']

In [111]:
# Read the tab-separated file with no header row, labelling the columns from `columns`
csv_df = pd.read_table('./shan.txt',header=None,sep='\t',names = columns)
csv_df.head()

Unnamed: 0,Chrom,strand,pos,context,detail,lev,meth,total
0,Chr9,G,1006,CHH,CT,0.0,0,1
1,Chr9,G,1008,CHH,CA,0.0,0,1
2,Chr9,G,1011,CHH,CA,0.0,0,1
3,Chr9,G,1012,CHH,CC,0.0,0,1
4,Chr9,G,1016,CHH,CA,0.0,0,1


# numpy的学习

对[官方Quickstart tutorial](https://numpy.org/doc/stable/user/quickstart.html)的学习

## 矩阵的构建

In [114]:
import numpy as np
# Build a 1-D array from a single list argument
a = np.array([2,3,4])
# a = np.array(2,3,4)  # wrong: the values must be passed as one sequence
a

array([2, 3, 4])

In [115]:
# 2-D array from a list of tuples
b = np.array([(1,2,3),(4,5,6)])
b

array([[1, 2, 3],
       [4, 5, 6]])

In [116]:
# 2-D array from a list of lists
c = np.array([[1,2,3],[4,5,6]])
c

array([[1, 2, 3],
       [4, 5, 6]])

In [117]:
# Mixing a list and a tuple as the rows also works
d = np.array([[1,2,3],(4,5,6)])
d

array([[1, 2, 3],
       [4, 5, 6]])

In [118]:
np.zeros([3,4])
# np.zeros((3,4)) gives the same result

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [119]:
# Array of ones with shape (2, 3, 4)
np.ones([2,3,4])

array([[[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]],

       [[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]]])

In [120]:
# Values from 10 up to (but not including) 30 in steps of 5
np.arange(10,30,5)

array([10, 15, 20, 25])

In [121]:
# The integers 0-11 arranged as a 3x4 array
np.arange(12).reshape(3,4)

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [122]:
np.linspace(0,2,9) # 9 evenly spaced numbers from 0 to 2

array([0.  , 0.25, 0.5 , 0.75, 1.  , 1.25, 1.5 , 1.75, 2.  ])

## 一些基本操作

In [123]:
a = np.array([20,30,40,50])
# Comparing an array with a scalar is elementwise and yields a boolean array
b = a < 35
b

array([ True,  True, False, False])

In [124]:
### Trigonometric functions are applied elementwise
10*np.sin(a)

array([ 9.12945251, -9.88031624,  7.4511316 , -2.62374854])

In [125]:
### Distinguish elementwise multiplication from the matrix (dot) product
A = np.array([[1,1],[0,1]])
B = np.array([[2,0],[3,4]])


In [126]:
A*B # elementwise product: corresponding entries are multiplied

array([[2, 0],
       [0, 4]])

In [127]:
A.dot(B) # matrix product

array([[5, 4],
       [3, 4]])

In [128]:
# Function form of the matrix product
np.dot(A,B)

array([[5, 4],
       [3, 4]])

In [130]:
# 2x3 array of random floats used by the reduction examples
e = np.random.random((2,3))
e

array([[0.04125501, 0.35631681, 0.79138835],
       [0.76352603, 0.25745864, 0.18552123]])

In [132]:
# Some computations on arrays
# Sum of all elements
e.sum()

2.3954660653985784

In [133]:
## Smallest element of the array
e.min()

0.04125501115669161

In [134]:
## Largest element of the array
e.max()

0.7913883458354225

In [136]:
## Sum of each row
e.sum(axis=1)

array([1.18896017, 1.2065059 ])

In [137]:
## Sum of each column
e.sum(axis=0)

array([0.80478104, 0.61377545, 0.97690957])

In [138]:
## Cumulative sum along each row
e.cumsum(axis=1)

array([[0.04125501, 0.39757182, 1.18896017],
       [0.76352603, 1.02098467, 1.2065059 ]])

In [140]:
## Some universal functions
# Note: this rebinds B, replacing the 2x2 matrix defined earlier
B = np.arange(3)
np.exp(B)

array([1.        , 2.71828183, 7.3890561 ])

In [141]:
# Elementwise square root
np.sqrt(B)

array([0.        , 1.        , 1.41421356])

In [145]:
## Iterate over every element of a multi-dimensional array
def f(x,y):
    """Return x*10 + y; np.fromfunction calls this with index arrays."""
    return x*10 + y
b = np.fromfunction(f,(5,4),dtype=int)

# b.flat yields every element of b one by one; unpacking it into a single print
# call emits the same space-separated sequence as an explicit loop would
print(*b.flat,end=' ')

0 1 2 3 10 11 12 13 20 21 22 23 30 31 32 33 40 41 42 43 

In [147]:
# Reshaping multi-dimensional arrays
## Turning a multi-dimensional array into a 1-D array

# 3x4 array of random whole-number floats
a = np.floor(10*np.random.random((3,4)))
a

array([[6., 9., 4., 6.],
       [3., 9., 7., 7.],
       [4., 5., 4., 3.]])

In [146]:
# Flatten the array to one dimension
a.ravel()

array([6., 9., 4., 6., 3., 9., 7., 7., 4., 5., 4., 3.])

In [148]:
# Return the same data arranged as 6 rows by 2 columns
a.reshape(6,2)

array([[6., 9.],
       [4., 6.],
       [3., 9.],
       [7., 7.],
       [4., 5.],
       [4., 3.]])

In [149]:
# Transpose: rows become columns
a.T

array([[6., 3., 4.],
       [9., 9., 5.],
       [4., 7., 4.],
       [6., 7., 3.]])

In [152]:
## reshape returns the reshaped result, whereas resize changes the shape of the array itself
a.resize((2,6))

In [153]:
# Display a after the in-place resize
a

array([[6., 9., 4., 6., 3., 9.],
       [7., 7., 4., 5., 4., 3.]])

In [154]:
## If one dimension is given as -1, it is computed automatically
a.reshape(3,-1)

array([[6., 9., 4., 6.],
       [3., 9., 7., 7.],
       [4., 5., 4., 3.]])

In [None]:
## Stacking arrays
# Two 2x2 arrays of random whole-number floats to be stacked together
a = np.floor(10*np.random.random((2,2)))
# np.floor needs an input array; the bare np.floor() call raised a TypeError,
# so b is now built the same way as a
b = np.floor(10*np.random.random((2,2)))