## pandas的使用

In [28]:
from pandas import Series

print("用数组生成Series")
obj = Series([2,3,-5,8])
print(obj)
print(obj.values)
print(obj.index)

用数组生成Series
0    2
1    3
2   -5
3    8
dtype: int64
[ 2  3 -5  8]
RangeIndex(start=0, stop=4, step=1)


In [49]:
print('指定Series的index')
obj2 = Series([4,5,-1,3],index = ['a','b','a','c'])
print(obj2)
print()
print(obj2.index)
print()
print(obj2['a'])

obj2['d'] = 6#新增一列
print(obj2)
print()
print(obj2[['c','a']]) #用list作为一组索引
print()
print(obj2[obj2>0]) #使用逻辑不等式作为筛选条件
print()
print('b' in obj2)#判断索引是否存在
print()
print('e' in obj2)
print()
print(obj2.ix['c'])

指定Series的index
a    4
b    5
a   -1
c    3
dtype: int64

Index(['a', 'b', 'a', 'c'], dtype='object')

a    4
a   -1
dtype: int64
a    4
b    5
a   -1
c    3
d    6
dtype: int64

c    3
a    4
a   -1
dtype: int64

a    4
b    5
c    3
d    6
dtype: int64

True

False

3


In [20]:
print("使用字典生成Series")
sdata = {'Ohio':45000, 'Texas':71000, 'Oregon':16000, 'Utah':5000}#字典
obj3 = Series(sdata)
print(obj3)

使用字典生成Series
Ohio      45000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64


In [21]:
print("使用字典生成Series，并额外指定index，不匹配的为NaN")
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = Series(sdata,index = states)
print(obj4)

使用字典生成Series，并额外指定index，不匹配的为NaN
California        NaN
Ohio          45000.0
Oregon        16000.0
Texas         71000.0
dtype: float64


In [22]:
print('Series相加，相同的索引部分相加')
print(obj3+obj4)

Series相加，相同的索引部分相加
California         NaN
Ohio           90000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64


In [23]:
print("指定Series及其索引的名字")
obj4.name = 'population'
obj4.index.name = 'state'
print(obj4)

指定Series及其索引的名字
state
California        NaN
Ohio          45000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64


In [29]:
print('替换index')
print(obj)
print()
obj.index =  ['Bob', 'Steve', 'Jeff', 'Ryan']
print(obj)

替换index
0    2
1    3
2   -5
3    8
dtype: int64

Bob      2
Steve    3
Jeff    -5
Ryan     8
dtype: int64


## DataFrame

In [34]:
import numpy as np
from pandas import Series, DataFrame

print('用字典生成DataFrame,key为列的名字')
data = {'state':['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'year':[2000, 2001, 2002, 2001, 2002],
        'pop':[1.5, 1.7, 3.6, 2.4, 2.9]}
print(DataFrame(data))
print()
print(DataFrame(data,columns = ['year','state','pop'])) #指定列的顺序

用字典生成DataFrame,key为列的名字
   pop   state  year
0  1.5    Ohio  2000
1  1.7    Ohio  2001
2  3.6    Ohio  2002
3  2.4  Nevada  2001
4  2.9  Nevada  2002

   year   state  pop
0  2000    Ohio  1.5
1  2001    Ohio  1.7
2  2002    Ohio  3.6
3  2001  Nevada  2.4
4  2002  Nevada  2.9


In [52]:
print('指定索引，在列中指定不存在的列，默认数据用NaN')
frame2 = DataFrame(data,
                  columns = ['year','state','pop','debt'],
                  index = ['one','two','three','four','five'])
print(frame2)
print()
print(frame2['state'])
print()
print(frame2.year)
print()
print(frame2.ix['three'])#索引的使用
print()
frame2['debt'] = 16.5 #修改一整列的数据
print(frame2)
print()
frame2.debt = np.arange(5) #用numpy数组范围函数修改一整列
print(frame2)

指定索引，在列中指定不存在的列，默认数据用NaN
       year   state  pop debt
one    2000    Ohio  1.5  NaN
two    2001    Ohio  1.7  NaN
three  2002    Ohio  3.6  NaN
four   2001  Nevada  2.4  NaN
five   2002  Nevada  2.9  NaN

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object

one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

       year   state  pop  debt
one    2000    Ohio  1.5  16.5
two    2001    Ohio  1.7  16.5
three  2002    Ohio  3.6  16.5
four   2001  Nevada  2.4  16.5
five   2002  Nevada  2.9  16.5

       year   state  pop  debt
one    2000    Ohio  1.5     0
two    2001    Ohio  1.7     1
three  2002    Ohio  3.6     2
four   2001  Nevada  2.4     3
five   2002  Nevada  2.9     4


In [53]:
print('用Series指定要修改的索引及其对应的值，没有指定的默认数据用NaN')
val = Series([-1.2, -1.5, -1.7], index = ['two', 'four', 'five'])
frame2['debt'] = val
print(frame2)

用Series指定要修改的索引及其对应的值，没有指定的默认数据用NaN
       year   state  pop  debt
one    2000    Ohio  1.5   NaN
two    2001    Ohio  1.7  -1.2
three  2002    Ohio  3.6   NaN
four   2001  Nevada  2.4  -1.5
five   2002  Nevada  2.9  -1.7


In [114]:
print("赋值给新列")
frame2['eastern'] = (frame2.state == 'Ohio') # 如果state等于Ohio为True
print(frame2)
print()
print(frame2.columns)

赋值给新列
       year   state  pop  debt eastern
one    2000    Ohio  1.5   NaN    True
two    2001    Ohio  1.7  -1.2    True
three  2002    Ohio  3.6   NaN    True
four   2001  Nevada  2.4  -1.5   False
five   2002  Nevada  2.9  -1.7   False

Index(['year', 'state', 'pop', 'debt', 'eastern'], dtype='object')


In [123]:
print('新增列')
frame2['xz'] = True
print(frame2)
print('删除列')
#del frame2['xz'] #二者取其一
frame2 = frame2.drop('xz',1)
print(frame2)

新增列
       year   state  pop  debt eastern    xz
one    2000    Ohio  1.5   NaN    True  True
two    2001    Ohio  1.7  -1.2    True  True
three  2002    Ohio  3.6   NaN    True  True
four   2001  Nevada  2.4  -1.5   False  True
five   2002  Nevada  2.9  -1.7   False  True
删除列
       year   state  pop  debt eastern
one    2000    Ohio  1.5   NaN    True
two    2001    Ohio  1.7  -1.2    True
three  2002    Ohio  3.6   NaN    True
four   2001  Nevada  2.4  -1.5   False
five   2002  Nevada  2.9  -1.7   False


In [60]:
print('DataFrame转置')
pop = {'Nevada':{2001:2.4, 2002:2.9},
        'Ohio':{2000:1.5, 2001:1.7, 2002:3.6}}
frame3 = DataFrame(pop)
print(frame3)
print()
print(frame3.T)

DataFrame转置
      Nevada  Ohio
2000     NaN   1.5
2001     2.4   1.7
2002     2.9   3.6

        2000  2001  2002
Nevada   NaN   2.4   2.9
Ohio     1.5   1.7   3.6


In [70]:
print('指定索引的顺序，以及使用切片初始化数据')
print(DataFrame(pop,index = [2001,2002,2003]))
print()
print(frame3['Ohio'][:-1]) # 指定列和行的索引
print(frame3['Nevada'][:2])
print()
pdata = {'Ohio':frame3['Ohio'][:-1], 'Nevada':frame3['Nevada'][:2]}
print()
print(DataFrame(pdata))

指定索引的顺序，以及使用切片初始化数据
      Nevada  Ohio
2001     2.4   1.7
2002     2.9   3.6
2003     NaN   NaN

2000    1.5
2001    1.7
Name: Ohio, dtype: float64
2000    NaN
2001    2.4
Name: Nevada, dtype: float64


      Nevada  Ohio
2000     NaN   1.5
2001     2.4   1.7


In [77]:
print('指定索引列的名称')
frame3.index.name = 'year'
frame3.columns.name = 'state'
print(frame3)
print()
print(frame3.values)
print()
print(frame2.values)

指定索引列的名称
state  Nevada  Ohio
year               
2000      NaN   1.5
2001      2.4   1.7
2002      2.9   3.6

[[ nan  1.5]
 [ 2.4  1.7]
 [ 2.9  3.6]]

[[2000 'Ohio' 1.5 nan True]
 [2001 'Ohio' 1.7 -1.2 True]
 [2002 'Ohio' 3.6 nan True]
 [2001 'Nevada' 2.4 -1.5 False]
 [2002 'Nevada' 2.9 -1.7 False]]


In [113]:
print(frame2)
print("根据索引号与名称筛选数据")
print()
frame2.ix[0:3, ['year','pop']]
print()
print(frame2[frame2.eastern])
print()
#print((frame2['year']>2001) & (frame2['pop']>2.4))
print(frame2[(frame2['year']>2001) & (frame2['pop']>2.4)])#使用逻辑与筛选条件

       year   state  pop  debt eastern
one    2000    Ohio  1.5   NaN    True
two    2001    Ohio  1.7  -1.2    True
three  2002    Ohio  3.6   NaN    True
four   2001  Nevada  2.4  -1.5   False
five   2002  Nevada  2.9  -1.7   False
根据索引号与名称筛选数据


       year state  pop  debt eastern
one    2000  Ohio  1.5   NaN    True
two    2001  Ohio  1.7  -1.2    True
three  2002  Ohio  3.6   NaN    True

       year   state  pop  debt eastern
three  2002    Ohio  3.6   NaN    True
five   2002  Nevada  2.9  -1.7   False


In [124]:
frame2.head(2)

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True


In [144]:
print(frame2)
print('新增一行')
fream2 = frame2.loc['six'] ={'year':2000,'state':'Nevada','pop':1.5,'debt':-1.2,'eastern':True}
print(frame2)
print('删除一行')
frame2 = frame2.drop('six')
print(frame2)


       year   state  pop debt eastern
one    2000    Ohio  1.5  NaN    True
two    2001    Ohio  1.7 -1.2    True
three  2002    Ohio  3.6  NaN    True
four   2001  Nevada  2.4 -1.5   False
five   2002  Nevada  2.9 -1.7   False
新增一行
       year   state  pop debt eastern
one    2000    Ohio  1.5  NaN    True
two    2001    Ohio  1.7 -1.2    True
three  2002    Ohio  3.6  NaN    True
four   2001  Nevada  2.4 -1.5   False
five   2002  Nevada  2.9 -1.7   False
six    2000  Nevada  1.5 -1.2    True
删除一行
       year   state  pop debt eastern
one    2000    Ohio  1.5  NaN    True
two    2001    Ohio  1.7 -1.2    True
three  2002    Ohio  3.6  NaN    True
four   2001  Nevada  2.4 -1.5   False
five   2002  Nevada  2.9 -1.7   False


In [163]:
print('检查字段是否有缺失值')
print(frame3)
print()
print(frame3['Nevada'].isnull())
print()
print(frame3.isnull().any())
print('计算缺失值')
frame3.Nevada.isnull().sum()

检查字段是否有缺失值
state  Nevada  Ohio
year               
2000      NaN   1.5
2001      2.4   1.7
2002      2.9   3.6

year
2000     True
2001    False
2002    False
Name: Nevada, dtype: bool

state
Nevada     True
Ohio      False
dtype: bool
计算缺失值


1

## 缺失值的舍弃
### 舍弃含有任意缺失值的行   df.dropna() 
### 舍弃所有字段都含有缺失值的行  df.dropna(how = 'all')
### 舍弃超过两栏缺失值的行     df.dropna(thresh = 2 )
### 舍弃皆为缺失值的列            df.dropna(axis = 1,hou = 'all')
### 用0填补缺失值                     df.fillna(0)
### 使用平均数缺失值                df.fillna(df['age'].mean())
### 按照线性规律进行缺失值的填充   df.interpolate()   内插方法具体参看文档