In [1]:
import pandas as pd

# 2 Series

In [3]:
# Create a Series from the integers 10-19; no index is passed, so the
# default labels 0-9 are used.
ser_obj = pd.Series(data=range(10, 20))
# The printed form ends with the element type line (dtype: int64).
print(ser_obj)

0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int64


In [4]:
print('-'*50)
# Access the underlying data
print(ser_obj.values)  # .values is a NumPy ndarray
print(type(ser_obj.values)) # <class 'numpy.ndarray'>

# Access the index
print(ser_obj.index)  # built-in index type: RangeIndex(start=0, stop=10, step=1)
ser_obj.dtype # element dtype, shown as the cell result: dtype('int64')

--------------------------------------------------
[10 11 12 13 14 15 16 17 18 19]
<class 'numpy.ndarray'>
RangeIndex(start=0, stop=10, step=1)


dtype('int64')

In [6]:
print(ser_obj[0])   # look up a value by its index label
ser_obj[9] # shown as the cell result: np.int64(19)
# Accessing a label that does not exist in the index raises KeyError

10


np.int64(19)

In [7]:
print(ser_obj * 2)  # element-wise multiplication
print(ser_obj > 15) # element-wise comparison; yields a boolean Series

0    20
1    22
2    24
3    26
4    28
5    30
6    32
7    34
8    36
9    38
dtype: int64
0    False
1    False
2    False
3    False
4    False
5    False
6     True
7     True
8     True
9     True
dtype: bool


In [9]:
# Turn a dict into a Series: the dict keys become the index labels and the
# dict values become the data, giving a first look at a non-default index.
year_data = {
    2001: 17.8,
    2005: 20.1,
    2003: 16.5,
}
ser_obj2 = pd.Series(data=year_data)
print(ser_obj2, '-' * 50, sep='\n')

print(ser_obj2.index, '-' * 50, sep='\n')

print(ser_obj2[2001])  # look up one value by its label
ser_obj2.values  # shown as the cell result: array([17.8, 20.1, 16.5])

2001    17.8
2005    20.1
2003    16.5
dtype: float64
--------------------------------------------------
Index([2001, 2005, 2003], dtype='int64')
--------------------------------------------------
17.8


array([17.8, 20.1, 16.5])

In [10]:
print(ser_obj2.name) # name of the Series (prints None before one is assigned)
print(ser_obj2.index.name)  # name of the index (prints None before one is assigned)

# Assign a name to the Series and to its index; both appear in the printout below
ser_obj2.name = 'temp'
ser_obj2.index.name = 'year1'
print('-'*50)

print(ser_obj2.head())  # head() shows the first 5 rows by default

None
None
--------------------------------------------------
year1
2001    17.8
2005    20.1
2003    16.5
Name: temp, dtype: float64


# 3 DataFrame

In [12]:
import numpy as np

# Wrap a 3x4 ndarray in a DataFrame; with no labels supplied, the rows get
# the default index 0-2 and the columns the default labels 0-3.
t = pd.DataFrame(data=np.arange(12).reshape(3, 4))
print(t, '-' * 50, sep='\n')

   0  1   2   3
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
--------------------------------------------------


In [13]:
# Draw a 5x4 array of samples from the standard normal distribution.
# NOTE(review): no seed is set in this cell, so the values presumably differ
# on every run and will not match a previously recorded output.
array = np.random.randn(5, 4)
print(array, '-' * 50, sep='\n')

# Wrap the array in a DataFrame; head() prints the first 5 rows by default.
df_obj = pd.DataFrame(data=array)
print(df_obj.head())

[[ 5.48549194e-01 -4.81294191e-01  5.18920136e-01  8.09225085e-01]
 [-1.19910157e+00 -1.30942655e+00  3.21596928e-01  6.36469310e-01]
 [-1.54073677e+00  4.93726051e-04 -1.97625231e-01  3.95976085e-01]
 [ 1.90069360e+00  5.23660417e-01  9.10651093e-01 -9.78582798e-01]
 [ 2.56329522e-01  4.36560428e-01 -8.84540159e-01 -1.04325331e+00]]
--------------------------------------------------
          0         1         2         3
0  0.548549 -0.481294  0.518920  0.809225
1 -1.199102 -1.309427  0.321597  0.636469
2 -1.540737  0.000494 -0.197625  0.395976
3  1.900694  0.523660  0.910651 -0.978583
4  0.256330  0.436560 -0.884540 -1.043253


In [16]:
df_obj.loc[0] # select a single row by its label; the result is a Series

0    0.548549
1   -0.481294
2    0.518920
3    0.809225
Name: 0, dtype: float64

In [17]:
# Build a DataFrame from a list of dicts: each dict supplies one row and the
# dict keys supply the column names.
d2 = [
    dict(name="xiaohong", age=32, tel=10010),
    dict(name="xiaogang", tel=10000),
    dict(name="xiaowang", age=22),
]
df6 = pd.DataFrame(data=d2)
print(df6)  # keys missing from a row are filled with NaN
print(type(df6.values))  # <class 'numpy.ndarray'>

       name   age      tel
0  xiaohong  32.0  10010.0
1  xiaogang   NaN  10000.0
2  xiaowang  22.0      NaN
<class 'numpy.ndarray'>


In [18]:
# Build a Series from the scalar 1 with explicit labels 3-6, stored as float32;
# the scalar is repeated for every label.
pd.Series(1, index=list(range(3,7)),dtype='float32')

3    1.0
4    1.0
5    1.0
6    1.0
dtype: float32

In [19]:
# Different columns of a DataFrame may have different dtypes,
# but every value within one column shares a single dtype.
import pandas as pd
import numpy as np
# Mix of scalars (A, B, F) and length-4 sequences (C, D, E); in the printed
# frame each scalar is repeated on every row.
# C is the only entry that carries an explicit index (labels 0-3).
dict_data = {'A': 1,
             'B': pd.Timestamp('20190926'),
             'C': pd.Series(1, index=list(range(4)),dtype='float32'),
             'D': np.array([1,2,3,4],dtype='int32'),
             'E': ["Python","Java","C++","C"],
             'F': 'wangdao' }
df_obj2 = pd.DataFrame(dict_data)
print(df_obj2)

   A          B    C  D       E        F
0  1 2019-09-26  1.0  1  Python  wangdao
1  1 2019-09-26  1.0  2    Java  wangdao
2  1 2019-09-26  1.0  3     C++  wangdao
3  1 2019-09-26  1.0  4       C  wangdao


In [21]:
print('-'*50)
print(df_obj2.index) # row index (important)
# df_obj2.index[0]=2  # a single index entry cannot be modified in place
print(df_obj2.columns) # column index (important)
df_obj2.dtypes # dtype of each column (important); shown as the cell result

--------------------------------------------------
Index([0, 1, 2, 3], dtype='int64')
Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')


A            int64
B    datetime64[s]
C          float32
D            int32
E           object
F           object
dtype: object

In [22]:
# Date-based example: build a DataFrame with an explicit row index (dates)
# and explicit column labels.
dates = pd.date_range(start='20130101', periods=6)  # freq defaults to 'D' (daily)
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list('ABCD'),
)
print(df, '-' * 50, sep='\n')
print(df.index)  # DatetimeIndex with dtype='datetime64[ns]', freq='D'

                   A         B         C         D
2013-01-01 -0.327325  0.149239 -0.713582  0.161479
2013-01-02 -0.191231 -1.291163  1.294531  1.651849
2013-01-03 -0.443085 -0.077461 -1.775731  0.187837
2013-01-04  1.184171  1.411976 -0.641058  0.643947
2013-01-05 -1.346167 -1.164680  0.549779 -0.216832
2013-01-06  0.654101 -0.765330 -0.536409 -1.956108
--------------------------------------------------
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')


In [24]:
# Selecting data
print(df_obj2)
print('-'*50)
print(type(df_obj2))    # <class 'pandas.core.frame.DataFrame'>
print('-'*50)

# Square-bracket selection by label; here 'B' picks the column named B
print(df_obj2['B'])
print('-'*50)

# A single column taken out of a DataFrame is a Series
print(type(df_obj2['B']))   # <class 'pandas.core.series.Series'>

   A          B    C  D       E        F
0  1 2019-09-26  1.0  1  Python  wangdao
1  1 2019-09-26  1.0  2    Java  wangdao
2  1 2019-09-26  1.0  3     C++  wangdao
3  1 2019-09-26  1.0  4       C  wangdao
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
--------------------------------------------------
0   2019-09-26
1   2019-09-26
2   2019-09-26
3   2019-09-26
Name: B, dtype: datetime64[s]
--------------------------------------------------
<class 'pandas.core.series.Series'>


In [25]:
# Add a new column under a label of our own choosing (G), computed from column D
df_obj2['G'] = df_obj2['D'] + 4
print(df_obj2.head())

   A          B    C  D       E        F  G
0  1 2019-09-26  1.0  1  Python  wangdao  5
1  1 2019-09-26  1.0  2    Java  wangdao  6
2  1 2019-09-26  1.0  3     C++  wangdao  7
3  1 2019-09-26  1.0  4       C  wangdao  8


In [24]:
# Remove column G from the frame in place.
# NOTE(review): presumably raises KeyError if run again without re-adding G.
del df_obj2['G']
print(df_obj2.head())

   A          B    C  D       E        F
0  1 2019-09-26  1.0  1  Python  wangdao
1  1 2019-09-26  1.0  2    Java  wangdao
2  1 2019-09-26  1.0  3     C++  wangdao
3  1 2019-09-26  1.0  4       C  wangdao
