In [1]:
import numpy as np
import  pandas as pd

# Row labels for the frame: one name per person.
index = pd.Index(data=["zhangsan", "lishi", "wangwu", "zhaoliu", "wanger"])
# Column data, one list entry per row label above.
data = {
    "age": [22, 18, 20, 21, 25],
    "address": ["nj", "bj", "nj", "sh", "bj"],
}
# Build the frame from complete column data plus an explicit index, then
# append the gender column as the last column in the same expression.
base_info = pd.DataFrame(data=data, index=index).assign(
    gender=["f", "m", "f", "m", "f"]
)
print(base_info)
# Visual separator after the table.
print("*" * 20)


          age address gender
zhangsan   22      nj      f
lishi      18      bj      m
wangwu     20      nj      f
zhaoliu    21      sh      m
wanger     25      bj      f
********************


In [2]:
# Print a structural summary of the frame: the index range, each column's
# non-null count and dtype, and the approximate memory usage.
base_info.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, zhangsan to wanger
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   age      5 non-null      int64 
 1   address  5 non-null      object
 2   gender   5 non-null      object
dtypes: int64(1), object(2)
memory usage: 160.0+ bytes


In [3]:
# Preview the first three rows of the frame.
print(base_info.head(n=3))

          age address gender
zhangsan   22      nj      f
lishi      18      bj      m
wangwu     20      nj      f


In [4]:
# Preview the last three rows of the frame.
print(base_info.tail(n=3))

         age address gender
wangwu    20      nj      f
zhaoliu   21      sh      m
wanger    25      bj      f


In [6]:
# Row and column counts, read from the two axes of the frame.
row = len(base_info.index)
col = len(base_info.columns)
print(row, col)

5 3


In [8]:
# Dump the whole frame as a NumPy array. ``to_numpy()`` is the accessor the
# pandas documentation recommends over the older ``.values`` attribute; the
# printed array is the same.
print(base_info.to_numpy())
# Select the ``age`` column as a Series (it keeps the row labels) ...
print(base_info["age"])
# ... and report the largest age.
print(base_info["age"].max())

[[22 'nj' 'f']
 [18 'bj' 'm']
 [20 'nj' 'f']
 [21 'sh' 'm']
 [25 'bj' 'f']]
zhangsan    22
lishi       18
wangwu      20
zhaoliu     21
wanger      25
Name: age, dtype: int64
25


In [9]:
# Basic aggregates of the age column, one per output line:
# smallest value, arithmetic mean, and total.
print(
    base_info["age"].min(),
    base_info["age"].mean(),
    base_info["age"].sum(),
    sep="\n",
)


18
21.2
106


In [10]:
# Running (cumulative) sum of the ages from the first row to the last, keeping
# every intermediate total. This is closer to Scala's scanLeft than to
# reduceLeft, which would yield only the final total.
print(base_info['age'].cumsum())


zhangsan     22
lishi        40
wangwu       60
zhaoliu      81
wanger      106
Name: age, dtype: int64


In [11]:
# Summary statistics. On the DataFrame, describe() reports only the numeric
# columns by default, which is why only ``age`` appears in the first table.
print(base_info.describe())
# On a single Series, describe() chooses the summary from the Series' own
# dtype (count / unique / top / freq for this string column). The
# ``include`` / ``exclude`` arguments are documented as ignored for Series,
# so none is passed here.
print(base_info["address"].describe())


             age
count   5.000000
mean   21.200000
std     2.588436
min    18.000000
25%    20.000000
50%    21.000000
75%    22.000000
max    25.000000
count      5
unique     3
top       nj
freq       2
Name: address, dtype: object


In [12]:
# Index label (not the position) of the row holding the largest age.
print(base_info['age'].idxmax())


wanger


In [13]:
# Discretisation: bucket the ages into several intervals (comparable to
# bucketing in Hive).
# Equal-width variant: an integer asks pd.cut for that many bins of equal
# width covering the observed range of the data.
print(pd.cut(base_info["age"], bins=3))


zhangsan    (20.333, 22.667]
lishi       (17.993, 20.333]
wangwu      (17.993, 20.333]
zhaoliu     (20.333, 22.667]
wanger        (22.667, 25.0]
Name: age, dtype: category
Categories (3, interval[float64]): [(17.993, 20.333] < (20.333, 22.667] < (22.667, 25.0]]


In [14]:
# Hand-picked bin edges: each age falls into one of the right-closed
# intervals (1, 19], (19, 24] or (24, 30].
print(pd.cut(base_info["age"], bins=[1, 19, 24, 30]))


zhangsan    (19, 24]
lishi        (1, 19]
wangwu      (19, 24]
zhaoliu     (19, 24]
wanger      (24, 30]
Name: age, dtype: category
Categories (3, interval[int64]): [(1, 19] < (19, 24] < (24, 30]]


In [15]:
# Same hand-picked edges, but each interval is shown under a readable label
# (small / medium / large) instead of its numeric bounds.
print(
    pd.cut(
        base_info["age"],
        bins=[1, 19, 24, 30],
        labels=["小", "中", "大"],
    )
)

zhangsan    中
lishi       小
wangwu      中
zhaoliu     中
wanger      大
Name: age, dtype: category
Categories (3, object): ['小' < '中' < '大']


In [16]:
# Equal-frequency binning: qcut derives the bin edges from quantiles of the
# data, so the 3 bins hold roughly the same number of rows (in contrast to
# the equal-width bins produced by pd.cut). Changing the age values moves
# the edges.
print(pd.qcut(base_info["age"], q=3))


zhangsan      (21.667, 25.0]
lishi       (17.999, 20.333]
wangwu      (17.999, 20.333]
zhaoliu     (20.333, 21.667]
wanger        (21.667, 25.0]
Name: age, dtype: category
Categories (3, interval[float64]): [(17.999, 20.333] < (20.333, 21.667] < (21.667, 25.0]]


In [17]:
# Sorting along an axis (row index or column names). Here the rows are
# ordered by their index label, first ascending and then descending.
# Original note: labels written in Chinese characters do not sort reliably
# (presumably because the order follows code points; TODO confirm).
print(base_info.sort_index(ascending=True))
print(base_info.sort_index(ascending=False))


          age address gender
lishi      18      bj      m
wanger     25      bj      f
wangwu     20      nj      f
zhangsan   22      nj      f
zhaoliu    21      sh      m
          age address gender
zhaoliu    21      sh      m
zhangsan   22      nj      f
wangwu     20      nj      f
wanger     25      bj      f
lishi      18      bj      m


In [18]:
# Sort along the column axis, i.e. reorder the columns by their names.
print(base_info.sort_index(axis="columns"))


         address  age gender
zhangsan      nj   22      f
lishi         bj   18      m
wangwu        nj   20      f
zhaoliu       sh   21      m
wanger        bj   25      f


In [19]:
# Sort rows by column values: first by age alone ...
print(base_info.sort_values("age"))
# ... then by age with address as the tie-breaker. Every age in this frame is
# distinct, so the second key never comes into play and both tables match.
print(base_info.sort_values(["age", "address"]))


          age address gender
lishi      18      bj      m
wangwu     20      nj      f
zhaoliu    21      sh      m
zhangsan   22      nj      f
wanger     25      bj      f
          age address gender
lishi      18      bj      m
wangwu     20      nj      f
zhaoliu    21      sh      m
zhangsan   22      nj      f
wanger     25      bj      f
