In [1]:
from os import times_result

import numpy as np
import pandas as pd
from pandas import DataFrame

### 替换操作
##### 替换操作可以同步作用于Series和DataFrame中
##### 单值替换
#####

In [2]:
df = DataFrame(data = np.random.randint(0, 100, size=(5,6)))

In [3]:
df.replace(to_replace= 77, value = 'Nine') # 普通替换

Unnamed: 0,0,1,2,3,4,5
0,15,13,28,17,19,57
1,79,83,74,2,15,71
2,39,64,32,83,17,18
3,81,78,62,89,39,30
4,36,80,37,32,48,42


In [4]:
df.replace(to_replace={0: 'eight'}) #字典替换

Unnamed: 0,0,1,2,3,4,5
0,15,13,28,17,19,57
1,79,83,74,2,15,71
2,39,64,32,83,17,18
3,81,78,62,89,39,30
4,36,80,37,32,48,42


In [5]:
df.replace(to_replace={3:5}, value= 'Nine')

Unnamed: 0,0,1,2,3,4,5
0,15,13,28,17,19,57
1,79,83,74,2,15,71
2,39,64,32,83,17,18
3,81,78,62,89,39,30
4,36,80,37,32,48,42


In [6]:
df.replace(to_replace={4:77}, value= 'nine') # 将指定列中具体的某一个元素进行替换， to_replace={列索引: 被替换的值}, value = ‘要替换的值’

Unnamed: 0,0,1,2,3,4,5
0,15,13,28,17,19,57
1,79,83,74,2,15,71
2,39,64,32,83,17,18
3,81,78,62,89,39,30
4,36,80,37,32,48,42


In [7]:
df.replace(to_replace={77: 'seven', 88: 'eight', 0:8})

Unnamed: 0,0,1,2,3,4,5
0,15,13,28,17,19,57
1,79,83,74,2,15,71
2,39,64,32,83,17,18
3,81,78,62,89,39,30
4,36,80,37,32,48,42


### 映射操作
#### - 概念： 创建一个映射关系列表，把values元素和一个特定的标签或者字符串绑定（给一个元素值不同的表现形式）
#### - 创建df, 两列分别是姓名和薪资， 然后给起名字取对应的英文名

In [8]:
dic = {
    'name': ['张三', '李四', '王五'],
    'Salary': [15000, 20000, 18000]
}
df = DataFrame(data = dic)
df

Unnamed: 0,name,Salary
0,张三,15000
1,李四,20000
2,王五,18000


In [9]:
#映射表
dic = {
    '张三': 'tom',
    '李四': 'jack',
    '王五': 'rouse'
}
df['e_name'] = df['name'].map(dic) # 这里的map是Series里的函数,是映射需要的函数
df

Unnamed: 0,name,Salary,e_name
0,张三,15000,tom
1,李四,20000,jack
2,王五,18000,rouse


#### map 除了可以做映射工具,也可以做运算工具

In [10]:
#薪资超过5000的部分, 收取20%的个人所得税, 计算每个人的税后薪资
def after_sal(s):
  return  (s - (s-5000) * 0.2) if s > 5000 else s

df['after_sal'] = df['Salary'].map(after_sal) # 可以将['salary'] 这个Series中的每个元素(薪资)作为参数传递给s
df

Unnamed: 0,name,Salary,e_name,after_sal
0,张三,15000,tom,13000.0
1,李四,20000,jack,17000.0
2,王五,18000,rouse,15400.0


#### 数据的分类处理
##### - 数据分类处理的两个部分
##### - groupby()函数
##### - group 属性查看分组情况

In [11]:
df = DataFrame({'item': ['Apple', 'Orange', 'Banana', 'Orange', 'Banana', 'Apple'],
               'price': [1, 2, 3, 4, 5, 6],
               'color': ['red', 'yellow', 'yellow', 'green', 'green', 'green'],
               'weight': [12, 20, 50, 30, 23, 12]})
df

Unnamed: 0,item,price,color,weight
0,Apple,1,red,12
1,Orange,2,yellow,20
2,Banana,3,yellow,50
3,Orange,4,green,30
4,Banana,5,green,23
5,Apple,6,green,12


### 分类需要有一个分组的条件

In [12]:
df.groupby(by='item').groups

{'Apple': [0, 5], 'Banana': [2, 4], 'Orange': [1, 3]}

In [13]:
# 计算每一种水果的评价价格
df.groupby(by='item')['price'].mean()

item
Apple     3.5
Banana    4.0
Orange    3.0
Name: price, dtype: float64

In [20]:
dic = df.groupby(by='color')['weight'].sum()

In [21]:
df['mean_w']=df['color'].map(dic)
df

Unnamed: 0,item,price,color,weight,mean_w
0,Apple,1,red,12,12
1,Orange,2,yellow,20,70
2,Banana,3,yellow,50,70
3,Orange,4,green,30,65
4,Banana,5,green,23,65
5,Apple,6,green,12,65


### 高级数据的聚合
- 使用groupby分组后, 也可以使用transform和apply提供自定义函数来实现更多计算
- transform和apply都可以进行远算

In [24]:
# 自己封装一个函数
def my_mean(s):
    m_sum = 0
    for i in s:
        m_sum += i
    return m_sum/len(s)

df.groupby('item')['price'].transform(my_mean)

0    3.5
1    3.0
2    4.0
3    3.0
4    4.0
5    3.5
Name: price, dtype: float64

In [25]:
df.groupby('item')['price'].apply(my_mean)

item
Apple     3.5
Banana    4.0
Orange    3.0
Name: price, dtype: float64

### 数据的加载
- 读取文件的数据

In [26]:
df = pd.read_csv("./file/f.txt")
df

Unnamed: 0,0,hello
0,1,who are you
1,2,oh no
2,3,you are fuck a gay
3,4,get out


In [27]:
df.shape

(4, 2)

- 将文件中每一个词作为一个元素放到dataFrame中
- 用header = None 去掉头, 用sep 去掉分隔符

In [28]:
pd.read_csv('./file/f.txt', header=None, sep=',')

Unnamed: 0,0,1
0,0,hello
1,1,who are you
2,2,oh no
3,3,you are fuck a gay
4,4,get out
