# 第5章　使用`pandas`进行数据预处理

## 任务5.1　合并数据

<div style="text-align: center;">
    <b>代码5-1　索引完全相同时的横向堆叠</b>
</div>

In [1]:
import numpy
import pandas
from sqlalchemy import create_engine

connection = create_engine("mssql+pymssql://sa:123456@localhost:1433/testdb?charset=utf8")
detail_1 = pandas.read_sql('meal_order_detail1', connection)
dataFrame_1 = detail_1.iloc[:, :10]
dataFrame_2 = detail_1.iloc[:, 10:]
print('合并dataFrame_1的大小为%s，dataFrame_2的大小为：%s' % (dataFrame_1.shape, dataFrame_2.shape))
print('外连接合并后的数据框大小为：', pandas.concat([dataFrame_1, dataFrame_2], axis=1, join='outer').shape)
print('内连接合并后的数据框大小为：', pandas.concat([dataFrame_1, dataFrame_2], axis=1, join='inner').shape)

合并dataFrame_1的大小为(2779, 10)，dataFrame_2的大小为：(2779, 9)
外连接合并后的数据框大小为： (2779, 19)
内连接合并后的数据框大小为： (2779, 19)


<br />
<div style="text-align: center;">
    <b>代码5-2　表名完全相同时的<code>concat</code>纵向堆叠</b>
</div>

In [2]:
dataFrame_3 = detail_1.iloc[:1500, :]
dataFrame_4 = detail_1.iloc[1500:, :]
print('合并dataFrame_3的大小为%s，dataFrame_4的大小为%s' % (dataFrame_3.shape, dataFrame_4.shape))
print(
    '外连接纵向合并后的数据框大小为：',
    pandas.concat([dataFrame_3, dataFrame_4], axis=1, join='outer').shape
)
print(
    '内连接纵向合并后的数据框大小为：',
    pandas.concat([dataFrame_3, dataFrame_4], axis=1, join='inner').shape
)

合并dataFrame_3的大小为(1500, 19)，dataFrame_4的大小为(1279, 19)
外连接纵向合并后的数据框大小为： (2779, 38)
内连接纵向合并后的数据框大小为： (0, 38)


<br />
<center>
    <b>代码5-3　使用<code>append</code>方法进行纵向表堆叠</b>
</center>

In [3]:
print('堆叠前dataFrame_3的大小为%s，dataFrame_4的大小为%s' % (dataFrame_3.shape, dataFrame_4.shape))
print('append纵向堆叠后的数据框大小为：', dataFrame_3.append(dataFrame_4).shape)

堆叠前dataFrame_3的大小为(1500, 19)，dataFrame_4的大小为(1279, 19)
append纵向堆叠后的数据框大小为： (2779, 19)


<br />
<center>
    <b>代码5-4　使用<code>merge</code>函数合并数据表</b>
</center>

In [4]:
order = pandas.read_csv('data/meal_order_info.csv', sep=',', encoding='gbk')
order['info_id'] = order['info_id'].astype('str')
order_detail_1 = pandas.merge(detail_1, order, left_on='order_id', right_on='info_id')
print('detail_1订单详情表的原始形状为：', detail_1.shape)
print('order订单信息表的原始形状为：', order.shape)
print('订单详情表和订单信息表逐渐合并后的形状为：', order_detail_1.shape)

detail_1订单详情表的原始形状为： (2779, 19)
order订单信息表的原始形状为： (945, 21)
订单详情表和订单信息表逐渐合并后的形状为： (2779, 40)


<br />
<center>
    <b>代码5-5　使用<code>join</code>方法实现主键合并</b>
</center>

In [5]:
%xmode Minimal
order.rename({'info_id': 'order_id'}, inplace=True)
order_detail_1 = detail_1.join(order, on='order_id', rsuffix='1')
print('订单详情表和订单信息表join合并后的形状为：', order_detail_1.shape)

Exception reporting mode: Minimal


ValueError: You are trying to merge on object and int64 columns. If you wish to proceed you should use pd.concat

<br /><center><b>代码5-6　重叠合并</b></center>

In [6]:
dictionary_1 = {
    'ID': [1, 2, 3, 4, 5, 6, 7, 8, 9],
    'System': ['win10', 'win10', numpy.nan, 'win10', numpy.nan, numpy.nan, 'win7', 'win7', 'win8'],
    'cpu': ['i7', 'i5', numpy.nan, 'i7', numpy.nan, numpy.nan, 'i5', 'i5', 'i3']
}
dictionary_2 = {
    'ID': [1, 2, 3, 4, 5, 6, 7, 8, 9],
    'System': [
        numpy.nan, numpy.nan, 'win7', numpy.nan, 'win8',
        'win7', numpy.nan, numpy.nan, numpy.nan
    ],
    'cpu': [numpy.nan, numpy.nan, 'i3', numpy.nan, 'i7', 'i5', numpy.nan, numpy.nan, numpy.nan]
}
dataFrame_5 = pandas.DataFrame(dictionary_1)
dataFrame_6 = pandas.DataFrame(dictionary_2)
print('经过重叠合并后的数据为：', dataFrame_5.combine_first(dataFrame_6), sep='\n')

经过重叠合并后的数据为：
   ID System cpu
0   1  win10  i7
1   2  win10  i5
2   3   win7  i3
3   4  win10  i7
4   5   win8  i7
5   6   win7  i5
6   7   win7  i5
7   8   win7  i5
8   9   win8  i3


<br /><center><b>代码5-7　将多张菜品订单详情表纵向合并</b></center>

In [7]:
import numpy
import pandas
from sqlalchemy import create_engine

connection = create_engine('mssql+pymssql://sa:123456@localhost:1433/testdb?charset=utf8')
detail_1 = pandas.read_sql('meal_order_detail1', connection)
detail_2 = pandas.read_sql('meal_order_detail2', connection)
detail_3 = pandas.read_sql('meal_order_detail3', connection)
detail = detail_1.append(detail_2)
detail = detail_1.append(detail_3)
print('3张订单详情表合并后的形状为：', detail.shape)

3张订单详情表合并后的形状为： (6390, 19)


<br /><center><b>代码5-8　订单详情表、订单信息表、客户信息表主键合并</b></center>

In [8]:
order = pandas.read_csv('data/meal_order_info.csv', sep=',', encoding='gbk')
user = pandas.read_excel('data/users_info.xlsx')
order['info_id'] = order['info_id'].astype('str')
order['emp_id'] = order['emp_id'].astype('str')
user['USER_ID'] = user['USER_ID'].astype('str')
data = pandas.merge(detail, order, left_on=['order_id', 'emp_id'], right_on=['info_id', 'emp_id'])
data = pandas.merge(data, user, left_on='emp_id', right_on='USER_ID', how='inner')
print('3张表数据主键合并后的大小为：', data.shape)

3张表数据主键合并后的大小为： (9848, 76)
