In [1]:
import os
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = 'SimHei'   # 显示中文

In [2]:
data = pd.read_csv('data_wuliu.csv', encoding='gbk')
data

Unnamed: 0,订单号,订单行,销售时间,交货时间,货品交货状况,货品,货品用户反馈,销售区域,数量,销售金额
0,P096311,10,2016/7/30,2016/9/30,晚交货,货品3,质量合格,华北,2.0,"1052,75元"
1,P096826,10,2016/8/30,2016/10/30,按时交货,货品3,质量合格,华北,10.0,"11,50万元"
2,,20,2016/8/30,2016/10/30,按时交货,货品3,质量合格,华北,10.0,"11,50万元"
3,P097435,10,2016/7/30,2016/9/30,按时交货,货品1,返修,华南,2.0,"6858,77元"
4,P097446,60,2016/11/26,2017/1/26,晚交货,货品3,质量合格,华北,15.0,"129,58元"
...,...,...,...,...,...,...,...,...,...,...
1156,P299901,10,2016/12/15,2017/3/15,按时交货,货品6,质量合格,马来西亚,2.0,"200,41元"
1157,P302956,10,2016/12/22,2017/3/22,按时交货,货品2,拒货,华东,20.0,"79,44元"
1158,P303801,10,2016/12/15,2017/3/15,按时交货,货品2,质量合格,华东,1.0,"194,08元"
1159,P307276,10,2016/12/22,2017/3/22,按时交货,货品6,质量合格,马来西亚,1.0,"32,18元"


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1161 entries, 0 to 1160
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   订单号     1159 non-null   object 
 1   订单行     1161 non-null   int64  
 2   销售时间    1161 non-null   object 
 3   交货时间    1161 non-null   object 
 4   货品交货状况  1159 non-null   object 
 5   货品      1161 non-null   object 
 6   货品用户反馈  1161 non-null   object 
 7   销售区域    1161 non-null   object 
 8   数量      1157 non-null   float64
 9   销售金额    1161 non-null   object 
dtypes: float64(1), int64(1), object(8)
memory usage: 90.8+ KB


- 订单号、货品交货情况、数量存在缺失值
- 订单行无关紧要，可删除
- 销售金额格式不对，数据类型要改成int/float

In [4]:
# 删除重复记录
data.drop_duplicates(keep='first',inplace=True)

# 删除缺失值
data.dropna(axis=0,how='any',inplace=True)

#删除订单行
# 只要运行一次，再次运行由于已删会报错
# data.drop(columns=['订单行'],inplace=True,axis=1) 
# 改成如果存在则删除
if '订单行' in data.columns:
    data.drop(columns=['订单行'], inplace=True)
print(data.info())
# 更新索引
data.reset_index(drop=True,inplace=True)

<class 'pandas.core.frame.DataFrame'>
Index: 1146 entries, 0 to 1160
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   订单号     1146 non-null   object 
 1   销售时间    1146 non-null   object 
 2   交货时间    1146 non-null   object 
 3   货品交货状况  1146 non-null   object 
 4   货品      1146 non-null   object 
 5   货品用户反馈  1146 non-null   object 
 6   销售区域    1146 non-null   object 
 7   数量      1146 non-null   float64
 8   销售金额    1146 non-null   object 
dtypes: float64(1), object(8)
memory usage: 89.5+ KB
None


In [5]:
# 对销售金额列进行数据清洗（有的是“元”，有的是“万元”）
# 自定义过滤函数（删除逗号，处理万元，转成float）
def data_deal(number):
    if '万元' in number:
        number_new = float(number.replace('万元', '').replace(',', '')) * 10000
    elif '元' in number:
        number_new = float(number.replace('元', '').replace(',', ''))
    return number_new
    
data['销售金额'] = data['销售金额'].map(data_deal)
data

Unnamed: 0,订单号,销售时间,交货时间,货品交货状况,货品,货品用户反馈,销售区域,数量,销售金额
0,P096311,2016/7/30,2016/9/30,晚交货,货品3,质量合格,华北,2.0,105275.0
1,P096826,2016/8/30,2016/10/30,按时交货,货品3,质量合格,华北,10.0,11500000.0
2,P097435,2016/7/30,2016/9/30,按时交货,货品1,返修,华南,2.0,685877.0
3,P097446,2016/11/26,2017/1/26,晚交货,货品3,质量合格,华北,15.0,12958.0
4,P097446,2016/11/26,2017/1/26,晚交货,货品3,拒货,华北,15.0,3239.0
...,...,...,...,...,...,...,...,...,...
1141,P299901,2016/12/15,2017/3/15,按时交货,货品6,质量合格,马来西亚,2.0,20041.0
1142,P302956,2016/12/22,2017/3/22,按时交货,货品2,拒货,华东,20.0,7944.0
1143,P303801,2016/12/15,2017/3/15,按时交货,货品2,质量合格,华东,1.0,19408.0
1144,P307276,2016/12/22,2017/3/22,按时交货,货品6,质量合格,马来西亚,1.0,3218.0


In [6]:
data.describe()

Unnamed: 0,数量,销售金额
count,1146.0,1146.0
mean,76.069372,122348.8
std,589.416486,1114599.0
min,1.0,0.0
25%,1.0,2941.5
50%,1.0,9476.5
75%,4.0,35767.75
max,11500.0,32700000.0


- 销售金额最小值0
- 数据右偏明显（电商领域常见，不用管）

In [7]:
# 删除销售金额0的（数据量小，可以删除）
data = data[data['销售金额']!=0]
data.describe()

Unnamed: 0,数量,销售金额
count,1145.0,1145.0
mean,76.134934,122455.7
std,589.669861,1115081.0
min,1.0,51.0
25%,1.0,2946.0
50%,1.0,9486.0
75%,4.0,35773.0
max,11500.0,32700000.0


In [9]:
# 数据规整：增加辅助列——月份
data['销售时间'] = pd.to_datetime(data['销售时间'])
data['月份'] = data['销售时间'].apply(lambda x:x.month)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['销售时间'] = pd.to_datetime(data['销售时间'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['月份'] = data['销售时间'].apply(lambda x:x.month)


***$\LARGE\text{1.配送服务是否存在问题}$***

In [16]:
# 分析交货情况
data['货品交货状况'] = data['货品交货状况'].str.strip()  # 去除货品交货情况前后空格
data1 = data.groupby(['月份', '货品交货状况']).size().unstack()  # 多列分组，用unstack来以行列形式显示。
data1['按时交货率'] = data1['按时交货'] / (data1['按时交货'] + data1['晚交货'])
data1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['货品交货状况'] = data['货品交货状况'].str.strip()  # 去除货品交货情况前后空格


货品交货状况,按时交货,晚交货,按时交货率
月份,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7,189,13,0.935644
8,218,35,0.86166
9,122,9,0.931298
10,238,31,0.884758
11,101,25,0.801587
12,146,18,0.890244


In [22]:
# 分析销售区域
data1 = data.groupby(['销售区域', '货品交货状况']).size().unstack()  # 多列分组，用unstack来以行列形式显示。
data1['按时交货率'] = data1['按时交货'] / (data1['按时交货'] + data1['晚交货'])
print(data1.sort_values(by='按时交货率',ascending=False))  # 降序排序， 西北送货延迟问题严重

货品交货状况  按时交货  晚交货     按时交货率
销售区域                       
泰国       183    4  0.978610
马来西亚     310   16  0.950920
华南        10    1  0.909091
华北       226   27  0.893281
华东       268   39  0.872964
西北        17   44  0.278689


In [25]:
# 分析货品
data1 = data.groupby(['货品', '货品交货状况']).size().unstack()  # 多列分组，用unstack来以行列形式显示。
data1['按时交货率'] = data1['按时交货'] / (data1['按时交货'] + data1['晚交货'])
print(data1.sort_values(by='按时交货率',ascending=False))  # 货品4晚交货严重

货品交货状况  按时交货  晚交货     按时交货率
货品                         
货品5      183    4  0.978610
货品6      309    7  0.977848
货品1       27    2  0.931034
货品3      212   26  0.890756
货品2      269   48  0.848580
货品4       14   44  0.241379


In [26]:
# 货品和销售区域结合
# 分析货品
data1 = data.groupby(['货品', '销售区域', '货品交货状况']).size().unstack()  # 多列分组，用unstack来以行列形式显示。
data1['按时交货率'] = data1['按时交货'] / (data1['按时交货'] + data1['晚交货'])
print(data1.sort_values(by='按时交货率',ascending=False))  
# 销售区域最差在西北，货品1和4，主要是货品4送货较晚
# 货品最差货品2，送往华东和马来西亚，主要是马来西亚送货较晚

货品交货状况     按时交货   晚交货     按时交货率
货品  销售区域                       
货品5 泰国    183.0   4.0  0.978610
货品6 马来西亚  309.0   7.0  0.977848
货品1 华北     14.0   1.0  0.933333
    华南     10.0   1.0  0.909091
货品3 华北    212.0  26.0  0.890756
货品2 华东    268.0  39.0  0.872964
货品4 西北     14.0  44.0  0.241379
货品2 马来西亚    1.0   9.0  0.100000
货品1 西北      3.0   NaN       NaN


***$\LARGE\text{2.是否存在有潜力的销售区域}$***