蔬菜类商品不同品类或不同单品之间可能存在一定的关联关系，请分析蔬菜各
品类及单品销售量的分布规律及相互关系。

读取数据

In [None]:
class DataProcess:
    """Load the raw spreadsheets and aggregate sales per item.

    Attributes:
        all_info_df: product information table.
        all_sell_df: sales records table.
        all_supply_df: wholesale price records table.
        all_loss_df: loss-rate table.
        drop_refund: sales records with refund rows removed.
        daily_sales_total: total quantity sold per item per day.
        monthly_sales_total: total quantity sold per item per month and year.
    """

    def __init__(self):
        # Raw tables; these placeholders are replaced by read_data().
        self.all_info_df = []  # product information
        self.all_sell_df = []  # sales records
        self.all_supply_df = []  # wholesale price records
        self.all_loss_df = []  # loss rate of each product
        # Derived tables start as empty frames (instances, not the class itself).
        self.drop_refund = pd.DataFrame()
        self.daily_sales_total = pd.DataFrame()  # per-item daily totals
        self.monthly_sales_total = pd.DataFrame()  # per-item monthly totals

    def read_data(self):
        """Read the four Excel workbooks from the ``data`` folder."""
        # Forward slashes work on every platform and avoid the invalid
        # escape sequences ('\d', '\i', '\s', '\l') of the backslash form.
        self.all_info_df = pd.read_excel('./data/info.xlsx')  # product information
        self.all_sell_df = pd.read_excel('./data/sell.xlsx')  # sales records
        self.all_supply_df = pd.read_excel('./data/supply_price.xlsx')  # wholesale records
        self.all_loss_df = pd.read_excel('./data/loss.xlsx')  # loss rates

    def drop_refund_func(self):
        """Remove refund rows from the sales table and keep the result.

        The rows are dropped from ``all_sell_df`` in place, so the later
        aggregations exclude refunds. ``drop_refund`` refers to that same
        cleaned frame.
        """
        df = self.all_sell_df
        refund_index = df.index[df['销售类型'] == '退货']
        # drop(..., inplace=True) returns None, so store the frame itself
        # rather than the return value.
        df.drop(refund_index, inplace=True)
        self.drop_refund = df

    def _typed_sales(self):
        """Return the sales table after converting date and quantity columns in place."""
        data = self.all_sell_df
        data['销售日期'] = pd.to_datetime(data['销售日期'])
        data['销量(千克)'] = pd.to_numeric(data['销量(千克)'])
        return data

    def grant_daily_sales(self):
        """Compute the total quantity sold per item per day."""
        data = self._typed_sales()
        by_item_and_day = ['单品编码', pd.Grouper(key='销售日期', freq='D')]
        daily_sales_total = data.groupby(by_item_and_day)['销量(千克)'].sum()
        self.daily_sales_total = daily_sales_total.to_frame().reset_index()

    def grant_monthly_sales(self):
        """Compute the total quantity sold per item per month and year."""
        data = self._typed_sales()
        # Month and year are stored as columns on the sales table itself.
        data['月份'] = data['销售日期'].dt.month
        data['年份'] = data['销售日期'].dt.year

        monthly_sales = data.groupby(['单品编码', '月份', '年份'])['销量(千克)'].sum()
        self.monthly_sales_total = monthly_sales.to_frame().reset_index()

# Build the shared processor instance used by the following cells and load
# the spreadsheets from disk.
Main = DataProcess()
Main.read_data()

In [None]:
Main.drop_refund_func() ## Refund records are discarded for the statistical observation
# Aggregate the sales table per item, first per day and then per month.
Main.grant_daily_sales()
Main.grant_monthly_sales()

In [None]:
# Write the per-item daily and monthly totals to CSV files in the working
# directory, without the index column.
Main.daily_sales_total.to_csv('每日单品销售总量.csv', index = False)
Main.monthly_sales_total.to_csv('每月单品销售总量.csv', index = False)

定义单品、种类之间的映射函数

In [None]:
def grant_one_kind_sales(id_, sell_df):
    """Return the rows of *sell_df* whose '单品编码' (item code) equals *id_*."""
    is_requested_item = sell_df['单品编码'] == id_
    return sell_df[is_requested_item]

def grant_single_name(id_, info_df):
    """Return the '单品名称' (item name) of the first row whose item code is *id_*.

    Raises:
        IndexError: if no row of *info_df* has that item code.
    """
    matching_names = info_df.loc[info_df['单品编码'] == id_, '单品名称']
    return matching_names.tolist()[0]

def grant_kind(id_, info_df):
    """Return the '分类编码' (category code) of the first row whose item code is *id_*.

    Raises:
        IndexError: if no row of *info_df* has that item code.
    """
    code_pairs = info_df[['单品编码', '分类编码']]
    hits = code_pairs.loc[code_pairs['单品编码'] == id_, '分类编码']
    return hits.tolist()[0]

def grant_kind_name(id_, info_df):
    """Return the '分类名称' (category name) of the first row whose item code is *id_*.

    Raises:
        IndexError: if no row of *info_df* has that item code.
    """
    name_pairs = info_df[['单品编码', '分类名称']]
    hits = name_pairs.loc[name_pairs['单品编码'] == id_, '分类名称']
    return hits.tolist()[0]

def grant_all_kind_name(info_df):
    """Return the distinct '分类名称' (category name) values in first-seen order."""
    return info_df['分类名称'].unique().tolist()

In [None]:
def create_dir(p):
    """Create directory *p* together with any missing parents.

    Does nothing when the directory already exists.
    """
    # Imported locally so the helper works even if this cell runs before
    # the cell that imports os.
    import os

    # exist_ok avoids the check-then-create race of an explicit
    # os.path.exists() test.
    os.makedirs(p, exist_ok=True)
# Output folders. Forward slashes keep the paths portable and match the
# './sales_daily/' and './describe/...' paths used by later cells; the
# backslash form relied on invalid escape sequences and on Windows path rules.
create_dir('./sales_daily')
create_dir('./sales_monthly')
create_dir('./describe')
create_dir('./describe/daily')
create_dir('./describe/monthly')

单品每日销量绘图

In [None]:
def plot_daily_sales_single(name, kind_name, df, Type='D'):
    """Save one sales line chart per year (2020-2023) for a single item.

    Args:
        name: item name; used as the innermost output folder.
        kind_name: category name; used as the parent output folder.
        df: frame with '单品编码', '销售日期' and '销量(千克)' columns.
        Type: pandas frequency alias used to bucket the dates (default 'D').

    The charts are written to
    ``./销售折线图/单类/每日/{kind_name}/{name}/单品每日销售曲线{year}.png``.
    """
    # Work on a copy so the caller's frame (usually a filtered slice) is
    # left untouched.
    df = df.copy()
    df['销售日期'] = pd.to_datetime(df['销售日期'])

    # Total quantity per item and date bucket, flattened back into columns.
    daily_sales_total = (
        df.groupby(['单品编码', pd.Grouper(key='销售日期', freq=Type)])['销量(千克)']
        .sum()
        .reset_index()
    )

    # The figures are saved inside the item folder, so that folder (not just
    # the category folder) must exist before savefig is called.
    out_dir = f'./销售折线图/单类/每日/{kind_name}/{name}'
    create_dir(out_dir)

    for year in [2020, 2021, 2022, 2023]:
        year_data = daily_sales_total[daily_sales_total['销售日期'].dt.year == year]
        plt.figure()

        # One plot call per month: points are connected within a month only.
        for month in range(1, 13):
            month_data = year_data[year_data['销售日期'].dt.month == month]
            day_labels = month_data['销售日期'].dt.strftime('%m-%d')
            plt.plot(day_labels, month_data['销量(千克)'], c='r', marker = '.')

        # Only the first and last dates of the year are shown as ticks.
        x = year_data['销售日期'].dt.strftime('%m-%d').to_list()
        if x:
            plt.xticks([x[0], x[-1]])
        plt.title(f'Plot {year}')
        plt.xlabel('Date')
        plt.ylabel('Sales/Kg')
        plt.grid(True, linestyle = '--', alpha = 0.5)
        plt.savefig(out_dir + f'/单品每日销售曲线{year}.png')
        plt.close()

def plot_all_sales_single(df):
    """Export a CSV and the yearly daily-sales charts for every item.

    Args:
        df: per-item sales frame that has a '单品编码' column; it is filtered
            once per item code listed in ``Main.all_info_df``.
    """
    info = Main.all_info_df
    item_ids = info['单品编码'].to_list()

    # One chart folder per category.
    for kind_label in grant_all_kind_name(info):
        create_dir(f'./销售折线图/单类/每日/{kind_label}/')

    for id_ in item_ids:
        single_ = grant_single_name(id_, info)
        # This one item name carries a trailing space; remove it.
        if single_ == '西峡香菇(份) ':
            single_ = '西峡香菇(份)'
        kind_ = grant_kind_name(id_, info)

        # Chart folder of this item inside its category folder.
        create_dir(os.path.join('./销售折线图/单类/每日/', kind_, single_))

        item_sales = df[df['单品编码']==id_]
        create_dir(f'./销售量数据/单类/每日/{kind_}')
        csv_target = os.path.join(f'./销售量数据/单类/每日/{kind_}/', f'每日销售量_{single_}.csv')
        item_sales.to_csv(csv_target, index=False)

        print(f'here!!! id: {id_}, {single_}, {kind_}')
        plot_daily_sales_single(single_, kind_, item_sales)

In [None]:
import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Export and plot the daily sales of every item.
plot_all_sales_single(Main.daily_sales_total)

单品每月销量绘图

In [None]:
def plot_monthly_sales_single(name, kind_name, df_here):
    """Save one monthly sales line chart per year (2020-2023) for a single item.

    Args:
        name: item name; used as the innermost output folder.
        kind_name: category name; used as the parent output folder.
        df_here: frame with '年份', '月份' and '销量(千克)' columns.

    The charts are written to
    ``./销售折线图/单类/每月/{kind_name}/{name}/单品每月销售曲线{year}.png``.
    """
    # The figures are saved inside the item folder, so that folder (not just
    # the category folder) must exist before savefig is called.
    out_dir = f'./销售折线图/单类/每月/{kind_name}/{name}'
    create_dir(out_dir)

    for year in [2020, 2021, 2022, 2023]:
        year_data = df_here[df_here['年份'] == year].sort_values('月份')
        plt.figure()

        # Plot the whole year in one call: plotting each month separately
        # draws isolated points, so the '-' of 'bo-' never connects them.
        plt.plot(year_data['月份'], year_data['销量(千克)'], 'bo-')

        # Only the first and last months of the year are shown as ticks.
        x = year_data['月份'].to_list()
        if x:
            plt.xticks([x[0], x[-1]])
        plt.title(f'Plot {year}')
        plt.xlabel('Month')
        plt.ylabel('Sales/Kg')
        plt.grid(True, linestyle = '--', alpha = 0.5)
        plt.savefig(out_dir + f'/单品每月销售曲线{year}.png')
        plt.close()

def plot_all_sales_single_monthly(df_temp):
    """Export a CSV and the yearly monthly-sales charts for every item.

    Args:
        df_temp: per-item monthly sales frame that has a '单品编码' column; it
            is filtered once per item code listed in ``Main.all_info_df``.
    """
    info = Main.all_info_df
    item_ids = info['单品编码'].to_list()

    # One chart folder per category.
    for kind_label in grant_all_kind_name(info):
        create_dir(f'./销售折线图/单类/每月/{kind_label}/')

    for id_ in item_ids:
        single_ = grant_single_name(id_, info)
        # This one item name carries a trailing space; remove it.
        if single_ == '西峡香菇(份) ':
            single_ = '西峡香菇(份)'
        kind_ = grant_kind_name(id_, info)

        # Chart folder of this item inside its category folder.
        create_dir(os.path.join('./销售折线图/单类/每月/', kind_, single_))

        item_sales = df_temp[df_temp['单品编码'] == id_]
        create_dir(f'./销售量数据/单类/每月/{kind_}')
        csv_target = os.path.join(f'./销售量数据/单类/每月/{kind_}/', f'每月销售量_{single_}.csv')
        item_sales.to_csv(csv_target, index=False)

        print(f'here!!! id: {id_}, {single_}, {kind_}')
        plot_monthly_sales_single(single_, kind_, item_sales)

In [None]:
import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Export and plot the monthly sales of every item.
plot_all_sales_single_monthly(Main.monthly_sales_total)

大类日销量绘图

In [None]:
def calculate_daily_sales_by_category():
    """Return the total quantity sold per category per day.

    Reads ``Main.daily_sales_total`` (whose date column is converted to
    datetime in place) and maps each item code to its category code through
    ``Main.all_info_df``.

    Returns:
        DataFrame with columns '大类编码', '销售日期' and '销量(千克)'.
    """
    catalog = Main.all_info_df
    item_sales = Main.daily_sales_total
    item_sales['销售日期'] = pd.to_datetime(item_sales['销售日期'])

    # Daily total per item.
    per_item = (
        item_sales
        .groupby(['单品编码', pd.Grouper(key='销售日期', freq='D')])['销量(千克)']
        .sum()
        .reset_index()
    )

    # Attach the category code of every item.
    per_item['大类编码'] = per_item['单品编码'].apply(
        lambda item_id: grant_kind(item_id, catalog)
    )

    # Daily total per category.
    per_category = (
        per_item
        .groupby(['大类编码', pd.Grouper(key='销售日期', freq='D')])['销量(千克)']
        .sum()
        .reset_index()
    )

    # Missing totals count as zero.
    per_category['销量(千克)'] = per_category['销量(千克)'].fillna(0)
    return per_category

import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")
# Per-category daily totals, consumed by the category plotting cell.
kind_grouped = calculate_daily_sales_by_category()

In [None]:
def plot_daily_sales_kind(kind_name, df_s):
    """Save one daily sales line chart per year (2020-2023) for a category.

    Args:
        kind_name: category name; used for the output folder and file name.
        df_s: frame with '销售日期' and '销量(千克)' columns. Its date column
            is converted to datetime in place.
    """
    df_s['销售日期'] = pd.to_datetime(df_s['销售日期'])

    for year in (2020, 2021, 2022, 2023):
        in_year = df_s[df_s['销售日期'].dt.year == year]
        plt.figure()

        # One plot call per month: points are connected within a month only.
        for month in range(1, 13):
            in_month = in_year[in_year['销售日期'].dt.month == month]
            day_labels = in_month['销售日期'].dt.strftime('%m-%d')
            plt.plot(day_labels, in_month['销量(千克)'], c='r', marker = '.')

        # Only the first and last dates of the year are shown as ticks.
        tick_labels = in_year['销售日期'].dt.strftime('%m-%d').to_list()
        if tick_labels:
            plt.xticks([tick_labels[0], tick_labels[-1]])

        plt.title(f'Plot {year}')
        plt.xlabel('Date')
        plt.ylabel('Sales/Kg')
        plt.grid(True, linestyle = '--', alpha = 0.5)

        create_dir(f'./销售折线图/品类/每日/{kind_name}')
        target_png = './销售折线图/品类/每日/' + kind_name + f'/{kind_name}_{year}_sales_curve.png'
        plt.savefig(target_png)
        plt.close()

In [None]:
def grant_kinds_name(kind_id, info_df):
    """Return the '分类名称' (category name) of the first row whose category code is *kind_id*.

    Raises:
        IndexError: if no row of *info_df* has that category code.
    """
    matching_names = info_df.loc[info_df['分类编码'] == kind_id, '分类名称']
    return matching_names.tolist()[0]

In [None]:
import os
def plot_all_sales_kind(Df):
    """Export a CSV and the yearly daily-sales charts for every category.

    Args:
        Df: per-category daily sales frame with a '大类编码' column.
    """
    out_dir = './销售量数据/品类/每日/'
    create_dir(out_dir)

    # The info table lists the category code once per product; keep each
    # code once (in first-seen order) so every category is exported and
    # plotted a single time instead of once per product.
    all_kinds_id = list(dict.fromkeys(Main.all_info_df['分类编码'].to_list()))

    for id_ in all_kinds_id:
        name = grant_kinds_name(id_, Main.all_info_df)
        tmp_df = Df[Df['大类编码'] == id_]
        pth_csv = os.path.join(out_dir, f'{name}日总销量.csv')
        tmp_df.to_csv(pth_csv, index=False)
        plot_daily_sales_kind(name, tmp_df)

In [None]:
# Export and plot the daily totals of every category.
plot_all_sales_kind(kind_grouped)

大类月销量绘图

In [None]:
def calculate_monthly_sales_by_category():
    """Return the total quantity sold per category per month.

    Adds the columns '大类编码' and '年月' to ``Main.monthly_sales_total``
    in place, then sums the quantities per category and month.

    Returns:
        DataFrame with columns '大类编码', '年月' and '销量(千克)'.
    """
    catalog = Main.all_info_df
    monthly = Main.monthly_sales_total

    # Attach the category code of every item.
    monthly['大类编码'] = monthly['单品编码'].apply(
        lambda item_id: grant_kind(item_id, catalog)
    )

    # Combine year and month into one datetime column.
    year_text = monthly['年份'].astype(str)
    month_text = monthly['月份'].astype(str)
    monthly['年月'] = pd.to_datetime(year_text + '-' + month_text, format='%Y-%m')

    # Monthly total per category.
    per_category = (
        monthly
        .groupby(['大类编码', pd.Grouper(key='年月')])['销量(千克)']
        .sum()
        .reset_index()
    )

    # Missing totals count as zero.
    per_category['销量(千克)'] = per_category['销量(千克)'].fillna(0)
    return per_category

import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")
# Per-category monthly totals; this rebinds kind_grouped, replacing the
# daily frame assigned to the same name in an earlier cell.
kind_grouped = calculate_monthly_sales_by_category()

In [None]:
def plot_monthly_sales_kind(kind_name, df_s):
    """Save one monthly sales line chart per year (2020-2023) for a category.

    Args:
        kind_name: category name; used for the output folder and file name.
        df_s: frame with a datetime '年月' column and a '销量(千克)' column.

    The charts are written to
    ``./销售折线图/品类/每月/{kind_name}/{kind_name}_{year}_sales_curve.png``.
    """
    out_dir = f'./销售折线图/品类/每月/{kind_name}/'
    create_dir(out_dir)

    for year in [2020, 2021, 2022, 2023]:
        year_data = df_s[df_s['年月'].dt.year == year].sort_values('年月')
        plt.figure()

        # Plot the whole year in one call: plotting each month separately
        # draws isolated points, so linestyle='-' never connects them.
        plt.plot(year_data['年月'], year_data['销量(千克)'], marker = '.', linestyle = '-', markersize = 16)

        plt.title(f'Plot {year}')
        plt.xlabel('Month')
        plt.ylabel('Sales/Kg')
        plt.grid(True, linestyle = '--', alpha = 0.5)
        plt.savefig(out_dir + f'{kind_name}_{year}_sales_curve.png')
        plt.close()

def plot_all_sales_kind_monthly(Df):
    """Export a CSV and the yearly monthly-sales charts for every category.

    Args:
        Df: per-category monthly sales frame with a '大类编码' column.
    """
    # to_csv does not create folders, so make sure the target exists (the
    # daily counterpart does the same).
    out_dir = './销售量数据/品类/每月/'
    create_dir(out_dir)

    # The info table lists the category code once per product; keep each
    # code once (in first-seen order) so every category is exported and
    # plotted a single time instead of once per product.
    all_kinds_id = list(dict.fromkeys(Main.all_info_df['分类编码'].to_list()))

    for id_ in all_kinds_id:
        name = grant_kinds_name(id_, Main.all_info_df)
        tmp_df = Df[Df['大类编码'] == id_]
        pth_csv = os.path.join(out_dir, f'{name}月总销量.csv')
        tmp_df.to_csv(pth_csv, index=False)
        plot_monthly_sales_kind(name, tmp_df)

In [None]:
# Export and plot the monthly totals of every category.
plot_all_sales_kind_monthly(kind_grouped)

品类日销量数据指标

In [None]:
# Category names, the folder holding their daily totals, and the list that
# collects every frame loaded by grant_description_for_kind().
Kinds = ['花菜类','花叶类','辣椒类','茄类','食用菌','水生根茎类']
pre = './sales_daily/'
Describe = []
def grant_description_for_kind():
    """Save and print describe() statistics of the daily totals of each category.

    For every name in ``Kinds`` the file ``{pre}{kind}/{kind}日总销量.csv`` is
    read, appended to ``Describe``, summarised with ``DataFrame.describe`` and
    written to ``./describe/daily/``.
    """
    for kind in Kinds:
        source_csv = os.path.join(pre + kind, kind + '日总销量.csv')
        frame = pd.read_csv(source_csv)
        Describe.append(frame)

        # Summarise the frame that was just loaded rather than indexing
        # Describe by position: that breaks when the list already holds
        # frames from an earlier call or when Kinds is not six items long.
        summary = frame.describe().reset_index()
        summary.to_csv(f'./describe/daily/{kind}日销量描述统计信息.csv', index = False)
        print(f'{kind}:\n {summary}')

## Print and save the descriptive statistics of every category
grant_description_for_kind()

品类月销量数据指标

In [None]:
# Category names, the folder holding their monthly totals, and the list that
# collects every frame loaded by grant_description_for_kind().
Kinds = ['花菜类','花叶类','辣椒类','茄类','食用菌','水生根茎类']
pre = './sales_monthly/'
Describe = []
def grant_description_for_kind():
    """Save and print describe() statistics of the monthly totals of each category.

    For every name in ``Kinds`` the file ``{pre}{kind}/{kind}月总销量.csv`` is
    read, appended to ``Describe``, summarised with ``DataFrame.describe`` and
    written to ``./describe/monthly/``.
    """
    for kind in Kinds:
        source_csv = os.path.join(pre + kind, kind + '月总销量.csv')
        frame = pd.read_csv(source_csv)
        Describe.append(frame)

        # Summarise the frame that was just loaded rather than indexing
        # Describe by position: that breaks when the list already holds
        # frames from an earlier call or when Kinds is not six items long.
        summary = frame.describe().reset_index()
        # NOTE(review): the file name reads '述统计信息' while the daily
        # variant writes '描述统计信息'; kept unchanged in case other code
        # reads this exact name.
        summary.to_csv(f'./describe/monthly/{kind}月销量述统计信息.csv', index = False)
        print(f'{kind}:\n {summary}')

## Print and save the descriptive statistics of every category
grant_description_for_kind()

绘制频次直方图

In [None]:
from datetime import timedelta, datetime

def convert(DF, win_size = 2, lap = 2):
    """Sum sales over stepped windows, separately for each year 2020-2023.

    The frame is modified in place: '销售日期' is (re)written as datetime,
    taken from '年月' when no '销售日期' column exists, and a '年份' column
    is added.

    Within each year the recorded dates are walked by position in steps of
    *lap* while at least *win_size* dates remain. Each visited date starts a
    window, and the value stored for it is the sum of '销量(千克)' over all
    rows dated from that date up to, but excluding, *lap* days later.

    Args:
        DF: frame with '销量(千克)' and either '销售日期' or '年月'.
        win_size: minimum number of remaining dates needed to open a window.
        lap: positional step between windows and window length in days.

    Returns:
        Two lists with one entry per year: the window start timestamps and
        the matching window sums.
    """
    date_source = '销售日期' if '销售日期' in DF.columns else '年月'
    DF['销售日期'] = pd.to_datetime(DF[date_source])
    DF['年份'] = DF['销售日期'].dt.year

    def window_totals(year_dates):
        starts, totals = [], []
        pos = 0
        while pos < len(year_dates) - win_size + 1:
            # The first element of the positional window is its start date.
            first_in_window = year_dates[pos: pos + win_size][0]
            start = pd.to_datetime(str(first_in_window))
            stop = start + timedelta(days=lap)
            in_window = (DF['销售日期'] >= start) & (DF['销售日期'] < stop)
            starts.append(start)
            totals.append(sum(DF[in_window]['销量(千克)'].tolist()))
            pos += lap
        return starts, totals

    Mo_x, Mo_y = [], []
    for year in range(2020, 2024):
        dates_of_year = DF[DF['年份'] == year]['销售日期'].tolist()
        starts, totals = window_totals(dates_of_year)
        Mo_x.append(starts)
        Mo_y.append(totals)
    return Mo_x, Mo_y

In [None]:
import pandas as pd
# Category names and the daily totals folder, rebound for the cells below.
Kinds = ['花菜类','花叶类','辣椒类','茄类','食用菌','水生根茎类']
pre = './sales_daily/'

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
def myPlot(b, path, kind, name):
    """Save one frequency histogram with a density curve per year (2020-2023).

    Args:
        b: sequence whose first four entries hold the values of 2020..2023.
        path: base output folder.
        kind: category name (not used by this function).
        name: sub-folder created below *path* for the images.
    """
    # Four years: 2020 through 2023.
    for offset in range(4):
        values = b[offset]
        year = 2020 + offset

        sns.distplot(values, bins=12, hist=True, kde=True, rug=False)
        plt.ylabel('Freq')
        plt.grid(True, linestyle = '--', alpha = 0.5)
        # Legend entry for the density curve.
        plt.legend(['Density'])
        plt.title(f'Fre-Histogram: Year:{year}, win: 4  lap: 2')

        target_dir = path + f'/{name}/'
        create_dir(target_dir)
        plt.savefig(target_dir + f'/{year}.png')
        plt.close()

In [None]:
# Create the histogram output tree: item-level and category-level, each with
# a daily and a monthly folder.
create_dir('./销量频率直方图')
create_dir('./销量频率直方图/单类/')
create_dir('./销量频率直方图/品类/')
create_dir('./销量频率直方图/单类/每日/')
create_dir('./销量频率直方图/单类/每月/')
create_dir('./销量频率直方图/品类/每月/')
create_dir('./销量频率直方图/品类/每日/')

# Source folders of the exported sales CSVs ...
single_path_daily = './销售量数据/单类/每日/'
single_path_monthly = './销售量数据/单类/每月/'
kind_path_daily = './销售量数据/品类/每日/'
kind_path_monthly = './销售量数据/品类/每月/'
src_list = [single_path_daily, single_path_monthly, kind_path_daily, kind_path_monthly]

# ... and the histogram folders, listed in the same order so that index i of
# src_list pairs with index i of obj_list.
obj_SD = './销量频率直方图/单类/每日/'
obj_SM = './销量频率直方图/单类/每月/'
obj_KD = './销量频率直方图/品类/每日/'
obj_KM = './销量频率直方图/品类/每月/'
obj_list = [obj_SD, obj_SM, obj_KD, obj_KM]

# One sub-folder per category inside every histogram folder.
for k in Kinds:
    for p in obj_list:
        create_dir(p+k)

# For every source folder and category: read each CSV, compute the windowed
# sums per year and save the histograms.
for i in range(4):
    src = src_list[i]
    for k in Kinds:
        fold = os.path.join(src, k)
        # Use the per-category sub-folder when it exists, otherwise fall back
        # to the source folder itself.
        # NOTE(review): with the fallback, every entry of src is listed again
        # for each category k — confirm this is intended.
        if os.path.exists(fold):
            fold = fold
        else: fold = src
        f_list = os.listdir(fold)
        for f in f_list:
            df_ = pd.read_csv(os.path.join(fold, f))
            window_size, lap = 4, 2
            a, b = convert(df_, window_size, lap)
            path = obj_list[i]+k+'/'
            # Text before the first '.', minus its first six characters.
            # NOTE(review): six characters matches the '每日销售量_' /
            # '每月销售量_' prefixes of the item-level files; verify the
            # result for the category-level file names.
            name = ''.join(f.split('.')[0])[6:]
            myPlot(b, path, k, name)
            print(f'{name} complete')