In [2]:
### 加载数据
import pandas
import pickle
import numpy
from pyecharts.charts import Bar
from pyecharts.charts import Pie
from pyecharts import options as opts

# Switch read by the export cell: when True the cleaned data is written to disk.
SAVE_DATA = False

# Load the raw CSV and report its row count.
dataset = pandas.read_csv('dataset.csv')
print(f'Total data: {len(dataset)}')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
Total data: 800000


In [3]:
### Data cleaning
# Drop the columns that the analysis does not use.
data = dataset.drop(
    columns=[
        'Unnamed: 0',
        '提交答卷时间',
        '所用时间',
        '来源',
        '来源详情',
        '来自IP',
        '1、请问您的学校所在地区和类别：'
    ]
)

# Remember the original column titles, then relabel the columns 0..n-1 so the
# rest of the notebook can address them by integer position.
COLS_NAME = data.columns.values.tolist()
data.columns = list(range(len(COLS_NAME)))

# Column 91 (region, per the original note) contains a couple of -3 outliers;
# remove those rows. The explicit .copy() makes the filtered frame independent,
# so the in-place column update below does not act on a derived slice and
# does not raise SettingWithCopyWarning.
data = data[data[91] != -3].copy()

# Question 14 answers (columns 53..57) are coded 1 or 2 per the original note;
# shift them down by one so they can be handled uniformly.
data.iloc[:, 53:58] = data.iloc[:, 53:58] - 1

# Column 22 (online study time) contains invalid and empty values: keep only
# rows whose value is one of 0..15, then cast the whole frame to int.
data = data[data[22].isin(list(range(16)))].astype(int)

# Report the number of rows that survived cleaning.
print(f'Data cleaning completed！Total:\n{data.shape[0]}')

Data cleaning completed！Total:
750841


In [4]:
# Optional export: write the cleaned frame to CSV and pickle the original
# column titles next to it. Skipped unless SAVE_DATA is truthy.
if SAVE_DATA:
    data.to_csv('./cooked.csv')
    with open('./COLS_NAME.dat', 'wb') as names_file:
        pickle.dump(COLS_NAME, names_file)

In [5]:
# Rich-text label options reused by the pie charts in this notebook.
# The formatter only references the `b` and `per` rich styles; the `a`, `abg`
# and `hr` entries are defined but not used by this formatter string.
PIE_SETTINGS = opts.LabelOpts(
    position="outside",
    formatter="{b|{b}: }{per|{d}%}  ",
    background_color="#eee",
    border_color="#aaa",
    border_width=1,
    border_radius=4,
    rich=dict(
        a=dict(color="#999", lineHeight=22, align="center"),
        abg=dict(
            backgroundColor="#e3e3e3",
            width="100%",
            align="right",
            height=22,
            borderRadius=[4, 4, 0, 0],
        ),
        hr=dict(
            borderColor="#aaa",
            width="100%",
            borderWidth=0.5,
            height=0,
        ),
        b=dict(fontSize=16, lineHeight=33),
        per=dict(
            color="#eee",
            backgroundColor="#334455",
            padding=[2, 4],
            borderRadius=2,
        ),
    ),
)

In [6]:
# Grade distribution of the surveyed students (rendered as a bar chart).
# Column 1 is used as a 1-based grade code matching the 12 axis labels below.
# Count every code in a single vectorised pass instead of looping over each
# row in Python. reindex() pins the result to codes 1..12 in order and fills
# grades with no respondents with 0; codes outside 1..12 are ignored rather
# than landing in the wrong bucket through negative indexing or raising.
res = (
    data[1]
    .value_counts()
    .reindex(list(range(1, 13)), fill_value=0)
    .tolist()  # plain Python ints
)

bar = (
    Bar()
    .add_xaxis(["一年级", "二年级", "三年级", "四年级", "五年级", "六年级", "初一", "初二", "初三", "高一", "高二", "高三"])
    .add_yaxis("问卷人数", res)
    .set_global_opts(
        title_opts=opts.TitleOpts(title="学生的年级分布"),
        legend_opts=opts.LegendOpts(is_show=True)
    )
)
bar.render_notebook()

In [7]:
# Device usage: for each of columns 2..7 count the rows answered with 1,
# convert the counts to fractions of their total, and pair each fraction
# with its device label for the pie chart.
keys = ['电视', '台式电脑', '平板', '手机', '音频', '纸质学习资料']
res = numpy.array([data[col].value_counts()[1] for col in range(2, 8)])
res = [[label, share] for label, share in zip(keys, res / res.sum())]

# Build the ring-shaped pie step by step and render it as the cell output.
pie = Pie()
pie.add(
    "",
    res,
    radius=["40%", "55%"],
    label_opts=PIE_SETTINGS,
)
pie.set_global_opts(title_opts=opts.TitleOpts(title="学生上课设备使用情况统计"))
pie.render_notebook()

In [8]:
# Platform feature usage: number of respondents who selected each feature.
keys = ['回看课程视频', '作业提交', '随堂测试', '视频会议', '作业批改反馈', '课堂发言', '班级通知', '班级圈', '优秀作业查看', '学科竞赛游戏', '屏幕共享', '弹幕', '讨论']
# The feature columns start at column 8, one per label. Deriving the end of
# the range from len(keys) keeps labels and values aligned: the previous
# hard-coded range(8, 20) covered only 12 columns for 13 labels, leaving the
# last label without a value.
# Counting `== 1` yields 0 for a feature nobody selected instead of raising
# KeyError; int() converts the numpy integer to a plain Python int.
res = [int((data[idx] == 1).sum()) for idx in range(8, 8 + len(keys))]

bar = (
    Bar()
    .add_xaxis(keys)
    .add_yaxis('使用人数', res)
    .set_global_opts(
        # Rotate the category labels so the long names do not overlap.
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=30)),
        title_opts=opts.TitleOpts(title="平台功能使用情况"),
        legend_opts=opts.LegendOpts(is_show=True)
    )
)

bar.render_notebook()

In [9]:
# Lesson duration: share of respondents per duration category (column 21,
# answer codes 1..4 matched in order to the labels below).
keys = ['20分钟', '20～30分钟', '30～45分钟', '45分钟以上']
# Compute the frequency table once and pick codes 1..4 from it, instead of
# re-running value_counts() over the whole column for every category.
# .loc with a list of labels raises KeyError if a code is absent, the same
# failure mode as the original per-code lookup.
res = data[21].value_counts().loc[[1, 2, 3, 4]].to_numpy()
res = res / res.sum()  # convert counts to fractions of the total
res = [list(x) for x in zip(keys, res)]

pie = (
    Pie()
    .add(
        "",
        res,
        radius=["40%", "55%"],
        label_opts=PIE_SETTINGS
    )
    .set_global_opts(title_opts=opts.TitleOpts(title="学生上课时长情况统计"))
)
pie.render_notebook()