# 载入套件

In [None]:
from typing import Dict
from tqdm.auto import tqdm

import numpy as np
import random
import pandas as pd
import ydata_profiling

import scienceplots
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib

import torch
import os

%matplotlib inline

# Register pandas' formatters/converters with matplotlib.
pd.plotting.register_matplotlib_converters()

# Configure the seaborn theme in ONE call.  `sns.set` is an alias of
# `sns.set_theme`, which re-applies its own defaults (style="darkgrid",
# palette="deep") for every argument that is not passed.  Calling
# `sns.set_style("whitegrid")` / `sns.set_palette("RdBu")` first and
# `sns.set(...)` afterwards therefore discards both settings, so the style
# and palette are passed here explicitly.
# NOTE(review): 'text.usetex': True presumably requires a working LaTeX
# installation on the machine rendering the figures -- confirm.
sns.set(
    style="whitegrid",
    palette="RdBu",
    font="serif",
    font_scale=1.2,
    rc={'text.usetex': True},
)

# Load the daily tables into a dict keyed by day number (1..21).
# File names carry a zero-padded two-digit day: day01.pkl ... day21.pkl.
days = {
    day: pd.read_pickle(f'../Datasets/day{str(day).zfill(2)}.pkl')
    for day in range(1, 22)
}


# 工具函数

In [None]:
SEED = 20230723


def same_seed(seed=SEED):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and CUDA), writes ``PYTHONHASHSEED`` into the environment, and
    switches cuDNN to deterministic mode with benchmarking disabled.

    Args:
        seed: Integer seed applied to every generator. Defaults to ``SEED``.
    """
    # NOTE(review): PYTHONHASHSEED is presumably only read at interpreter
    # start-up, so this assignment would affect child processes rather than
    # the running kernel -- confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Python and NumPy global generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators: CPU, current CUDA device, then all CUDA devices
    # when CUDA is actually available.
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # cuDNN: no auto-tuned kernel selection, deterministic mode on.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


same_seed()


# 辅助表格


app_class.csv，共两列。第一列是 appid，第二列是 app_class，给出 4000 多个常用 APP 的所属类别，比如：社交类、影视类、教育类等，用英文字母 a-t 表示，共 20 个常用的所属类别；其余 APP 不常用，所属类别未知。

In [None]:
# Auxiliary table: category of commonly used apps.
# Finding: the raw table contains duplicate rows, so they are dropped on load.
# The CSV has no header row; column names are assigned explicitly.
app = pd.read_csv('../Datasets/app_class.csv', header=None).drop_duplicates()
app.columns = ['appid', 'app_class']

# Copy taken BEFORE the dtype conversion below, so it keeps the dtypes as
# read from the CSV (not locked to categorical).
app_copy = app.copy()

# Constrain both columns to the categorical dtype.
app = app.astype({'appid': 'category', 'app_class': 'category'})

# Print the shape and the per-class counts.  The counts table is printed
# explicitly: only the LAST expression of a notebook cell is displayed, so a
# bare expression in the middle of the cell would be computed and discarded.
print('app:', app.shape)
print(pd.DataFrame(app.value_counts('app_class')))
app.describe()


In [None]:
# The 21 days of monitoring data contain 36435 distinct appids in total
# (figure recorded by the original author).
# Collect the union of the appids seen on each day.
appid = set()
for i in range(1, 22):
    appid.update(days[i]['appid'].unique())
len(appid)


In [None]:
# Left-join every observed appid against the class table, then report the
# shape of the rows whose app_class is NaN (appids with no known class).
val = pd.DataFrame({'appid': list(appid)}).merge(
    app_copy,
    how='left',
    on='appid',
)
val[val['app_class'].isna()].shape


In [None]:
# Shape of the app_class values that are present (classes a-t), i.e. the
# observed appids that do have a known class.
val.loc[val['app_class'].notna(), 'app_class'].shape


In [None]:
# Count plot of observed appids per app class, with rows sorted by class
# first.  Figure is 3.5 high with an aspect ratio of 10/3.5.
sns.catplot(
    data=val.sort_values('app_class'),
    x='app_class',
    kind='count',
    height=3.5,
    aspect=10 / 3.5,
)

# Save the current figure for the thesis, tightly cropped with no padding.
plt.savefig(
    "../Thesis/figures/app_class_countplot_in_days.pdf",
    bbox_inches='tight',
    pad_inches=0,
    dpi=400,
)


In [None]:
# Number of observed appids per class, with missing classes shown as the
# literal label 'NaN', sorted ascending by count.
# NOTE(review): sorting by the 'count' column presumably relies on
# value_counts() naming its result 'count' -- confirm the pandas version.
(
    val.fillna('NaN')
    .value_counts(['app_class'])
    .to_frame()
    .sort_values(by='count')
)
