# 载入套件

In [None]:
from typing import Dict
import numpy as np
import pandas as pd

import ydata_profiling
import scienceplots
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib

%matplotlib inline

# Global plotting configuration for the whole notebook.
# Register pandas' date/time converters with matplotlib.
pd.plotting.register_matplotlib_converters()
# Apply three matplotlib style sheets together, in the listed order.
# NOTE(review): 'science' and 'grid' are presumably provided by the
# `scienceplots` import above -- confirm before removing that import.
plt.style.use(['fivethirtyeight', 'science', 'grid'])
# Seaborn settings: serif font, fonts scaled by 1.2, LaTeX text rendering.
# NOTE(review): 'text.usetex': True presumably requires a working LaTeX
# installation on the machine running the notebook -- confirm.
# NOTE(review): sns.set() runs after plt.style.use() and may override parts
# of the style sheets selected above -- confirm this ordering is intended.
sns.set(
    rc={'text.usetex': True},
    font="serif",
    font_scale=1.2
)


In [None]:
# Load the daily pickles day01.pkl ... day21.pkl into a dict keyed by the
# integer day number (1..21).
# NOTE(review): unpickling can execute arbitrary code -- only load files
# from a trusted source.
days: Dict[int, pd.DataFrame] = dict(
    (day_no, pd.read_pickle(f'../Datasets/day{day_no:02d}.pkl'))
    for day_no in range(1, 22)
)


# 使用时长

In [None]:
# Total usage duration per user (uid) and app class over days 1-7.
dfs = []
value = 'duration'
for day in range(1, 8):
    # One row per uid, one column per app_class, holding that day's summed
    # `value`; uid/app_class pairs without records are filled with 0.
    pt = days[day].pivot_table(
        index='uid',
        columns='app_class',
        values=value,
        # String alias instead of np.sum: passing the NumPy callable is
        # deprecated in recent pandas and emits a FutureWarning.
        aggfunc='sum',
        fill_value=0
    )

    # Flatten the pivot table so that uid becomes an ordinary column, then
    # prefix every app-class column with the measure name.
    feature = pd.DataFrame(pt.to_records())
    feature.columns = [
        col if col == 'uid' else f'{value}_{col}' for col in feature.columns]
    # Discard the 'NaN' app class. errors='ignore' keeps a day that has no
    # such column from aborting the loop with a KeyError.
    feature = feature.drop(columns=f'{value}_NaN', errors='ignore')
    dfs.append(feature)

# Stack the daily tables and add them up per user. App classes absent on a
# given day appear as NaN after the concat and are skipped by sum().
feature_duration = pd.concat(dfs, ignore_index=True).groupby('uid').sum()
feature_duration


# 上行流量

In [None]:
# Total upstream traffic per user (uid) and app class over days 1-7.
dfs = []
value = 'up_flow'
for day in range(1, 8):
    # One row per uid, one column per app_class, holding that day's summed
    # `value`; uid/app_class pairs without records are filled with 0.
    pt = days[day].pivot_table(
        index='uid',
        columns='app_class',
        values=value,
        # String alias instead of np.sum: passing the NumPy callable is
        # deprecated in recent pandas and emits a FutureWarning.
        aggfunc='sum',
        fill_value=0
    )

    # Flatten the pivot table so that uid becomes an ordinary column, then
    # prefix every app-class column with the measure name.
    feature = pd.DataFrame(pt.to_records())
    feature.columns = [
        col if col == 'uid' else f'{value}_{col}' for col in feature.columns]
    # Discard the 'NaN' app class. errors='ignore' keeps a day that has no
    # such column from aborting the loop with a KeyError.
    feature = feature.drop(columns=f'{value}_NaN', errors='ignore')
    dfs.append(feature)

# Stack the daily tables and add them up per user. App classes absent on a
# given day appear as NaN after the concat and are skipped by sum().
feature_upflow = pd.concat(dfs, ignore_index=True).groupby('uid').sum()
feature_upflow


# 下行流量

In [None]:
# Total downstream traffic per user (uid) and app class over days 1-7.
dfs = []
value = 'down_flow'
for day in range(1, 8):
    # One row per uid, one column per app_class, holding that day's summed
    # `value`; uid/app_class pairs without records are filled with 0.
    pt = days[day].pivot_table(
        index='uid',
        columns='app_class',
        values=value,
        # String alias instead of np.sum: passing the NumPy callable is
        # deprecated in recent pandas and emits a FutureWarning.
        aggfunc='sum',
        fill_value=0
    )

    # Flatten the pivot table so that uid becomes an ordinary column, then
    # prefix every app-class column with the measure name.
    feature = pd.DataFrame(pt.to_records())
    feature.columns = [
        col if col == 'uid' else f'{value}_{col}' for col in feature.columns]
    # Discard the 'NaN' app class. errors='ignore' keeps a day that has no
    # such column from aborting the loop with a KeyError.
    feature = feature.drop(columns=f'{value}_NaN', errors='ignore')
    dfs.append(feature)

# Stack the daily tables and add them up per user. App classes absent on a
# given day appear as NaN after the concat and are skipped by sum().
feature_downflow = pd.concat(dfs, ignore_index=True).groupby('uid').sum()
feature_downflow


# 使用频次

# Number of usage records per user (uid) and app class over days 1-7.
dfs = []
value = 'count'
for day in range(1, 8):
    # One row per uid, one column per app_class. The cell value is how many
    # non-null 'duration' entries that user has for that class on that day;
    # uid/app_class pairs without records are filled with 0.
    pt = days[day].pivot_table(
        index='uid',
        columns='app_class',
        values='duration',
        aggfunc='count',
        fill_value=0
    )

    # Flatten the pivot table so that uid becomes an ordinary column, then
    # prefix every app-class column with the measure name.
    feature = pd.DataFrame(pt.to_records())
    feature.columns = [
        col if col == 'uid' else f'{value}_{col}' for col in feature.columns]
    # Discard the 'NaN' app class. errors='ignore' keeps a day that has no
    # such column from aborting the loop with a KeyError.
    feature = feature.drop(columns=f'{value}_NaN', errors='ignore')
    dfs.append(feature)

# Stack the daily tables and add them up per user. App classes absent on a
# given day appear as NaN after the concat and are skipped by sum().
feature_count = pd.concat(dfs, ignore_index=True).groupby('uid').sum()
feature_count


# 特征融合

将前 7 日各类 APP 的使用时长、上行流量、下行流量与使用频次按 uid 对齐,横向拼接为一张特征表并保存。

In [None]:
# Place the four per-user feature tables side by side; concat with axis=1
# aligns them on their shared uid index.
feature_blocks = [
    feature_duration,
    feature_upflow,
    feature_downflow,
    feature_count,
]
merged = pd.concat(feature_blocks, axis=1)

# Round-trip through to_records() so that uid becomes a regular column,
# then persist the combined table for later steps.
features = pd.DataFrame(merged.to_records())
features.to_pickle('../Datasets/features_q1.pkl')
