In [18]:
from typing import Dict
from tqdm.auto import tqdm

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.cluster import DBSCAN

import numpy as np
import random
import pandas as pd
import ydata_profiling

import scienceplots
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib

import torch
import os

%matplotlib inline

pd.plotting.register_matplotlib_converters()
# Configure seaborn in a single call. `sns.set(...)` is an alias of
# `sns.set_theme(...)`, whose defaults reset the style and palette, so separate
# `set_style` / `set_palette` calls placed before it are silently discarded.
# Passing everything here makes the whitegrid style and RdBu palette stick.
sns.set_theme(
    style="whitegrid",
    palette="RdBu",
    font="serif",
    font_scale=1.2,
    rc={'text.usetex': True},  # text is rendered through LaTeX
)


# 特征建立

## 无监督学习特征

In [19]:
# Load the per-user feature table and standardise it one feature family at a
# time: every column whose name starts with the family prefix is shifted and
# scaled by a single mean / std computed over the whole family.
features = pd.read_pickle('../Datasets/features_q1.pkl')
for aspect in ('duration', 'up_flow', 'down_flow', 'count'):
    cols = [name for name in features.columns if name.startswith(aspect)]
    block = features[cols].values
    features[cols] = (features[cols] - block.mean()) / block.std()

# Project everything except the user id onto 3 principal components.
feature_matrix = features.drop(columns='uid')
pca = PCA(n_components=3)
transformed = pca.fit(feature_matrix).transform(feature_matrix)

# Partition the users into 5 clusters in the reduced space.
model = KMeans(n_clusters=5, random_state=0, n_init="auto", max_iter=1000)
labels_kmeans = model.fit_predict(transformed)

# uid -> cluster label lookup (rows and labels are in the same order).
user2cluster = dict(zip(features['uid'].values, labels_kmeans))


In [32]:
# Turn the uid -> cluster mapping into a two-column frame that can be merged
# on 'uid'.
cluster_uids = list(user2cluster)
cluster_labels = list(user2cluster.values())
features_user2cluster = pd.DataFrame(
    {'uid': cluster_uids, 'cluster': cluster_labels}
)


## 监测数据特征

In [20]:
# Read the daily pickles day01.pkl ... day21.pkl into a dict keyed by the
# integer day number.
days = {}
for day_no in range(1, 22):
    days[day_no] = pd.read_pickle(f'../Datasets/day{day_no:02d}.pkl')


In [27]:
# Build one long table with a row per (uid, day) holding the user's total
# `duration` on app class 'a' for that day.
dfs = []
for day, frame in days.items():
    usage_a = frame.pivot_table(
        index='uid',
        values='duration',
        columns='app_class',
        # String alias instead of np.sum: passing the NumPy callable is
        # deprecated in recent pandas and emits a FutureWarning.
        aggfunc='sum',
    )['a'].reset_index()  # -> columns: uid, a
    usage_a['day'] = day
    dfs.append(usage_a)

# Reshape to one row per user and one column per day. The long table has at
# most one row per (uid, day), so pivot_table's default mean aggregation leaves
# the values unchanged; days without a value for a user become 0.
usage_wide = pd.concat(dfs, ignore_index=True).pivot_table(
    index='uid',
    values='a',
    columns='day',
).fillna(0)

# The to_records round trip is kept on purpose: it moves `uid` from the index
# into a regular column and, unlike reset_index(), turns the day column labels
# into strings ('1' ... '21').
features_q2 = pd.DataFrame(usage_wide.to_records())


In [37]:
# Attach each user's KMeans cluster label. Any `cluster` column left over from
# a previous run of this cell is dropped first, so re-running the cell does not
# produce `cluster_x` / `cluster_y` duplicates.
features_q2 = features_q2.drop(columns='cluster', errors='ignore').merge(
    features_user2cluster, on='uid', how='left')
# Users without a cluster assignment get the sentinel -1 instead of 0: 0 is a
# real KMeans label, so filling with it would silently place unmatched users
# into that cluster.
features_q2['cluster'] = features_q2['cluster'].fillna(-1).astype(int)


In [39]:
# Store the cluster label as a categorical column rather than a number.
features_q2 = features_q2.astype({'cluster': 'category'})


In [41]:
# Persist the finished feature table next to the other datasets.
features_q2_path = '../Datasets/features_q2.pkl'
features_q2.to_pickle(features_q2_path)
