In [1]:
import time
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the labelled uid-pair table used by every cell below.
df = pd.read_csv(filepath_or_buffer='data/uid_voice_label.csv')

In [3]:
# Preview the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,uid,related_uid,scores,uid_label,related_label
0,16430050,24612414,31.14,1,1
1,21660333,709410,40.05,1,1
2,61502147,24952341,47.4,1,1
3,24381202,24590001,36.08,1,1
4,24775663,24667703,31.62,1,1


In [8]:
def splitUser(data,good_label=0):
    """
    Split the users of an edge table into a good set and a bad set.

    Both endpoint columns are scanned: ``uid`` is classified by ``uid_label``
    and ``related_uid`` by ``related_label``. A user is "good" in a row when
    its label equals ``good_label`` and "bad" when it differs, so a user whose
    label is inconsistent across rows ends up in both sets.

    param:
        data: DataFrame with the columns uid, related_uid, uid_label and
              related_label
        good_label: label value that marks a good user
    return:
        good_user: set of uids whose label equals good_label
        bad_user: set of uids whose label differs from good_label
    """
    # Read from the ``data`` argument rather than a notebook-level global so
    # the function works on whichever frame the caller passes in.
    uid_is_good = data['uid_label'] == good_label
    related_is_good = data['related_label'] == good_label
    uid_is_bad = data['uid_label'] != good_label
    related_is_bad = data['related_label'] != good_label

    good_user = (set(data.loc[uid_is_good, 'uid'].values)
                 | set(data.loc[related_is_good, 'related_uid'].values))
    bad_user = (set(data.loc[uid_is_bad, 'uid'].values)
                | set(data.loc[related_is_bad, 'related_uid'].values))
    return good_user,bad_user

In [13]:
def buildNetworkx(data,min_score=30):
    """
    Build a directed, weighted graph from an edge table.

    param:
        data: table whose rows expose uid, related_uid and scores
        min_score: rows whose score is below this value are left out
    return:
        dg: nx.DiGraph in which every kept row is added as an edge
            uid -> related_uid weighted by the row's score
    """
    # Lazily yield one (source, target, weight) triple per row that passes
    # the score threshold, in the table's row order.
    weighted_edges = (
        (row.uid, row.related_uid, row.scores)
        for row in data.itertuples()
        if not row.scores < min_score
    )
    graph = nx.DiGraph()
    graph.add_weighted_edges_from(weighted_edges)
    return graph

In [14]:
# splitUser returns (good_user, bad_user) relative to good_label. Here label 1
# is passed as good_label and the two results are unpacked in swapped order, so
# bad_user receives the users labelled 1 and good_user receives the users with
# any other label.
# NOTE(review): presumably label 1 marks bad users in this dataset - confirm.
bad_user,good_user = splitUser(df,good_label=1)
# Build the directed similarity graph with the default min_score threshold (30).
dg=buildNetworkx(df)

In [16]:
def _degreeSummary(values):
    """Return (min, max, mean, std) of ``values``; four NaNs when it is empty."""
    if len(values) == 0:
        # min()/max() raise on an empty sequence; report the group as NaN instead.
        return (np.nan, np.nan, np.nan, np.nan)
    return (min(values), max(values), np.mean(values), np.std(values))


def _groupDegreeSummary(dg, users):
    """Summarise weighted total/out/in degree of ``users``, keyed by column suffix."""
    users = list(users)  # freeze one iteration order for the three passes
    return {
        'degree': _degreeSummary([dg.degree(uid, weight='weight') for uid in users]),
        'outDegree': _degreeSummary([dg.out_degree(uid, weight='weight') for uid in users]),
        'inDegree': _degreeSummary([dg.in_degree(uid, weight='weight') for uid in users]),
    }


def comDegree(dg,bad_user,good_user):
    """
    Compare the weighted degree distribution of good and bad users.

    For each group the weighted degree, out-degree and in-degree of every uid
    are read from ``dg`` (edge attribute ``'weight'``) and reduced to
    min / max / mean / std.

    param:
        dg: directed graph exposing degree, out_degree and in_degree
        bad_user: collection of bad-user uids
        good_user: collection of good-user uids
    return
        DataFrame with two rows (good_user first, then bad_user) and the
        columns user_type, uid_num and {min,max,avg,std}_{degree,outDegree,
        inDegree}. The statistics of an empty group are NaN.
    """
    good = _groupDegreeSummary(dg, good_user)
    bad = _groupDegreeSummary(dg, bad_user)

    dt = {'user_type':['good_user','bad_user'],
          'uid_num':[len(good_user),len(bad_user)]}
    columns = ['user_type', 'uid_num']
    for kind in ('degree', 'outDegree', 'inDegree'):
        for position, stat in enumerate(('min', 'max', 'avg', 'std')):
            name = stat + '_' + kind
            dt[name] = [good[kind][position], bad[kind][position]]
            columns.append(name)
    # Pass the column list explicitly so the layout does not depend on dict order.
    return pd.DataFrame(dt, columns=columns)

In [21]:
# Weighted degree statistics for the two user groups.
tt = comDegree(dg, bad_user=bad_user, good_user=good_user)
tt

Unnamed: 0,user_type,uid_num,min_degree,max_degree,avg_degree,std_degree,min_outDegree,max_outDegree,avg_outDegree,std_outDegree,min_inDegree,max_inDegree,avg_inDegree,std_inDegree
0,good_user,2324,30.0,726.11,44.095873,31.423335,0,117.06,7.033171,16.495567,0,726.11,37.062702,33.782898
1,bad_user,203796,30.0,5024.75,56.109544,97.379592,0,4970.05,28.225994,69.56585,0,2811.07,27.88355,55.174697


In [19]:
def _meanHopsFrom(dg, source, cache):
    """
    Mean shortest-path length from ``source`` to every other node it reaches,
    or None when it reaches none. Results are memoised in ``cache``.
    """
    if source not in cache:
        lengths = nx.single_source_shortest_path_length(dg, source)
        # Drop the source's own zero-length entry by key rather than by position.
        hops = [dist for node, dist in lengths.items() if node != source]
        cache[source] = np.mean(hops) if hops else None
    return cache[source]


def _userPathLengths(dg, users, cache):
    """
    One value per user: the average of the mean hop counts of the user and of
    each of its descendants, or 0 when the user has no descendants.
    """
    per_user = []
    for uid in users:
        reachable = nx.descendants(dg, uid)
        if len(reachable) == 0:
            per_user.append(0)
            continue
        # The user itself comes first, then every node it can reach.
        candidates = [uid]
        candidates.extend(reachable)
        means = [mean_hops
                 for mean_hops in (_meanHopsFrom(dg, node, cache) for node in candidates)
                 if mean_hops is not None]
        per_user.append(np.mean(means))
    return per_user


def _pathLengthSummary(values):
    """Return (min, max, mean, std) of ``values``; four NaNs when it is empty."""
    if len(values) == 0:
        # min()/max() raise on an empty sequence; report the group as NaN instead.
        return (np.nan, np.nan, np.nan, np.nan)
    return (min(values), max(values), np.mean(values), np.std(values))


def comPathLength(dg,bad_user,good_user):
    """
    Compare the average path length around good and bad users.

    For every uid the nodes reachable from it are collected. The mean
    shortest-path length is computed from the uid and from each reachable node
    (nodes that reach nothing are skipped), and those means are averaged into
    one value per uid. A uid with no reachable node scores 0. The per-uid
    values of each group are then reduced to min / max / mean / std.

    param:
        dg: directed graph
        bad_user: collection of bad-user uids
        good_user: collection of good-user uids
    return
        DataFrame with two rows (good_user first, then bad_user) and the
        columns user_type, uid_num, min_length, max_length, avg_length and
        std_length. The statistics of an empty group are NaN.
    """
    # A node's mean hop count does not depend on which user reached it, so one
    # cache is shared by both groups and each node is searched at most once.
    cache = {}
    bad = _pathLengthSummary(_userPathLengths(dg, bad_user, cache))
    good = _pathLengthSummary(_userPathLengths(dg, good_user, cache))

    dt = {'user_type':['good_user','bad_user'],'uid_num':[len(good_user),len(bad_user)],
          'min_length':[good[0],bad[0]],
          'max_length':[good[1],bad[1]],
          'avg_length':[good[2],bad[2]],
          'std_length':[good[3],bad[3]]
         }
    columns = ['user_type', 'uid_num',
               'min_length', 'max_length', 'avg_length', 'std_length']
    return pd.DataFrame(dt, columns=columns)

In [20]:
# Average-path-length statistics for the two user groups.
tt = comPathLength(dg, bad_user=bad_user, good_user=good_user)
tt

Unnamed: 0,user_type,uid_num,min_length,max_length,avg_length,std_length
0,good_user,2324,0,2.206605,0.177204,0.388457
1,bad_user,203796,0,2.624671,0.546862,0.525628


In [22]:
def _clusterCoefficients(dg, users):
    """
    Weighted clustering coefficient of every uid in ``users``, in iteration
    order. Raises KeyError for a uid that is not a node of ``dg``.
    """
    users = list(users)
    if len(users) == 0:
        return []
    # One bulk call per group: asking networkx for one node at a time repeats
    # its per-call setup for every uid.
    by_node = nx.clustering(dg, users, weight='weight')
    return [by_node[uid] for uid in users]


def _clusterSummary(values):
    """Return (min, max, mean, std) of ``values``; four NaNs when it is empty."""
    if len(values) == 0:
        # min()/max() raise on an empty sequence; report the group as NaN instead.
        return (np.nan, np.nan, np.nan, np.nan)
    return (min(values), max(values), np.mean(values), np.std(values))


def comCluster(dg,bad_user,good_user):
    """
    Compare the weighted clustering coefficient of good and bad users.

    The clustering coefficient (edge attribute ``'weight'``) of every uid is
    computed with networkx and reduced per group to min / max / mean / std.

    param:
        dg: graph
        bad_user: collection of bad-user uids
        good_user: collection of good-user uids
    return
        DataFrame with two rows (good_user first, then bad_user) and the
        columns user_type, uid_num, min_cluster, max_cluster, avg_cluster and
        std_cluster. The statistics of an empty group are NaN.
    """
    good = _clusterSummary(_clusterCoefficients(dg, good_user))
    bad = _clusterSummary(_clusterCoefficients(dg, bad_user))

    dt = {'user_type':['good_user','bad_user'],'uid_num':[len(good_user),len(bad_user)],
          'min_cluster':[good[0],bad[0]],
          'max_cluster':[good[1],bad[1]],
          'avg_cluster':[good[2],bad[2]],
          'std_cluster':[good[3],bad[3]]
         }
    columns = ['user_type', 'uid_num',
               'min_cluster', 'max_cluster', 'avg_cluster', 'std_cluster']
    return pd.DataFrame(dt, columns=columns)

In [23]:
# Clustering-coefficient statistics for the two user groups.
tt = comCluster(dg, bad_user=bad_user, good_user=good_user)
tt

Unnamed: 0,user_type,uid_num,min_cluster,max_cluster,avg_cluster,std_cluster
0,good_user,2324,0,0.526786,0.009554,0.049271
1,bad_user,203796,0,0.688799,0.016611,0.063429
