In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pymongo import MongoClient
from pandas.io.json import json_normalize

plt.style.use('ggplot')
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']  #解决seaborn中文字体显示问题
plt.rc('figure', figsize=(10, 10))  #把plt默认的图片size调大一点
plt.rcParams["figure.dpi"] =mpl.rcParams['axes.unicode_minus'] = False # 解决保存图像是负号'-'显示为方块的问题
%matplotlib inline

In [2]:
# Create the MongoClient for the local server.
conn = MongoClient('127.0.0.1', 27017)

# Pick the CaiXuKun database, then its `repost` collection.
db = conn['CaiXuKun']
repost = db['repost']

# Query every record stored in the collection (no filter).
mon_data = repost.find()

In [3]:
# Materialise the query result and flatten the nested documents into dotted columns
# (e.g. `user.gender`), as seen in the column listing below.
data = json_normalize(list(mon_data))

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102313 entries, 0 to 102312
Data columns (total 97 columns):
_id                                  102313 non-null object
attitudes_count                      102313 non-null int64
bid                                  102313 non-null object
can_edit                             102313 non-null bool
cardid                               1248 non-null object
comments_count                       102313 non-null int64
content_auth                         102313 non-null int64
created_at                           102313 non-null object
darwin_tags                          102313 non-null object
edit_at                              10 non-null object
edit_count                           10 non-null float64
favorited                            102313 non-null bool
hide_flag                            102313 non-null int64
id                                   102313 non-null object
isLongText                           102313 non-null bool
is_impor

In [6]:
data.sample(5)

Unnamed: 0,_id,attitudes_count,bid,can_edit,cardid,comments_count,content_auth,created_at,darwin_tags,edit_at,...,user.mbtype,user.profile_image_url,user.profile_url,user.screen_name,user.statuses_count,user.urank,user.verified,user.verified_reason,user.verified_type,user.verified_type_ext
76462,5c85cc1af2766b0bd3eac927,0,HknmbuU4Q,False,,0,0,3小时前,[],,...,0,https://tvax4.sinaimg.cn/crop.0.0.640.640.180/...,https://m.weibo.cn/u/7012413636?uid=7012413636,困困的菜ICV124,33,4,False,,-1,
8855,5c84a991f2766b0bd3e75385,0,HkgoNlxW5,False,,0,0,52分钟前,[],,...,0,https://tvax3.sinaimg.cn/crop.0.0.640.640.180/...,https://m.weibo.cn/u/7012480301?uid=7012480301,雨露奎哥tCs622,19,1,False,,-1,
78023,5c85d1c1f2766b0bd3eadd3a,0,HkoAu3MmB,False,,0,0,13分钟前,[],,...,0,https://tvax1.sinaimg.cn/crop.0.0.640.640.180/...,https://m.weibo.cn/u/7012731691?uid=7012731691,最酷的坤XLL749,27,3,False,,-1,
88036,5c85f7a0f2766b0bd3eb4fb1,0,HkphJ57ZE,False,,0,0,1小时前,[],,...,0,https://tvax2.sinaimg.cn/crop.0.0.640.640.180/...,https://m.weibo.cn/u/7017930638?uid=7017930638,超赞坤哥cVR094,24,3,False,,-1,
73676,5c85c47ef2766b0bd3eaab8a,0,HkohTnpCI,False,,0,0,2小时前,[],,...,0,https://tvax1.sinaimg.cn/crop.0.0.1080.1080.18...,https://m.weibo.cn/u/6673451463?uid=6673451463,坤的白菜丝,32,9,False,,-1,


#### 0. 数据清洗（预处理）
由于数据入库的时候没有进行清洗，所以数据多出了很多没用的字段，需要先清洗掉

In [7]:
data.columns

Index(['_id', 'attitudes_count', 'bid', 'can_edit', 'cardid', 'comments_count',
       'content_auth', 'created_at', 'darwin_tags', 'edit_at', 'edit_count',
       'favorited', 'hide_flag', 'id', 'isLongText', 'is_imported_topic',
       'is_paid', 'mblog_vip_type', 'mblogtype', 'mid', 'more_info_type',
       'pending_approval_count', 'pic_ids', 'pic_types', 'pid', 'raw_text',
       'reposts_count', 'reward_exhibition_type', 'show_additional_indication',
       'source', 'sync_mblog', 'topic_id', 'user.avatar_hd',
       'user.badge.anniversary', 'user.badge.asiad_2018',
       'user.badge.bind_taobao', 'user.badge.cz_wed_2017', 'user.badge.dailv',
       'user.badge.dailv_2018', 'user.badge.denglong_2019',
       'user.badge.double11_2018', 'user.badge.dzwbqlx_2016',
       'user.badge.follow_whitelist_video', 'user.badge.fools_day_2016',
       'user.badge.fu_2019', 'user.badge.gongyi', 'user.badge.gongyi_level',
       'user.badge.hongbaofei_2019', 'user.badge.kpl_2018',
       'u

In [64]:
# Columns retained for the analysis; every other field of the raw documents is dropped.
in_columns = [
    # repost-level fields
    'attitudes_count',
    'comments_count',
    'reposts_count',
    'mid',
    'raw_text',
    'source',
    # flattened fields of the reposting user
    'user.description',
    'user.follow_count',
    'user.followers_count',
    'user.gender',
    'user.id',
    'user.mbrank',
    'user.mbtype',
    'user.profile_url',
    'user.profile_image_url',
    'user.screen_name',
    'user.statuses_count',
    'user.urank',
    'user.verified',
    'user.verified_reason',
]

In [65]:
data = data[in_columns]

In [66]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102313 entries, 0 to 102312
Data columns (total 20 columns):
attitudes_count           102313 non-null int64
comments_count            102313 non-null int64
reposts_count             102313 non-null int64
mid                       102313 non-null object
raw_text                  102313 non-null object
source                    102313 non-null object
user.description          102313 non-null object
user.follow_count         102313 non-null int64
user.followers_count      102313 non-null int64
user.gender               102313 non-null object
user.id                   102313 non-null int64
user.mbrank               102313 non-null int64
user.mbtype               102313 non-null int64
user.profile_url          102313 non-null object
user.profile_image_url    102313 non-null object
user.screen_name          102313 non-null object
user.statuses_count       102313 non-null int64
user.urank                102313 non-null int64
user.verified    

In [92]:
data.to_csv('caixukun.csv', index=False)

问题：
1. 蔡徐坤的微博转发是否存在假流量？
2. 真假流量所占的比例各有多少？
3. 假流量粉丝是如何生产出来的？
4. 真流量粉的粉丝画像

### 1. 蔡徐坤的微博转发是否存在假流量？

In [115]:
# 先来看看蔡徐坤的粉丝性别比例
fans_num = data['user.gender'].value_counts()
fans_num

m    93618
f     8695
Name: user.gender, dtype: int64

In [162]:
from pyecharts import Bar

# Translate the gender codes through the index instead of hard-coding ['男', '女'] by
# position: value_counts() orders by frequency, so positional labels silently swap
# whenever the majority gender changes.
fans_gender_labels = fans_num.index.map({'m': '男', 'f': '女'}).tolist()

bar = Bar("蔡徐坤粉丝性别比例初探", width=600, height=500)
# The record total in the series name is computed rather than hand-copied.
bar.add("(总数据{}条)".format(fans_num.sum()), fans_gender_labels, fans_num.values, is_stack=True,
        xaxis_label_textsize=20, yaxis_label_textsize=14, is_label_show=True)
bar

In [118]:
np.round(fans_num/fans_num.sum()*100, 2)

m    91.5
f     8.5
Name: user.gender, dtype: float64

In [120]:
data[data['user.gender']=='m'].sample(5)

Unnamed: 0,attitudes_count,comments_count,reposts_count,mid,raw_text,source,user.description,user.follow_count,user.followers_count,user.gender,user.id,user.mbrank,user.mbtype,user.profile_url,user.profile_image_url,user.screen_name,user.statuses_count,user.urank,user.verified,user.verified_reason
2270,0,0,0,4348038635531052,卷毛帅哥的自拍简直是太帅啦！//@Elvirababe-:再见啦千千//@AK47-HIAH...,红米Redmi,,0,1,m,7020364228,0,0,https://m.weibo.cn/u/7020364228?uid=7020364228,https://tvax4.sinaimg.cn/crop.160.0.640.640.18...,快乐追坤Z96406,30,2,False,
14667,0,0,0,4348319830485901,"Even anticipate discrete, I met the other thei...",Android,,0,1,m,6980837370,0,0,https://m.weibo.cn/u/6980837370?uid=6980837370,https://tvax1.sinaimg.cn/crop.0.65.169.169.180...,结愁肠百QfQ953,201,4,False,
91271,0,0,0,4348662274153156,善良的人，善良的心//@石头打瞌睡:#东方风云榜让世界看见蔡徐坤# [喵喵] #蔡徐坤的未...,Android,,0,1,m,7011848763,0,0,https://m.weibo.cn/u/7011848763?uid=7011848763,https://tvax3.sinaimg.cn/crop.0.0.640.640.180/...,AK战士phX674,60,3,False,
24223,0,0,0,4348391876258523,[吃瓜]//@蔡老板的心尖宠:#蔡徐坤[超话]#|#蔡徐坤的未完成# 用真心呵护小动物，感...,Android,,0,1,m,6827212466,0,0,https://m.weibo.cn/u/6827212466?uid=6827212466,https://tvax3.sinaimg.cn/default/images/defaul...,用户6827212466,243,4,False,
24765,0,0,0,4348374163394192,Four,Android,,0,1,m,7012476563,0,0,https://m.weibo.cn/u/7012476563?uid=7012476563,https://tvax2.sinaimg.cn/crop.0.10.640.640.180...,小坤的花3MB514,31,4,False,


### 2. 真假流量所占的比例各有多少？

In [216]:
# Rule-based filter for suspected fake reposts. A row is kept only when ALL hold:
#   - the account follows <= 5 users OR has <= 5 followers,
#   - the profile description is empty,
#   - the repost itself got no comments, likes or reposts,
#   - user.mbrank is 0.
few_connections = (data['user.follow_count'] <= 5) | (data['user.followers_count'] <= 5)
empty_description = data['user.description'] == ''
zero_interaction = (
    (data['comments_count'] == 0)
    & (data['attitudes_count'] == 0)
    & (data['reposts_count'] == 0)
)
mbrank_zero = data['user.mbrank'] == 0

data_fake = data[few_connections & empty_description & zero_interaction & mbrank_zero]
data_fake.sample(5)

Unnamed: 0,attitudes_count,comments_count,reposts_count,mid,raw_text,source,user.description,user.follow_count,user.followers_count,user.gender,user.id,user.mbrank,user.mbtype,user.profile_url,user.profile_image_url,user.screen_name,user.statuses_count,user.urank,user.verified,user.verified_reason
85984,0,0,0,4348635833440929,遇见你真好 好喜欢你呀,Android,,0,1,m,7017942798,0,0,https://m.weibo.cn/u/7017942798?uid=7017942798,https://tvax3.sinaimg.cn/crop.0.0.640.640.180/...,守护坤宝rGX399,13,2,False,
14659,0,0,0,4348319851483539,My eyes have seen and my ears have heard. ...,Android,,0,1,m,6974693897,0,0,https://m.weibo.cn/u/6974693897?uid=6974693897,https://tvax3.sinaimg.cn/crop.3.0.94.94.180/00...,花花世界cu1087,51,4,False,
10218,0,0,0,4348296446870440,With the wonder of your love， the sun above al...,Android,,0,1,m,7011842865,0,0,https://m.weibo.cn/u/7011842865?uid=7011842865,https://tvax4.sinaimg.cn/crop.0.0.640.640.180/...,绽放光芒ofM731,33,4,False,
78725,0,0,0,4348610562398099,身为偶像，以身作则。,Android,,0,1,m,6941108958,0,0,https://m.weibo.cn/u/6941108958?uid=6941108958,https://tvax1.sinaimg.cn/crop.0.0.690.690.180/...,葵妹威武36F539,64,3,False,
55694,0,0,0,4348399592999505,"Distance, //@小葵花迷糊:带着我们的爱好好长大吧[米奇比心]蔡徐坤",三星Galaxy NOTE III,,0,1,m,6940713794,0,0,https://m.weibo.cn/u/6940713794?uid=6940713794,https://tvax3.sinaimg.cn/crop.0.0.446.446.180/...,AK突突9TM962,17,3,False,


In [217]:
data_fake.shape

(95326, 20)

In [218]:
# Accounts whose screen name contains "用户" are treated as fake followers
# (the author's heuristic). Both thresholds here (> 5) are the complement of the
# "follow_count <= 5 OR followers_count <= 5" condition used to build data_fake, so
# the rows selected here are never already part of data_fake.
# Only the index labels of the matching rows are kept.
data_fake2_index = data[(data['user.follow_count']>5)&
                        (data['user.followers_count']>5)&
                        (data['user.screen_name'].str.contains('用户'))].index

In [220]:
# Combine the rule-based fake reposts with the screen-name-based ones.
# data_fake2_index holds index *labels*, so the rows are fetched with .loc; .iloc would
# treat them as positions and only happens to match while `data` has its default
# RangeIndex. Labels already present in data_fake are skipped, so re-running this cell
# does not append the same rows a second time.
new_fake_labels = data_fake2_index.difference(data_fake.index)
data_fake = pd.concat([data_fake, data.loc[new_fake_labels]])

In [221]:
data_fake.shape

(95397, 20)

In [222]:
# 取出真粉的转发
data_true = data.drop(data_fake.index)

In [223]:
data_true.shape

(6916, 20)

In [224]:
print('真粉丝转发数占总转发数的{}%'.format(np.round(data_true.shape[0]/data.shape[0]*100, 2)))
print('假粉丝转发数占总转发数的{}%'.format(np.round(data_fake.shape[0]/data.shape[0]*100, 2)))

真粉丝转发数占总转发数的6.76%
假粉丝转发数占总转发数的93.24%


In [225]:
bar = Bar("蔡徐坤真假流量的转发量", width = 600,height=500)
bar.add("(总数据102313条)", ['总转发量', '假粉丝转发量', '真粉丝转发量'], 
        [data.shape[0], data_fake.shape[0], data_true.shape[0]], is_stack=True, 
       xaxis_label_textsize=20, yaxis_label_textsize=14, is_label_show=True)
bar

In [226]:
real_fans_num = data_true.drop_duplicates(subset='user.id').shape[0]

In [227]:
bar = Bar("蔡徐坤真假流量的转发量与真实转发粉丝量(总数据102313条)", width = 600,height=500)
bar.add('', ['总转发量', '假粉丝转发量', '真粉丝转发量', '真实转发粉丝量'], 
        [data.shape[0], data_fake.shape[0], data_true.shape[0], real_fans_num], is_stack=True, 
       xaxis_label_textsize=20, yaxis_label_textsize=14, is_label_show=True, xaxis_rotate=20)
bar

In [228]:
print('真实转发粉丝量占总转发数的{}%'.format(np.round(real_fans_num/data.shape[0]*100, 2)))

真实转发粉丝量占总转发数的3.84%


-----------------吴青峰微博数据做对比-----------------

In [193]:
# Switch to the WuQingFeng database for the comparison data set.
# NOTE: this rebinds `db`, `repost` and `mon_data`, which previously pointed at the
# CaiXuKun database; cells above must not be re-run after this one without also
# re-running the original connection cell.
db = conn.get_database('WuQingFeng')  # WuQingFeng database

repost = db.get_collection('repost') # `repost` collection
mon_data = repost.find()  # query every record in this collection

In [194]:
wqf_data = json_normalize([comment for comment in mon_data])

In [195]:
wqf_data = wqf_data[in_columns]

In [196]:
wqf_data.shape

(10006, 20)

In [229]:
# Same fake/true split as applied to the CaiXuKun data, on the WuQingFeng reposts.
# Rule-based part: few connections, empty description, no interaction, mbrank 0.
wqf_data_fake = wqf_data[((wqf_data['user.follow_count']<=5)|(wqf_data['user.followers_count']<=5))&
                         (wqf_data['user.description']=='')&
                         (wqf_data['comments_count']==0)&
                         (wqf_data['attitudes_count']==0)&
                         (wqf_data['reposts_count']==0)&
                         (wqf_data['user.mbrank']==0)]

# Screen-name part: names containing "用户" among accounts not covered by the
# follow/follower threshold above.
wqf_data_fake2_index = wqf_data[(wqf_data['user.follow_count']>5)&
                                (wqf_data['user.followers_count']>5)&
                                (wqf_data['user.screen_name'].str.contains('用户'))].index
# The index holds labels, so select with .loc; .iloc would interpret them as positions
# and is only correct while wqf_data keeps its default RangeIndex.
wqf_data_fake = pd.concat([wqf_data_fake, wqf_data.loc[wqf_data_fake2_index]])
# Everything that is not flagged as fake counts as a genuine repost.
wqf_data_true = wqf_data.drop(wqf_data_fake.index)

In [230]:
print('吴青峰真粉丝转发数占总转发数的{}%'.format(np.round(wqf_data_true.shape[0]/wqf_data.shape[0]*100, 2)))
print('吴青峰假粉丝转发数占总转发数的{}%'.format(np.round(wqf_data_fake.shape[0]/wqf_data.shape[0]*100, 2)))

吴青峰真粉丝转发数占总转发数的96.52%
吴青峰假粉丝转发数占总转发数的3.48%


In [231]:
bar = Bar("吴青峰真假流量的转发量", width = 600,height=500)
bar.add("(总数据10006条)", ['总转发量', '假粉丝转发量', '真粉丝转发量'], 
        [wqf_data.shape[0], wqf_data_fake.shape[0], wqf_data_true.shape[0]], is_stack=True,
        xaxis_label_textsize=20, yaxis_label_textsize=14, is_label_show=True)
bar

In [232]:
wqf_real_fans_num = wqf_data_true.drop_duplicates(subset='user.id').shape[0]

bar = Bar("吴青峰真假流量的转发量与真实转发粉丝量(总数据10006条)", width = 600,height=500)
bar.add('', ['总转发量', '假粉丝转发量', '真粉丝转发量', '真实转发粉丝量'], 
        [wqf_data.shape[0], wqf_data_fake.shape[0], wqf_data_true.shape[0], 
         wqf_real_fans_num], is_stack=True, 
        xaxis_label_textsize=20, yaxis_label_textsize=14, is_label_show=True, xaxis_rotate=20)
bar

In [237]:
wqf_data.sample(5)

Unnamed: 0,attitudes_count,comments_count,reposts_count,mid,raw_text,source,user.description,user.follow_count,user.followers_count,user.gender,user.id,user.mbrank,user.mbtype,user.profile_url,user.profile_image_url,user.screen_name,user.statuses_count,user.urank,user.verified,user.verified_reason
6149,1,0,0,4347751288499206,爱母亲一生一世,红米Redmi,,32,31,m,5676300325,0,0,https://m.weibo.cn/u/5676300325?uid=5676300325,https://tvax1.sinaimg.cn/crop.0.0.996.996.180/...,只抓猪猪打,4,4,False,
3805,23,4,11,4347895002290957,哭//@囤仔:今日催泪弹,iPhone客户端,公主号：饭饭哒 混干皮，会写功课会发壁纸ヾ(･ω･*)ﾉ,435,717510,f,2273529342,6,12,https://m.weibo.cn/u/2273529342?uid=2273529342,https://tvax1.sinaimg.cn/crop.11.0.728.728.180...,饭饭饭饭哒,3380,47,False,
6141,0,0,0,4347750722696051,转发微博,iPhone客户端,诗酒趁年华.,458,142,f,5846588842,0,0,https://m.weibo.cn/u/5846588842?uid=5846588842,https://tvax4.sinaimg.cn/crop.0.0.640.640.180/...,阿羽想当锦鲤大王,1350,9,False,
760,0,0,0,4348436402625735,今日的晚安曲，晚安。[心]@吳青峰,vivo AI智慧拍照X21,诸行无常，初心不易。,173,17,f,7026562408,0,0,https://m.weibo.cn/u/7026562408?uid=7026562408,https://tvax3.sinaimg.cn/crop.0.0.996.996.180/...,Star的一纸情书,3,4,False,
6493,0,0,0,4347738961856732,我们就这样 各自奔天涯,二月🐑iPhone XS Max,你是我小心维护的梦,255,108,f,2055723847,6,12,https://m.weibo.cn/u/2055723847?uid=2055723847,https://tvax4.sinaimg.cn/crop.0.0.1080.1080.18...,Surisuria,10184,47,False,


In [239]:
data.sample(5)

Unnamed: 0,attitudes_count,comments_count,reposts_count,mid,raw_text,source,user.description,user.follow_count,user.followers_count,user.gender,user.id,user.mbrank,user.mbtype,user.profile_url,user.profile_image_url,user.screen_name,user.statuses_count,user.urank,user.verified,user.verified_reason
78093,0,0,0,4348585275987130,"dove sei passata,",Flyme,,0,1,m,7011819483,0,0,https://m.weibo.cn/u/7011819483?uid=7011819483,https://tvax4.sinaimg.cn/crop.0.0.640.640.180/...,思念坤坤rWM833,67,3,False,
48412,0,0,0,4348425962842699,[嘻嘻]//@KUN的小喵咪:#东方风云榜让世界看见蔡徐坤#,Android,,1,1,f,6589900139,0,0,https://m.weibo.cn/u/6589900139?uid=6589900139,https://tvax1.sinaimg.cn/crop.0.0.100.100.180/...,偎率把啦s,568,4,False,
47984,0,0,0,4348404173745759,从现在开始努力，一切都来得及,Android,,0,1,m,7010929412,0,0,https://m.weibo.cn/u/7010929412?uid=7010929412,https://tvax4.sinaimg.cn/crop.0.0.99.99.180/00...,音乐才子asS736,29,4,False,
88312,0,0,0,4348631156993811,When you leave I'm begging you not to go.蔡徐坤 @蔡徐坤,HUAWEI P10,,60,1,m,6877062416,0,0,https://m.weibo.cn/u/6877062416?uid=6877062416,https://tvax3.sinaimg.cn/crop.0.0.100.100.180/...,蔡小葵_cya56,189,4,False,
17507,0,0,0,4348333834721683,#东方风云榜让世界看见蔡徐坤#It’s not about the salary,Android,,61,1,m,6862227587,0,0,https://m.weibo.cn/u/6862227587?uid=6862227587,https://tvax2.sinaimg.cn/crop.0.0.100.100.180/...,蔡小葵_cvr19,36,4,False,


### 3. 假流量粉丝是如何生产出来的？

In [246]:
data_fake_gender = data_fake.drop_duplicates(subset='user.id')['user.gender'].value_counts()
data_fake_gender

m    38969
f     1869
Name: user.gender, dtype: int64

In [244]:
data_fake[data_fake['user.gender']=='f'].sample(5)

Unnamed: 0,attitudes_count,comments_count,reposts_count,mid,raw_text,source,user.description,user.follow_count,user.followers_count,user.gender,user.id,user.mbrank,user.mbtype,user.profile_url,user.profile_image_url,user.screen_name,user.statuses_count,user.urank,user.verified,user.verified_reason
64180,0,0,0,4348440278763521,I just wanna talk to u don't be afraid//@i坤555...,Android,,0,1,f,6730864661,0,0,https://m.weibo.cn/u/6730864661?uid=6730864661,https://tvax3.sinaimg.cn/crop.0.0.100.100.180/...,小葵花籽_包,175,4,False,
79757,0,0,0,4348610688763958,转发微博,华为手机 畅享玩不停,,61,1,f,6791332699,0,0,https://m.weibo.cn/u/6791332699?uid=6791332699,https://tvax2.sinaimg.cn/crop.0.0.131.131.180/...,野的像_狗,22,4,False,
65164,0,0,0,4348430597589392,You scratch my back and I'll scratch yours.//@...,Android,,53,1,f,6805019442,0,0,https://m.weibo.cn/u/6805019442?uid=6805019442,https://tvax1.sinaimg.cn/crop.0.0.180.180.180/...,xx__xmmt,349,4,False,
75228,0,0,0,4348576912201314,[好喜欢]//@蔡徐坤内人:[心][心]//@坤坤的公主群:#东方风云榜让世界看见蔡徐坤# ...,Android,,0,1,f,6619935138,0,0,https://m.weibo.cn/u/6619935138?uid=6619935138,https://tvax3.sinaimg.cn/default/images/defaul...,用户6619935138,255,4,False,
87263,0,0,0,4348651100519444,蔡徐坤你的美貌在我心里收藏。#东方风云榜让世界看见蔡徐坤#好喜欢呀@蔡徐坤,前置双摄vivo X9s,,59,2,f,6853156261,0,0,https://m.weibo.cn/u/6853156261?uid=6853156261,https://tvax2.sinaimg.cn/crop.0.0.179.179.180/...,我和_君莫笑晕在厕所,40,4,False,


In [251]:
# Derive the labels from the gender codes in the index instead of assuming that
# value_counts() lists 'm' first, and compute the account total shown in the series
# name rather than hard-coding it.
fake_gender_labels = data_fake_gender.index.map({'m': '男', 'f': '女'}).tolist()

bar = Bar("蔡徐坤假粉丝性别比例", width=600, height=500)
bar.add("(假粉丝总数为{})".format(data_fake_gender.sum()), fake_gender_labels,
        data_fake_gender.values, is_stack=True,
        xaxis_label_textsize=20, yaxis_label_textsize=14, is_label_show=True)
bar

In [312]:
# Share of male accounts among the de-duplicated fake accounts, computed from
# data_fake_gender instead of hand-copied totals so it stays in sync with the data.
data_fake_gender['m'] / data_fake_gender.sum()

0.954233801851217

In [252]:
data_fake['raw_text'].value_counts()

转发微博                                                                                                                                         429
I am only waiting for love to give myself up at last into his hands.                                                                         375
想你//@蔡徐坤的南岸末阴大小姐:#东方风云榜让世界看见蔡徐坤# /#蔡徐坤的未完成#祝千千在新家能快快乐乐 健健康康的@蔡徐坤                                                                             289
我心悦你//@蔡徐坤的南岸末阴大小姐:#东方风云榜让世界看见蔡徐坤# /#蔡徐坤的未完成#祝千千在新家能快快乐乐 健健康康的@蔡徐坤                                                                           288
爱你//@蔡徐坤的南岸末阴大小姐:#东方风云榜让世界看见蔡徐坤# /#蔡徐坤的未完成#祝千千在新家能快快乐乐 健健康康的@蔡徐坤                                                                             278
花花世界静守己心蔡徐坤未来可期！//@超超超超爱蔡蔡的思思:[爱你]                                                                                                           249
As much as I should                                                                                                               

In [255]:
fake_source = data_fake['source'].value_counts()[:10]

In [265]:
bar = Bar("蔡徐坤假粉丝Top10转发设备", width = 600,height=600)
bar.add("", fake_source.index, fake_source.values, is_stack=True, 
       xaxis_label_textsize=11, yaxis_label_textsize=14, is_label_show=True, xaxis_rotate=30)
bar

In [267]:
data_fake['user.follow_count'].mean()

3.4412612555950397

In [269]:
data_fake['user.followers_count'].mean()

1.04576663836389

In [275]:
data_fake_sample = data_fake.sample(5)

In [276]:
data_fake_sample['user.screen_name']

21846       蓬蓬坤8Bd528
80516       蓝玫瑰jov890
55689    用户6994230787
37178      从不认输pmb498
11486      纵骋横驰UcL978
Name: user.screen_name, dtype: object

In [277]:
data_fake_sample['user.profile_image_url'].values

array(['https://tvax3.sinaimg.cn/crop.0.0.640.640.180/007ExdLSly8g0kfgzq276j30hs0hsq4h.jpg',
       'https://tvax2.sinaimg.cn/crop.0.0.640.640.180/007Ezlmqly8g0kbtdsc32j30ht0hsdha.jpg',
       'https://tvax2.sinaimg.cn/crop.0.0.200.200.180/007Dl4VZly8g04u0faipsj305k05kjrg.jpg',
       'https://tvax4.sinaimg.cn/crop.0.0.640.640.180/007EEc68ly8g0l9fjqh0xj30hs0hs0tr.jpg',
       'https://tvax2.sinaimg.cn/crop.79.0.188.188.180/007CtWGgly8fzqpzemlkzj309m058dfq.jpg'],
      dtype=object)

In [281]:
data_fake.sample(5)['user.screen_name']

9413       坤色坤香gxu584
3347        慈祥纽_tdp10
15825    用户6503593711
28358       怀遇不n4D084
96873      坤也可爱wuv340
Name: user.screen_name, dtype: object

In [279]:
data_fake['user.screen_name'].str.contains('蔡|坤|葵|kun').sum()

41766

In [280]:
data_fake.shape[0]

95397

In [283]:
data_fake['user.statuses_count'].mean()

72.4942503433022

### 4. 真流量粉的粉丝画像

In [285]:
data_true.sample(5)

Unnamed: 0,attitudes_count,comments_count,reposts_count,mid,raw_text,source,user.description,user.follow_count,user.followers_count,user.gender,user.id,user.mbrank,user.mbtype,user.profile_url,user.profile_image_url,user.screen_name,user.statuses_count,user.urank,user.verified,user.verified_reason
76048,0,0,0,4348565809642641,#东方风云榜让世界看见蔡徐坤# 遇见你的那天，我就没想过要分开。@蔡徐坤,HUAWEI Mate 10,小号轮博，互粉呀，坤坤(◍ ´꒳` ◍),251,291,f,6505180919,0,0,https://m.weibo.cn/u/6505180919?uid=6505180919,https://tvax1.sinaimg.cn/crop.0.0.996.996.180/...,葵葵吃芒果冰呀,13518,19,False,
68749,0,0,0,4348485031882927,想你了,Android,我与你隔着长风深谷 近不得 退不舍 ​​,291,66,f,6093210679,3,12,https://m.weibo.cn/u/6093210679?uid=6093210679,https://tvax2.sinaimg.cn/crop.0.0.996.996.180/...,善良小菜最好命,15986,14,False,
2162,0,0,0,4347997343648508,转发微博,vivo智能手机,,655,97,f,6253839509,0,0,https://m.weibo.cn/u/6253839509?uid=6253839509,https://tvax1.sinaimg.cn/crop.0.0.996.996.180/...,蔡徐坤ikun24298586,1280,12,False,
42740,0,0,0,4348419025500369,@蔡徐坤 我永远支持你！我们一起拿下 #明星势力榜# 第一名！,明星势力榜,,555,72,f,6575015283,0,0,https://m.weibo.cn/u/6575015283?uid=6575015283,https://tvax4.sinaimg.cn/crop.0.0.996.996.180/...,女王范的成全16,274,9,False,
77618,0,0,0,4348610940850719,#东方风云榜让世界看见蔡徐坤#@蔡徐坤 我永远支持你！我们一起拿下 #明星势力榜# 第一名！,明星势力榜,,97,33,f,5635145902,0,0,https://m.weibo.cn/u/5635145902?uid=5635145902,https://tvax1.sinaimg.cn/crop.0.0.996.996.180/...,坤坤的小咪喵,2548,4,False,


In [287]:
data_true_gender = data_true.drop_duplicates(subset='user.id')['user.gender'].value_counts()
data_true_gender

f    3287
m     639
Name: user.gender, dtype: int64

In [291]:
# Derive the labels from the gender codes in the index: value_counts() orders by
# frequency (here 'f' comes first), so a hand-ordered label list breaks as soon as the
# ordering changes. The account total in the series name is computed as well.
true_gender_labels = data_true_gender.index.map({'m': '男', 'f': '女'}).tolist()

bar = Bar("蔡徐坤真粉丝性别比例", width=600, height=500)
bar.add("(真粉丝总数为{})".format(data_true_gender.sum()), true_gender_labels,
        data_true_gender.values, is_stack=True,
        xaxis_label_textsize=20, yaxis_label_textsize=14, is_label_show=True)
bar

In [293]:
data_true['raw_text'].value_counts()

转发微博                                                                                                                                                                    1045
@蔡徐坤 我永远支持你！我们一起拿下 #明星势力榜# 第一名！                                                                                                                                          622
#东方风云榜让世界看见蔡徐坤#                                                                                                                                                           73
@蔡徐坤  我在#明星ALL榜[超话]#上为你加油啦，你是我今生唯一的执著哦。#蔡徐坤[超话]# 棒棒哒！快来为TA应援吧                                                                                                             50
//@蔡徐坤工作室:#蔡徐坤[超话]#[给你小心心]#蔡徐坤的未完成#之宠物医院终于步入尾声，在短暂的相处时间里，因为工作忙碌无法养育小动物的@蔡徐坤 也获得了片刻的慰藉，感谢千千给我们带来了一段难忘的回忆[心]#蔡徐坤 ONE#                                                       42
#东方风云榜让世界看见蔡徐坤# [心]#蔡徐坤的未完成# [太开心]//@August-小漂亮的万花筒:#东方风云榜让世界看见蔡徐坤# [心]#蔡徐坤的未完成# 哈哈哈哈哈哈，宝贝让千千转圈圈太可爱了[笑哈哈][笑哈哈][笑哈哈]                    

In [294]:
true_source = data_true['source'].value_counts()[:10]

In [295]:
bar = Bar("蔡徐坤真粉丝Top10转发设备", width = 600,height=600)
bar.add("", true_source.index, true_source.values, is_stack=True, 
       xaxis_label_textsize=11, yaxis_label_textsize=14, is_label_show=True, xaxis_rotate=30)
bar

In [296]:
data_true['user.follow_count'].mean()

222.0597165991903

In [298]:
data_true['user.followers_count'].mean()

178.9480913823019

In [302]:
data_true.sample(5)['user.screen_name']

90060    complemehtht_16690
51426                 82号甜七
75569             August菜包包
33191           薄荷般的夏天_你的时代
92003          Amygirl_P的坤坤
Name: user.screen_name, dtype: object

In [303]:
data_true['user.screen_name'].str.contains('蔡|坤|葵|kun').sum()

3153

In [304]:
data_true.shape[0]

6916

In [305]:
# 绘制蔡徐坤真粉丝的简介词云图
import jieba
from collections import Counter
from pyecharts import WordCloud

# Register the singer's name with jieba's dictionary.
jieba.add_word('蔡徐坤')

# Load the stop-word list, one entry per line. The context manager closes the file
# handle (the bare open() left it dangling), and the explicit encoding makes the read
# independent of the platform default.
# NOTE(review): assumes stopwords.txt is UTF-8 encoded — confirm.
with open('stopwords.txt', encoding='utf-8') as stopwords_file:
    swords = [line.strip() for line in stopwords_file]

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/mc/k6p_zt453w770h63024z__vw0000gn/T/jieba.cache
Loading model cost 1.634 seconds.
Prefix dict has been built succesfully.


In [308]:
def plot_word_cloud(data, swords, columns, top_n=100):
    """Build a pyecharts word cloud from one text column of a DataFrame.

    Every row is segmented with jieba on its own, so a token can never span the end
    of one record and the start of the next (joining all rows into one string without
    a separator allowed that). Missing values are skipped instead of raising.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the text column.
    swords : iterable of str
        Stop words to exclude from the cloud.
    columns : str
        Name of the single text column to analyse.
    top_n : int, optional
        Number of most frequent words to plot (default 100).

    Returns
    -------
    WordCloud
        The configured pyecharts chart; words shorter than two characters and stop
        words are excluded.
    """
    # Set membership keeps the per-token stop-word check constant-time.
    stop_words = set(swords)

    word_counts = Counter(
        word
        for text in data[columns].dropna().astype(str)
        for word in jieba.cut(text)
        if len(word) > 1 and word not in stop_words
    )

    # most_common() already returns the entries sorted by descending count.
    top_words = word_counts.most_common(top_n)
    words = [word for word, _ in top_words]
    counts = [count for _, count in top_words]

    wordcloud = WordCloud(width=1300, height=620)
    wordcloud.add("", words, counts, word_size_range=[20, 100])
    return wordcloud

In [309]:
plot_word_cloud(data=data_true, swords=swords, columns='user.description')

In [310]:
plot_word_cloud(data=data_true, swords=swords, columns='raw_text')