from future import division import numpy as np import pandas as pd import matplotlib.pyplot as plt from numpy.random import randn from pandas import Series,DataFrame from datetime import datetime import xlrd,openpyxl
xlsx_file = pd.ExcelFile('/Users/eason1986/Documents/数据科学/excelRun/test1.xlsx')
All = xlsx_file.parse('All')
#删除无用列,保留有用列 d1 = All.drop(All.columns[:11],axis = 1,inplace = False) # colome 的axis = 1 删除0-11列 All = d1.drop(d1.columns[-1],axis = 1,inplace = False) #删除最后一列
head = All.head() #显示前五行 oriLen = len(All) # 未处理重复前表中记录长度
preLen = All[All.duplicated() == True].index[:20]
#删除掉重复的行,在原值上直接改,inplace = True All.drop_duplicates(inplace = True) aftLen = len(All)
All[u'转发数'][All[u'转发数'] == u'转发']= '0' All[u'评论数'][All[u'评论数'] == u'评论'] = '0' All[u'点赞数'][All[u'点赞数'] == u'赞'] = '0' # All[u'点赞数'].replace(u'赞',0)
#数据透视
All[u'转发数'] = All[u'转发数'].astype('int64') All[u'评论数'] = All[u'评论数'].astype('int64') All[u'点赞数'] = All[u'点赞数'].astype('int64')
All.to_excel('All.xlsx',index = False)
#数据透视表
All_pivot = All.pivot_table(values = [u'转发数',u'评论数',u'点赞数',u'微博内容'],index = [u'用户名'],
aggfunc = {u'转发数':np.sum,u'评论数':np.sum,u'点赞数':np.sum,u'微博内容':np.size})
All_pivot.rename(columns = {u'微博内容':u'当月总微博数'},inplace = True)
All_pivot.to_excel('All_pivot.xlsx')
sf = xlsx_file.parse('sf')
sfweibo = xlsx_file.parse('sfweibo')
sf[u'省份前两字'] = np.nan
for i in range(len(sf[u'省份名'])): sf[u'省份前两字'][i] = sf[u'省份名'][i][:2]
sfweibo[u'省份前两字'] = np.nan
for i in range(len(sfweibo[u'省份名'])): sfweibo['省份前两字'][i] = sfweibo[u'省份名'][i][:2]
sf.to_excel('sf.xlsx',index = False) sfweibo.to_excel('sfweibo.xlsx',index = False)
#连接两表 sf_sfweibo = sf.merge(sfweibo,on = u'省份前两字')
#获取连接后表格中需要的字段名,并重新排列
sf_sfweibo1 = sf_sfweibo.iloc[:,[4,1,2]]
sf_sfweibo1.to_excel('sf_sfweibo.xlsx',index = False)
sf_sfweibo = sf_sfweibo1 sf_sfweibo_All_pivot = pd.merge(sf_sfweibo,All_pivot,left_on = u'微博用户名', right_on = u'用户名',right_index = True)
sf_sfweibo_All_pivot.to_excel('sf_sfweibo_All_pivot.xlsx',index = False)
base = xlsx_file.parse('base_info')
sf_sfweibo_All_pivot_base = base.merge(sf_sfweibo_All_pivot,left_on = u'昵称', right_on = '微博用户名') ssapb = sf_sfweibo_All_pivot_base
ssapb.rename(columns = {u'当月总微博数_x':u'当月总微博数'},inplace = True)
ssapb = ssapb.drop([u'昵称',u'当月总微博数_y'],axis = 1)
ssapb[u'当月原创数'] = ssapb[u'当月总微博数'] - ssapb[u'当月转发数']
linkfix = '?is_ori=1&is_forward=1&is_text=1&is_pic=1&is_video=1&is_music=1&is_
article=1&key_word=&start_time=2017-05-01&end_time=2017-05-31&is_search=1&is_searchadv=1#_0'
ssapb[u'当月博文网址'] = ssapb[u'主页链接'] + linkfix
allfix = '?profile_ftype=1&is_all=1#_0' ssapb[u'全部博文网址'] = ssapb[u'主页链接'] + linkfix
ssapb[u'篇均点赞'] = ssapb[u'点赞数']/ssapb[u'当月总微博数'] ssapb[u'篇均转发'] = ssapb[u'转发数']/ssapb[u'当月总微博数'] ssapb[u'篇均评论'] = ssapb[u'评论数']/ssapb[u'当月总微博数']
ssapb.to_excel('ssapb.xlsx',index = False)
gb = All.groupby(u'用户名') gb1 = gb.size() gbindex = gb1.index
sortAllf = All.sort_values(by = [u'用户名',u'转发数'],ascending = [True,False]) sortAllc = All.sort_values(by = [u'用户名',u'评论数'],ascending = [True,False]) sortAlll = All.sort_values(by = [u'用户名',u'点赞数'],ascending = [True,False])
mm = (sortAllc,sortAllf,sortAlll)
All_h = pd.DataFrame(np.arange(136).reshape(34,4),columns = ['fh','ch','lh','max_hdd'],index = gbindex) fh = [] ch = [] lh = [] max_hdd = []
for j in range(len(mm)): for i in gbindex: tempdf = mm[j][mm[j][u'用户名'] == i] tempdf['hdd'] = tempdf[u'转发数']+ tempdf[u'评论数']+ tempdf[u'点赞数'] max_hdd.append(tempdf['hdd'].max()) tempdf['numf'] = range(len(tempdf)) if j == 0: a = len(tempdf[tempdf[u'转发数'] >= tempdf['numf']+1]) fh.append(a)
elif j == 1:
b = len(tempdf[tempdf[u'评论数'] >= tempdf['numf']+1])
ch.append(b)
else:
c = len(tempdf[tempdf[u'点赞数'] >= tempdf['numf']+1])
lh.append(c)
All_h['fh'] = fh All_h['ch'] = ch All_h['lh'] = lh
All_h['max_hdd'] = max_hdd[:34]
#插入一个综合h指数,该指数是转发/评论/点赞h指数三个的均值
All_h.insert(3,'HS',All_h.iloc[:,:3].mean(1))
All_h.rename(columns = {'fh':u'转发h指数','ch':u'评论h指数','lh':u'点赞h指数',
'HS':u'综合h指数','max_hdd':u'单篇最大互动度'},inplace = True)
ssapb_All_h = pd.merge(ssapb,All_h,left_on = u'微博用户名',right_on = u'用户名',right_index = True)
ssapb_All_h[u'原创率'] = ssapb_All_h[u'当月原创数']/ssapb_All_h[u'当月总微博数']
ssapb_All_h.to_excel('ssapb_All_h.xlsx',index = False)
f1 = ssapb_All_h.loc[:,[u'综合h指数',u'转发h指数',u'评论h指数',u'点赞h指数']]
corr1 = f1.corr()
corr1.to_excel('corr1.xlsx')
f2 = ssapb_All_h.loc[:,[u'综合h指数',u'转发数',u'评论数',u'点赞数',u'篇均转发',u'篇均评论',u'篇均点赞']] corr2 = f2.corr() corr2.to_excel('corr2.xlsx')
f3 = ssapb_All_h.loc[:,[u'综合h指数',u'原创率',u'粉丝数',u'微博总数',u'单篇最大互动度']] corr3 = f3.corr() corr3.to_excel('corr3.xlsx')
#重新排序列 aa = ssapb_All_h.iloc[:,[8,9,10,5,15,16,0,1,2,3,4,6,14,7,11,12,13,24,20,21,22,23,17,18,19,25]] aa.to_excel('finally.xlsx')
f = lambda x:'%.4f'% x
aa.ix[:,21:] = aa.ix[:,21:].applymap(f)
aa.ix[:,21:] = aa.ix[:,21:].astype('float64')
f1 = lambda x:'%.2f%%' % (x*100) aa[[u'原创率']] = aa[[u'原创率']].applymap(f1)
aa.to_excel('finally1.xlsx',index = False)
aa.sort_values(by = u'综合h指数',ascending = False,inplace = True) aa['rank'] = np.arange(34)+1
aa['rank'] = aa['rank'].astype('string_') aa[u'综合h指数'] = aa[u'综合h指数'].astype('string_')
del aa['rank']
aa[u'综合h指数'] = aa[u'综合h指数'].astype('float64') aa.to_excel('finally2.xlsx',index = False)