### 数据清洗
#### 一般而言是将数据转化为数据框样式
#### 基本逻辑
- 清洗错误行
- 正确分列
- 提取所要分析的内容
- 介绍通过按行、chunk方式对大规模数据进行预处理

#### 同时考虑分列符和引用符
- 分列符/分隔符：sep，delimiter
- 引用符：quotechar

> A,B,C
> 1,"2,3",4
> 5,6,7

这里面逗号是分隔符，双引号是引用符

In [None]:
# NOTE: the sample data file was not available while this chapter was being
# studied, so a FileNotFoundError raised by this cell can be ignored.
with open("./data/ows_tweets_sample.txt", 'r') as f:
    # Iterating a text file yields its lines, newline characters included.
    chunk = list(f)

In [None]:
import csv

# Parse the raw lines as comma-separated records; double quotes protect
# commas that appear inside a field.
lines_csv = csv.reader(chunk, delimiter=',', quotechar='"')
parsed_rows = list(lines_csv)
print(len(parsed_rows))
# The reader is exhausted at this point: next(lines_csv) would raise
# StopIteration.

In [None]:
import pandas as pd

# Load the same file into a DataFrame, using the same separator and quote
# character as the csv.reader call, then display the first three rows.
df = pd.read_csv(
    "./data/ows_tweets_sample.txt",
    sep=',',
    quotechar='"',
)
df.iloc[:3]

In [None]:
# Value of the 'Text' column in the first row.
df['Text'][0]

In [None]:
# 'From User' values of the first ten rows.
df['From User'].head(10)

#### CSV和pandas解析数据方式比较

csv模块专门用来读取csv数据；csv.reader返回一个迭代器，逐行产出由各字段组成的列表，且只能遍历一次

pandas提供了DataFrame数据结构（二维带标签、类似于表格）
pandas的read_csv()方法则将整个csv文件自动转换成DataFrame，并提供多种数据操作方法

#### 统计发帖数量所对应的人数的分布

In [None]:
from collections import defaultdict

# defaultdict(int) creates a missing key with the value 0 on first access,
# so every user can be incremented without an existence check.
data_dict = defaultdict(int)
for user_name in df['From User']:
    data_dict[user_name] = data_dict[user_name] + 1

In [None]:
# data_dict.items() is a view of (user, tweet count) pairs; turn it into a
# list so that the first five pairs can be sliced out.
user_count_pairs = list(data_dict.items())
user_count_pairs[:5]

##### output
[('Anonops_Cop', 1),

 ('KittyHybrid', 1),
 
 ('nerdsherpa', 2),
 
 ('hamudistan', 1),
 
 ('kl_knox', 1)]

In [None]:
import matplotlib.pyplot as plt

plt.rcParams.update({
    # Sans-serif font used so that Chinese axis labels render.
    'font.sans-serif': ['Microsoft YaHei'],
    # Needed for the minus sign to render; the original author notes that
    # this setting does not work together with ['SimHei'].
    'axes.unicode_minus': False,
})

plt.style.use('ggplot')  # apply the ggplot plotting style

In [None]:
# Histogram of the values of data_dict, i.e. the number of tweets posted by
# each user.
plt.hist(data_dict.values())
# Put both axes (y first, then x) on a logarithmic scale.
for set_scale in (plt.yscale, plt.xscale):
    set_scale('log')
# Axis labels and their font size.
plt.xlabel(u'发帖数', fontsize=20)
plt.ylabel(u'人数', fontsize=20)
plt.show()

##### output
![image.png](attachment:image.png)

In [None]:
# Count how many users share each tweet count:
# {number of tweets posted: number of users}.
tweet_dict = defaultdict(int)
for n_tweets in data_dict.values():
    tweet_dict[n_tweets] = tweet_dict[n_tweets] + 1

# Split the mapping into parallel lists and scatter them on log-log axes;
# the 'ro' format string draws red circle markers without a connecting line.
tweet_counts = list(tweet_dict.keys())
user_counts = list(tweet_dict.values())
plt.loglog(tweet_counts, user_counts, 'ro')

plt.xlabel(u'推特数', fontsize=20)
plt.ylabel(u'人数', fontsize=20)
plt.show()

##### output
![image.png](attachment:image.png)

In [None]:
import numpy as np
import statsmodels.api as sm

def powerPlot(d_value, d_freq, color, marker):
    """Scatter a binned distribution on log-log axes with an OLS power-law fit.

    Every frequency is incremented by one so that empty bins do not produce
    log(0), then normalised to a probability. A straight line is fitted to
    log(probability) against log(value) with ordinary least squares and
    drawn over the scatter; the slope and R-squared are written onto the
    current axes.

    Args:
        d_value: x-axis values (e.g. bin centres). They should be positive
            because their logarithm is taken.
        d_freq: raw frequency for each entry of ``d_value``.
        color: colour of the scatter markers.
        marker: marker style of the scatter markers.
    """
    # Add-one smoothing avoids zero frequencies.
    smoothed = [freq + 1 for freq in d_freq]
    # The total is computed once; summing inside the comprehension would
    # repeat the same O(n) work for every element.
    total = sum(smoothed)
    d_prob = [float(freq) / total for freq in smoothed]

    # Work in log space, where a power law is a straight line.
    x = np.log(d_value)
    y = np.log(d_prob)

    # add_constant prepends a column of ones so the regression has an
    # intercept; y is the dependent variable, xx the design matrix.
    xx = sm.add_constant(x, prepend=True)
    res = sm.OLS(y, xx).fit()
    # With the constant prepended, the parameters are (intercept, slope).
    constant, beta = res.params
    # Coefficient of determination: the closer to 1, the better the fit.
    r2 = res.rsquared

    # Observed probabilities as unconnected markers.
    plt.plot(d_value, d_prob, linestyle='', color=color, marker=marker)

    # Fitted line, mapped back from log space with exp().
    plt.plot(d_value, np.exp(constant + x * beta), "red")

    plt.xscale('log')
    plt.yscale('log')

    # Annotate the plot with the slope and the goodness of fit.
    plt.text(max(d_value) / 2, max(d_prob) / 10,
             r'$\beta$ = ' + str(round(beta, 2)) + '\n' + r'$R^2$ = ' + str(round(r2, 2)), fontsize=20)

In [None]:
# Bin the tweets-per-user counts into 15 bins: histo holds the number of
# users in each bin and bin_edges the 16 bin boundaries.
histo, bin_edges = np.histogram(list(data_dict.values()), bins=15)
# Centre of each bin: midpoint of its lower and upper edge.
bin_center = (bin_edges[:-1] + bin_edges[1:]) / 2
powerPlot(bin_center, histo, 'r', '^')

plt.ylabel(u'概率', fontsize=20)
plt.xlabel(u'推特数', fontsize=20)
plt.show()

##### output
![image.png](attachment:image.png)

In [None]:
import statsmodels.api as sm
from collections import defaultdict
import numpy as np

def powerPlot2(data):
    """Plot the empirical distribution of ``data`` on log-log axes with an OLS fit.

    Counts how often each distinct value occurs, converts the counts into
    probabilities, fits a straight line to log(probability) against
    log(value + 1), draws the scatter and the fitted curve, annotates the
    slope and R-squared, and shows the figure.

    Args:
        data: iterable of non-negative numbers, e.g. tweets per user.
    """
    # Frequency table: {value: number of occurrences}.
    d_table = defaultdict(int)
    for k in data:
        d_table[k] += 1
    observed = sorted(d_table)

    # Look the counts up with the ORIGINAL keys. Looking them up after the
    # +1 shift below would pair every value with the count of its neighbour
    # and insert missing keys into the defaultdict with a count of 0.
    # The +1 on the counts is add-one smoothing, as in powerPlot.
    d_freq = [d_table[k] + 1 for k in observed]
    # Shift the values by one so that a value of 0 does not produce log(0).
    d_value = [k + 1 for k in observed]
    total = sum(d_freq)  # computed once rather than once per element
    d_prob = [float(freq) / total for freq in d_freq]

    x = np.log(d_value)
    y = np.log(d_prob)
    # Prepend a column of ones so the regression has an intercept.
    xx = sm.add_constant(x, prepend=True)
    res = sm.OLS(y, xx).fit()
    constant, beta = res.params  # (intercept, slope)
    r2 = res.rsquared  # goodness of fit

    plt.plot(d_value, d_prob, 'ro')  # observed probabilities
    plt.plot(d_value, np.exp(constant + x * beta), "red")  # fitted curve
    plt.xscale('log')
    plt.yscale('log')
    plt.text(max(d_value) / 2, max(d_prob) / 5,
             'Beta = ' + str(round(beta, 2)) + '\n' + 'R squared = ' + str(round(r2, 2)))

    plt.title('Distribution')
    plt.ylabel('P(K)')
    plt.xlabel('K')
    plt.show()
    

In [None]:
# Plot the distribution of tweets per user together with its OLS fit.
powerPlot2(data_dict.values())

##### output
![image.png](attachment:image.png)

### 幂律分布图

在图表上通常使用双对数坐标轴，因为幂律分布在双对数坐标下呈现为一条直线。

通常有两个参数：指数（alpha）和最小阈值（xmin）。

指数决定了分布的形状：alpha 越小，尾部衰减越慢（长尾越明显）；连续幂律分布要能够归一化，需要满足 alpha 大于 1。

最小阈值 xmin 表示数据的截断点，即分布的起始点。

In [None]:
import powerlaw  # powerlaw fits power-law distributions to data
def plotPowerlaw(data, ax, col, xlab, xmin=2):
    """Draw the empirical PDF of ``data`` and its fitted power law on ``ax``.

    Args:
        data: observations to fit.
        ax: Matplotlib axes that receives both curves, the labels and the
            legend.
        col: colour used for both curves.
        xlab: x-axis label.
        xmin: lower cut-off handed to ``powerlaw.Fit``. Defaults to 2, the
            value that used to be hard-coded.
            NOTE(review): per the powerlaw documentation, ``None`` lets the
            package estimate xmin itself - confirm before relying on it.
    """
    fit = powerlaw.Fit(data, xmin=xmin)

    # Empirical probability density, drawn on the axes the caller passed in
    # rather than on whatever axes happen to be current.
    fit.plot_pdf(color=col, linewidth=2, ax=ax)
    # Fitted exponent and the cut-off actually used by the fit.
    alpha, fitted_xmin = fit.power_law.alpha, fit.power_law.xmin
    # Fitted power law as a dotted line. alpha is a float, so it is shown
    # with two decimals; '%d' would truncate e.g. 2.7 to 2.
    fit.power_law.plot_pdf(color=col, linestyle='dotted', ax=ax,
                           label=r"$\alpha = %.2f \:\:, x_{min} = %g$" % (alpha, fitted_xmin))
    ax.set_xlabel(xlab, fontsize=20)
    ax.set_ylabel('$Probability$', fontsize=20)
    # Legend carrying the fit parameters, attached to the same axes.
    ax.legend(loc=0, frameon=False)

In [None]:
from collections import defaultdict

# Rebuild the {user: number of tweets} tally from the 'From User' column.
data_dict = defaultdict(int)
for author in df['From User']:
    data_dict[author] = data_dict[author] + 1

In [None]:
# 
import matplotlib.cm as cm
# matplotlib.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# pyplot.get_cmap takes the same (name, lut) arguments and is still available.
# Here the reversed rainbow colormap is resampled to 6 colours.
cmap = plt.get_cmap('rainbow_r', 6)

fig = plt.figure(figsize=(6, 4), facecolor='white')
ax = fig.add_subplot(1, 1, 1)
# cmap(1) picks the second of the six colours.
plotPowerlaw(list(data_dict.values()), ax, cmap(1),
             '$Tweets$')

##### output
/opt/anaconda3/lib/python3.7/site-packages/powerlaw.py:700: RuntimeWarning: invalid value encountered in true_divide
  (Theoretical_CDF * (1 - Theoretical_CDF))
/opt/anaconda3/lib/python3.7/site-packages/powerlaw.py:700: RuntimeWarning: invalid value encountered in true_divide
  (Theoretical_CDF * (1 - Theoretical_CDF))
  
  ![image.png](attachment:image.png)

### 清洗tweets文本

In [None]:
# Sample retweet used by the cleaning examples: it contains a retweet marker,
# user mentions, hashtags and two URLs.
tweet = '''RT @AnonKitsu: ALERT!!!!!!!!!!COPS ARE KETTLING PROTESTERS IN PARK W HELICOPTERS AND PADDYWAGONS!!!! 
            #OCCUPYWALLSTREET #OWS #OCCUPYNY PLEASE @chengjun @mili http://computational-communication.com 
            http://ccc.nju.edu.cn RT !!HELP!!!!'''

In [None]:
#!pip install twitter-text
import re
import twitter_text 
# https://github.com/dryan/twitter-text-py/issues/21
#Macintosh HD ▸ 用户 ▸ datalab ▸ 应用程序 ▸ anaconda ▸ lib ▸ python3.5 ▸ site-packages

In [None]:
import re

tweet = '''RT @AnonKitsu: @who ALERT!!!!!!!!!!COPS ARE KETTLING PROTESTERS IN PARK W HELICOPTERS AND PADDYWAGONS!!!! 
            #OCCUPYWALLSTREET #OWS #OCCUPYNY PLEASE @chengjun @mili http://computational-communication.com 
            http://ccc.nju.edu.cn RT !!HELP!!!!'''

# "RT" or "via" (case-insensitive) followed by one or more @mentions; the
# second group captures the whole run of mentions.
rt_patterns = re.compile(
    r"(RT|via)((?:\b\W*@\w+)+)",
    re.IGNORECASE,
)
first_match = rt_patterns.findall(tweet)[0]
# Raw second group; the next cell trims it with .strip(' @').split(':')[0].
rt_user_name = first_match[1]
rt_user_name

##### output
' @AnonKitsu: @who'

In [None]:
import re

tweet = '''RT @AnonKitsu: @who ALERT!!!!!!!!!!COPS ARE KETTLING PROTESTERS IN PARK W HELICOPTERS AND PADDYWAGONS!!!! 
            #OCCUPYWALLSTREET #OWS #OCCUPYNY PLEASE @chengjun @mili http://computational-communication.com 
            http://ccc.nju.edu.cn RT !!HELP!!!!'''

rt_patterns = re.compile(r"(RT|via)((?:\b\W*@\w+)+)", re.IGNORECASE)
# Second group of the first match: the run of mentions after the marker.
matched_mentions = rt_patterns.findall(tweet)[0][1]
# Trim surrounding spaces and '@', then keep what precedes the first colon.
rt_user_name = matched_mentions.strip(' @').partition(':')[0]
rt_user_name

##### output
'AnonKitsu'

In [None]:
import re

tweet = '''@chengjun:@who ALERT!!!!!!!!!!COPS ARE KETTLING PROTESTERS IN PARK W HELICOPTERS AND PADDYWAGONS!!!! 
            #OCCUPYWALLSTREET #OWS #OCCUPYNY PLEASE @chengjun @mili http://computational-communication.com 
            http://ccc.nju.edu.cn RT !!HELP!!!!'''

# This tweet has no "RT"/"via" directly followed by a mention, so findall
# returns an empty (falsy) list.
rt_patterns = re.compile(
    r"(RT|via)((?:\b\W*@\w+)+)",
    re.IGNORECASE,
)
rt_user_name = rt_patterns.findall(tweet)
print(rt_user_name)

print('it exits.' if rt_user_name else 'None')

##### output
[]
None

In [None]:
import re

def extract_rt_user(tweet):
    """Return the screen name of the user that ``tweet`` retweets, or None.

    A retweet is recognised by a stand-alone "RT" or "via" (case-insensitive)
    followed by one or more @mentions; the first mentioned name is returned
    without the leading '@'.

    Args:
        tweet: tweet text.

    Returns:
        The first screen name after the retweet marker, or ``None`` when the
        tweet contains no such marker.
    """
    # The leading \b keeps "rt"/"via" inside other words (e.g. "heart @bob")
    # from being mistaken for a retweet marker.
    marker = re.search(r"\b(?:RT|via)((?:\b\W*@\w+)+)", tweet, re.IGNORECASE)
    if marker is None:
        return None
    # The captured run holds at least one "@name". Taking the first one
    # explicitly stays correct when several mentions follow the marker
    # without a colon ("RT @a @b") or when punctuation precedes the mention
    # ("RT: @a").
    return re.search(r"@(\w+)", marker.group(1)).group(1)

In [None]:
# A tweet that starts with a retweet marker followed by a mention.
tweet = '''RT @chengjun: ALERT!!!!!!!!!!COPS ARE KETTLING PROTESTERS IN PARK W HELICOPTERS AND PADDYWAGONS!!!! 
            #OCCUPYWALLSTREET #OWS #OCCUPYNY PLEASE @chengjun @mili http://computational-communication.com 
            http://ccc.nju.edu.cn RT !!HELP!!!!'''

extract_rt_user(tweet)

##### output
'chengjun'

In [None]:
# The same tweet without the leading "RT": the only "RT" left is not
# followed by a mention.
tweet = '''@chengjun: ALERT!!!!!!!!!!COPS ARE KETTLING PROTESTERS IN PARK W HELICOPTERS AND PADDYWAGONS!!!! 
            #OCCUPYWALLSTREET #OWS #OCCUPYNY PLEASE @chengjun @mili http://computational-communication.com 
            http://ccc.nju.edu.cn RT !!HELP!!!!'''

print(extract_rt_user(tweet) )

##### output
None

In [None]:
import csv

with open("./data/ows_tweets_sample.txt", 'r') as f:
    chunk = f.readlines()

rt_network = []
# Parse every line after the first one, which is skipped here.
lines = csv.reader(chunk[1:], delimiter=',', quotechar='"')
# Keep columns 1 and 8 of each record; the following cell unpacks each pair
# as (tweet, user).
tweet_user_data = []
for row in lines:
    tweet_user_data.append((row[1], row[8]))
tweet_user_data[:3]

##### output

[

('RT @AnonKitsu: ALERT!!!!!!!!!!COPS ARE KETTLING PROTESTERS IN PARK W HELICOPTERS AND PADDYWAGONS!!!! #OCCUPYWALLSTREET #OWS #OCCUPYNY PLEASE RT !!HELP!!!!',
  'Anonops_Cop'),
  
 ('@jamiekilstein @allisonkilkenny Interesting interview (never aired, wonder why??) by Fox with #ows protester http://t.co/Fte55Kh7',
  'KittyHybrid'),
  
 ("@Seductivpancake Right! Those guys have a victory condition: regime change. #ows doesn't seem to have a goal I can figure out.",
  'nerdsherpa')]

In [None]:
from collections import defaultdict

# rt_network lists one (user, retweeted user) edge per retweet;
# rt_dict counts how often each edge occurs.
rt_network = []
rt_dict = defaultdict(int)
for tweet, user in tweet_user_data:
    rt_user = extract_rt_user(tweet)
    if not rt_user:
        # Not a retweet: nothing to record.
        continue
    edge = (user, rt_user)
    rt_network.append(edge)
    rt_dict[edge] += 1
list(rt_dict.items())[:3]

##### output
[(('Anonops_Cop', 'AnonKitsu'), 1),

 (('hamudistan', 'bembel'), 1),
 
 (('vickycrampton', 'TheNewDeal'), 2)]

### 获取清洗过的推特文本

In [2]:
def extract_tweet_text(tweet, at_names, urls):
    """Return ``tweet`` without URLs, user mentions and markup symbols.

    Args:
        tweet: raw tweet text.
        at_names: mentioned screen names, without the leading '@'.
        urls: URLs that occur in the tweet.

    Returns:
        The cleaned text, with every run of whitespace collapsed to a single
        space and no leading or trailing whitespace.
    """
    import re  # local import: the function does not rely on a module-level one

    # URLs go first, so that a screen name occurring inside a URL cannot
    # mangle the URL before it is matched.
    for url in urls:
        tweet = tweet.replace(url, '')
    # Drop the retweet marker where it introduces a mention.
    tweet = re.sub(r'\bRT (?=@)', '', tweet)
    # Remove whole "@name" mentions only. Replacing the bare name would also
    # cut it out of unrelated words ("mili" inside "military").
    for name in at_names:
        tweet = re.sub('@' + re.escape(name) + r'(?!\w)', '', tweet)
    for mark in ('@', '&quot;', '#'):
        tweet = tweet.replace(mark, '')
    # Collapse newlines, tabs and repeated spaces into single spaces;
    # deleting them outright would glue neighbouring words together.
    return ' '.join(tweet.split())

In [None]:
!pip install twitter-text

In [None]:
import twitter_text

tweet = '''RT @AnonKitsu: ALERT!!!!!!!!!!COPS ARE KETTLING PROTESTERS IN PARK W HELICOPTERS AND PADDYWAGONS!!!! 
            #OCCUPYWALLSTREET #OWS #OCCUPYNY PLEASE @chengjun @mili https://computational-communication.com 
            http://ccc.nju.edu.cn RT !!HELP!!!!'''

# Let twitter_text pull out mentions, URLs and hashtags, then reuse the two
# helpers for the retweeted user and the cleaned text.
ex = twitter_text.Extractor(tweet)
at_names, urls, hashtags = (
    ex.extract_mentioned_screen_names(),
    ex.extract_urls(),
    ex.extract_hashtags(),
)
rt_user = extract_rt_user(tweet)
tweet_text = extract_tweet_text(tweet, at_names, urls)

print(at_names, urls, hashtags, rt_user,'-------->', tweet_text)

##### output
['AnonKitsu', 'chengjun', 'mili'] ['https://computational-communication.com', 'http://ccc.nju.edu.cn'] ['OCCUPYWALLSTREET', 'OWS', 'OCCUPYNY'] AnonKitsu --------> : ALERT!!!!!!!!!!COPS ARE KETTLING PROTESTERS IN PARK W HELICOPTERS AND PADDYWAGONS!!!! OCCUPYWALLSTREET OWS OCCUPYNY PLEASE RT !!HELP!!!!

In [None]:
import csv

# Collect column 1 of every parsed record.
# NOTE(review): unlike the cell that builds tweet_user_data (which parses
# chunk[1:]), this one parses chunk from its first line, so if that line is
# a header row it ends up in tweets as well - confirm whether that is
# intended.
lines = csv.reader(chunk, delimiter=',', quotechar='"')
tweets = []
for row in lines:
    tweets.append(row[1])

In [None]:
# For the first five entries, print the mentions, URLs, hashtags and the
# retweeted user (the cleaned text is not computed in this cell).
for tweet in tweets[:5]:
    ex = twitter_text.Extractor(tweet)
    at_names, urls, hashtags = (
        ex.extract_mentioned_screen_names(),
        ex.extract_urls(),
        ex.extract_hashtags(),
    )
    rt_user = extract_rt_user(tweet)

    print(at_names, urls, hashtags, rt_user)

##### output

[] [] [] None
['AnonKitsu'] [] ['OCCUPYWALLSTREET', 'OWS', 'OCCUPYNY'] AnonKitsu
['jamiekilstein', 'allisonkilkenny'] ['http://t.co/Fte55Kh7'] ['ows'] None
['Seductivpancake'] [] ['ows'] None
['bembel'] ['http://j.mp/rhHavq'] ['OccupyWallStreet', 'OWS'] bembel