/
20130105_2
63 lines (45 loc) · 1.43 KB
/
20130105_2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import nltk
import cPickle
# Load the pickled word list and build a frequency distribution over it.
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# you created yourself or otherwise trust.
# The file is opened in the same (default) mode as before, but through a
# context manager so the handle is closed even if unpickling fails.
with open("myData.pickle") as pickle_file:
    words = cPickle.load(pickle_file)

freq_dist = nltk.FreqDist(words)

# Slice the first 50 keys of the distribution. The value is only echoed in an
# interactive session; when run as a script it is computed and discarded.
freq_dist.keys()[:50]
import re
# Matches an attribution marker ("RT" or "via", case-insensitively) followed
# by one or more @mentions. Group 1 is the marker; group 2 is the run of
# mentions, including whatever non-word characters precede each "@".
rt_patterns = re.compile(r"(RT|via)((?:\b\W*@\w+)+)", re.IGNORECASE)

# One sample per attribution style: a leading "RT @user" and a trailing
# "(via @user)".
example_tweets = [
    "RT @SocialWebMining Justin Bieber is on SNL 2nite. w00t?!?",
    "Justin Bieber is on SNL 2nite. w00t?!? (via @SocialWebMining)",
]

# findall() yields a list of (marker, mentions) tuples per tweet. The result
# is only echoed in an interactive session; in a script it is discarded.
for t in example_tweets:
    rt_patterns.findall(t)
import networkx as nx
import re
# Directed graph that the retweet edges are added to further down.
g = nx.DiGraph()

# Flatten the paged search results into a single list of tweets, keeping the
# page order and the order of tweets within each page.
all_tweets = []
for page in search_results:
    all_tweets.extend(page["results"])
def get_rt_sources(tweet):
    """Return the @mentions a tweet credits through "RT" or "via" markers.

    Args:
        tweet: The tweet text to scan.

    Returns:
        A list with one whitespace-stripped string per attribution found, in
        order of appearance. Each string is the whole run of mentions that
        follows a marker, so "RT @a @b" yields ["@a @b"]. The list is empty
        when the text contains no attribution.
    """
    # NOTE(review): the pattern has no word boundary in front of the marker,
    # so "rt"/"via" at the end of a longer word directly before a mention
    # (e.g. "smart @bob") also counts as an attribution. Left unchanged here.
    pattern = re.compile(r"(RT|via)((?:\b\W*@\w+)+)", re.IGNORECASE)

    # Keep only capture group 2 (the mentions). Filtering group 1 by comparing
    # it with the literal strings "RT" and "via" missed other casings that the
    # case-insensitive pattern accepts, so "rt @x" used to return "rt" as well.
    return [mentions.strip() for _marker, mentions in pattern.findall(tweet)]
# Add one edge per attribution, pointing from the credited source to the user
# who posted the tweet, and tag the edge with the tweet's id.
# NOTE(review): get_rt_sources() returns sources with their leading "@";
# if "from_user" carries no "@", the same account ends up as two different
# nodes -- confirm against the search result schema.
for tweet in all_tweets:
    # An empty result simply means the inner loop does not run, so no
    # separate guard is needed for tweets without attribution.
    rt_sources = get_rt_sources(tweet["text"])
    for rt_source in rt_sources:
        # Edge data is passed as a keyword argument: that form is accepted by
        # add_edge() in every networkx release, whereas a positional
        # attribute dict is only accepted by the 1.x series.
        g.add_edge(rt_source, tweet["from_user"], tweet_id=tweet["id"])
# Summary statistics for the retweet graph. Each value is only echoed in an
# interactive session; when run as a script it is computed and discarded.
g.number_of_nodes()
g.number_of_edges()

# First edge together with its data dict, or None when the graph has no
# edges. Going through iter() works whether edges() returns a list or a view.
next(iter(g.edges(data=True)), None)

# Number of connected components once edge direction is ignored. The
# dedicated counter avoids calling len() on what may be a generator.
nx.number_connected_components(g.to_undirected())

# Ascending list of node degrees. dict() accepts both a plain dict and an
# iterable of (node, degree) pairs, whichever nx.degree() returns.
sorted(dict(nx.degree(g)).values())
import networkx as nx
import re
import codecs
# Path of the Graphviz DOT file to produce.
OUT = "snl_search_results.dot"

try:
    # Preferred path: let networkx serialise the graph.
    # NOTE(review): presumably this raises ImportError when the optional
    # Graphviz bindings are missing; newer networkx releases expose write_dot
    # under nx.drawing.nx_pydot / nx_agraph instead -- confirm for the
    # installed version.
    nx.drawing.write_dot(g, OUT)
except ImportError:
    # Fallback: emit the DOT text by hand, one statement per edge, carrying
    # the tweet id as an edge attribute.
    dot = ['"%s" -> "%s" [tweet_id=%s]' % (n1, n2, g[n1][n2]['tweet_id'])
           for n1, n2 in g.edges()]
    # The context manager closes the file even if the write fails.
    with codecs.open(OUT, 'w', 'utf-8') as f:
        f.write('strict digraph {\n%s\n}' % (';\n'.join(dot),))