-
Notifications
You must be signed in to change notification settings - Fork 0
/
trump_scrape.py
95 lines (78 loc) · 2.75 KB
/
trump_scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import numpy as np
import pandas as pd
import requests
import bs4
import os
# Site root and per-candidate speech listings on the UCSB American
# Presidency Project (2016 election speech archive).
base_url = "http://www.presidency.ucsb.edu/"
trump_speech_list = "http://www.presidency.ucsb.edu/2016_election_speeches.php?candidate=45&campaign=2016TRUMP&doctype=5000"
hillary_speech_list = "http://www.presidency.ucsb.edu/2016_election_speeches.php?candidate=70&campaign=2016CLINTON&doctype=5000"

# Ensure the output directory exists before writing any speech files.
if not os.path.isdir('trump_data'):
    os.mkdir('trump_data')

# Fetch and parse the listing page.
res = requests.get(trump_speech_list)
scrape = bs4.BeautifulSoup(res.text, 'lxml')

# Each listing row links to one speech: collect (title, relative href) pairs.
elems = scrape.select('td.listdate a')
links = [(e.text, e.attrs['href']) for e in elems]

# Download every speech, write it to its own file, and keep the text in
# memory for the aggregate outputs below.
speeches = []
for idx, link in enumerate(links):
    print("Grabbing speech: ", idx + 1)
    # Listing hrefs start with "../"; strip that before joining to the root.
    url = base_url + link[1][3:]
    res = requests.get(url)
    scrape = bs4.BeautifulSoup(res.text, 'lxml')
    # The full transcript lives in the first span.displaytext element.
    # Keep it as str and let open() handle encoding (the original
    # .encode('utf-8') + text-mode write breaks on Python 3).
    speech = scrape.select('span.displaytext')[0].text
    speeches.append(speech)
    with open(os.path.join("trump_data",
                           "speech_" + str(idx) + ".txt"),
              "w", encoding="utf-8") as text_file:
        text_file.write(link[0])
        text_file.write('\n')
        text_file.write(speech)
        text_file.write('\n')

# Tabular view of everything scraped: title, absolute link, full text.
d = {'title': pd.Series([l[0] for l in links]),
     'link': pd.Series([base_url + l[1][3:] for l in links]),
     'speech': pd.Series(speeches)}
df = pd.DataFrame(d)

# Fix: the original wrote to a "data/" directory that is never created
# (raising on a fresh run); write the aggregate alongside the per-speech
# files instead.
with open(os.path.join("trump_data", "full_speech.txt"), "w",
          encoding="utf-8") as txt:
    for s in speeches:
        txt.write("%s\n" % s)
# Ensure the output directory exists before writing any speech files.
if not os.path.isdir('hillary_data'):
    os.mkdir('hillary_data')

# Fetch and parse the listing page.
res = requests.get(hillary_speech_list)
scrape = bs4.BeautifulSoup(res.text, 'lxml')

# Each listing row links to one speech: collect (title, relative href) pairs.
elems = scrape.select('td.listdate a')
links = [(e.text, e.attrs['href']) for e in elems]

# Download every speech, write it to its own file, and keep the text in
# memory for the aggregate outputs below.
speeches = []
for idx, link in enumerate(links):
    print("Grabbing speech: ", idx + 1)
    # Listing hrefs start with "../"; strip that before joining to the root.
    url = base_url + link[1][3:]
    res = requests.get(url)
    scrape = bs4.BeautifulSoup(res.text, 'lxml')
    # The full transcript lives in the first span.displaytext element.
    # Keep it as str and let open() handle encoding (the original
    # .encode('utf-8') + text-mode write breaks on Python 3).
    speech = scrape.select('span.displaytext')[0].text
    speeches.append(speech)
    with open(os.path.join("hillary_data",
                           "speech_" + str(idx) + ".txt"),
              "w", encoding="utf-8") as text_file:
        text_file.write(link[0])
        text_file.write('\n')
        text_file.write(speech)
        text_file.write('\n')

# Tabular view of everything scraped: title, absolute link, full text.
d = {'title': pd.Series([l[0] for l in links]),
     'link': pd.Series([base_url + l[1][3:] for l in links]),
     'speech': pd.Series(speeches)}
df = pd.DataFrame(d)

# Fix: the original wrote to a "data/" directory that is never created,
# and reused the same data/full_speech.txt path as the Trump pass (so one
# aggregate would clobber the other even if data/ existed). Write inside
# this candidate's own directory instead.
with open(os.path.join("hillary_data", "full_speech.txt"), "w",
          encoding="utf-8") as txt:
    for s in speeches:
        txt.write("%s\n" % s)