# Amy Finnegan
# HW7
# libraries for database
import sqlalchemy
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, ForeignKey, and_, or_, func
from sqlalchemy.orm import relationship, backref, sessionmaker, aliased
from sqlalchemy.types import TypeDecorator, Unicode
# libraries for web crawler
from nltk.util import clean_html  # note: works on NLTK 2.x; removed in NLTK 3
import urllib2
import time
from bs4 import BeautifulSoup

# Print the SQLAlchemy version for reference
print sqlalchemy.__version__

# Connect to the local SQLite database; the file doesn't have to exist yet.
# Use :memory: instead of a file name to just try things out in memory.
engine = sqlalchemy.create_engine('sqlite:///HW7.db', echo=False)
# echo=True prints the generated SQL for debugging
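# For a throwaway run, the in-memory variant looks like:
# engine = sqlalchemy.create_engine('sqlite:///:memory:', echo=False)
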
Base = declarative_base()
# Parent -- sources have many posts, a "has many" relationship
class Source(Base):
    __tablename__ = "source"
    id = Column(Integer, primary_key=True)
    name = Column(String)
    site_url = Column(String)
    # backref makes the relationship two-way: each Post also gets a .source attribute
    posts = relationship("Post", backref="source")

    def __init__(self, name, site_url):
        self.name = name
        self.site_url = site_url

    def __repr__(self):
        return "Site crawled: %s, <%s>" % (self.name, self.site_url)

# Child -- define the schema, straight from the documentation; inherit from declarative_base()
class Post(Base):
    __tablename__ = 'scrapes'
    # Posts need a surrogate id column because post attributes (title, etc.) are not unique
    id = Column(Integer, primary_key=True)  # increments in creation order, stays unique
    publish_date = Column(String)
    url = Column(String)
    post_title = Column(String)
    comment_count = Column(Integer)
    source_id = Column(Integer, ForeignKey("source.id"))  # points at the source table's id column

    def __init__(self, publish_date, url, post_title, comment_count, source=None):
        self.publish_date = publish_date
        self.url = url
        self.post_title = post_title
        self.comment_count = comment_count
        self.source = source  # attach to a Source right away if one was passed in

    def __repr__(self):
        return "%s, posted on %s" % (self.post_title, self.publish_date)

# Create the tables on the first run
Base.metadata.create_all(engine)
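# (create_all is safe to re-run: it only creates tables that don't already exist)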

# Create a session to actually store things in the db
Session = sessionmaker(bind=engine)
session = Session()
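
# A quick sanity check of the one-to-many relationship (illustrative only:
# the names here are made up and the rows are rolled back, so nothing persists)
demo_source = Source("demo", "http://example.com")
demo_post = Post("2013-01-01", "http://example.com/post", "demo post", 0, source=demo_source)
session.add(demo_source)  # demo_post is cascaded in through the relationship
print demo_source.posts   # the posts relationship collects the children
print demo_post.source    # the backref lets each post reach its source
session.rollback()        # discard the demo rows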

## Crawling ##

# Build the list of pages to scrape: the front page plus /page/2 through /page/5
pages_to_scrape = []
first = 'http://onlinecrackerbarrel.wordpress.com'
pages_to_scrape.append(first)
for i in range(2, 6):
    pages_to_scrape.append(first + "/page/" + str(i))

for i in range(len(pages_to_scrape)):
    time.sleep(2)  # be polite: pause between requests
    page_to_scrape = pages_to_scrape[i]
    # Record one Source row per page crawled
    s = Source("onlinecrackerbarrel", page_to_scrape)
    session.add(s)
    session.commit()
    # Open the webpage
    webpage = urllib2.urlopen(page_to_scrape)
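    # (the urlopen call above raises HTTPError/URLError on failure; a
    # longer-running crawler would wrap it in try/except, omitted here)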
    # Parse it
    soup = BeautifulSoup(webpage.read())
    # print soup.prettify() to eyeball the markup if the selectors below miss
    # Extract all post titles, publish dates, comment counts, and urls
    post_title = soup.findAll("h2", attrs={'class': 'entry-title'})
    publish_date = soup.findAll("div", attrs={'class': 'postMeta'})
    comment_count = soup.findAll("p", attrs={'class': 'container'})
    urls = soup.findAll("h2", class_="entry-title")  # same nodes as post_title; class_= is bs4 shorthand
    for j in range(len(post_title)):  # j avoids shadowing the page counter i; covers however many posts the page shows
        date = publish_date[j]
        d = clean_html(str(date.find("span", attrs={'class': 'date'})))
        d_clean = d.decode('utf-8')
        url = urls[j]
        u_clean = url('a')[0]['href']  # bs4 attribute values are already unicode
        post = post_title[j]
        p = clean_html(str(post.find("a")))
        p_clean = p.decode('utf-8')
        comment = comment_count[j]
        c = clean_html(str(comment.find("span", attrs={'class': 'comments'})))
        c_clean = c.decode('utf-8')
        # c_clean arrives as text (e.g. "3 Comments"); SQLite accepts it despite the Integer column
        entry = Post(d_clean, u_clean, p_clean, c_clean, source=s)
        session.add(entry)

# Persist all of this information
session.commit()

# Inspect the table metadata
print Source.__table__
print Post.__table__
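
# func (imported above) enables SQL aggregates; as a quick sketch, count how
# many posts were captured per crawled page:
for site_url, n in session.query(Source.site_url, func.count(Post.id)).join(Post).group_by(Source.id):
    print "{0}: {1} posts".format(site_url, n)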

# Show me all the post titles with id and source info
for post in session.query(Post).order_by(Post.id):
    print post.id, post.source, "{0}".format(post.post_title.encode('ascii', 'ignore'))

# Order by post publish date (the dates are stored as strings, so this
# sorts alphabetically rather than chronologically)
for post in session.query(Post).order_by(Post.publish_date):
    print "{0}: {1}".format(post.publish_date.encode('ascii', 'ignore'), post.post_title.encode('ascii', 'ignore'))

# Show me the sources
for source in session.query(Source):
    print "{0}: {1}".format(source.name.encode('ascii', 'ignore'), source.site_url.encode('ascii', 'ignore'))

# Query all posts from one source; the join supplies the Post/Source match
# condition (without it the query degenerates into a cross join and returns
# every post paired with every source)
for post in session.query(Post).join(Source).filter(Source.site_url.like("%page/5%")):
    print "{0}: {1}".format(post.post_title.encode('ascii', 'ignore'), post.source.site_url)