projectnew.py
#!/usr/bin/env python3
# coding: utf-8
# SPDX-License-Identifier: AGPL-3.0-or-later
# Copyright 2020 AntiCompositeNumber
import flask
import pywikibot
import pywikibot.data.mysql
import pywikibot.data.api
from datetime import timedelta, datetime
from collections import defaultdict
from typing import List, NamedTuple, Iterator, Tuple, Dict

bp = flask.Blueprint("projectnew", __name__, url_prefix="/projectnew")
site = pywikibot.Site("en", "wikipedia")
Article = NamedTuple("Article", [("title", str), ("quality", str), ("author", str)])

# Test if running from Wikimedia Cloud Services (Toolforge)
# If true, a database connection can be assumed
try:
    f = open("/etc/wmcs-project")
except FileNotFoundError:
    wmcs = False
else:
    f.close()
    wmcs = True


def get_new_category_pages(
    category: pywikibot.Category,
    period: timedelta = timedelta(days=-7),
    namespaces: List[int] = [1],
) -> List[Tuple[pywikibot.page.BasePage, datetime]]:
    """Return a list of pages recently added to a category,
    ordered by addition date. Switches between API and DB connection.

    args:
        category -- pywikibot.Category that should be checked
    kwargs:
        period -- negative datetime.timedelta, default timedelta(days=-7)
            pages are listed between (now + period) and now
        namespaces -- list of ints of talk namespaces (default: [1])
    """
    server_day = site.server_time().replace(hour=0, minute=0, second=0, microsecond=0)
    start_time = server_day + period
    end_time = server_day
    try:
        pages = list(
            _db_get_new_category_pages(category, start_time, end_time, namespaces)
        )
    except ConnectionError:
        pages = list(
            _api_get_new_category_pages(category, start_time, end_time, namespaces)
        )
    return pages
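
# Example usage (the category name here is hypothetical; any category of talk
# pages works):
#
#     cat = pywikibot.Category(site, "Category:Example WikiProject articles")
#     recent = get_new_category_pages(cat, period=timedelta(days=-3))
#     # -> [(Page("Talk:Some article"), Timestamp(...)), ...]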


def _api_get_new_category_pages(
    category: pywikibot.Category,
    start_time: pywikibot.Timestamp,
    end_time: pywikibot.Timestamp,
    namespaces: List[int],
) -> Iterator[Tuple[pywikibot.page.BasePage, pywikibot.Timestamp]]:
    """Use the API to list category pages. Called by get_new_category_pages()."""
    for row in pywikibot.data.api.ListGenerator(
        "categorymembers",
        site=site,
        cmtitle=category.title(underscore=True, with_ns=True),
        cmprop="title|type|timestamp",
        cmnamespace="|".join(str(n) for n in namespaces),
        cmtype="page",
        cmsort="timestamp",
        cmstart=start_time.isoformat(),
        cmend=end_time.isoformat(),
    ):
        if row.get("type", "page") != "page":
            continue
        yield (
            pywikibot.Page(site, title=row.get("title", ""), ns=row.get("ns", "")),
            pywikibot.Timestamp.fromISOformat(row.get("timestamp")),
        )
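
# Roughly equivalent to an API request like (illustrative, parameters abridged):
#     https://en.wikipedia.org/w/api.php?action=query&list=categorymembers
#         &cmtitle=Category:...&cmnamespace=1&cmtype=page&cmsort=timestamp
#         &cmstart=...&cmend=...&cmprop=title|type|timestamp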


def _db_get_new_category_pages(
    category: pywikibot.Category,
    start_time: pywikibot.Timestamp,
    end_time: pywikibot.Timestamp,
    namespaces: List[int],
) -> Iterator[Tuple[pywikibot.page.BasePage, datetime]]:
    """Use the DB to list category pages. Called by get_new_category_pages()."""
    if not wmcs:
        raise ConnectionError
    query = (
        "SELECT page_namespace, page_title, cl_timestamp "
        "FROM "
        "  categorylinks "
        "  JOIN page ON page_id = cl_from "
        "WHERE "
        '  cl_to = "{catname}" AND '
        '  cl_type = "page" AND '
        "  cl_timestamp >= {start_timestamp} AND "
        "  cl_timestamp < {end_timestamp} AND "
        "  page_namespace in ({nslist}) "
        "ORDER BY cl_timestamp "
    ).format(
        catname=category.title(underscore=True, with_ns=False),
        start_timestamp=start_time.totimestampformat(),
        end_timestamp=end_time.totimestampformat(),
        nslist=", ".join(str(n) for n in namespaces),
    )
    for ns, title, ts in pywikibot.data.mysql.mysql_query(query, dbname=site.dbName()):
        yield (
            pywikibot.Page(site, title=title.decode(encoding="utf-8"), ns=ns),
            ts,
        )


def filter_pages(
    pages: List[Tuple[pywikibot.page.BasePage, pywikibot.Timestamp]],
    redirects=False,
    deleted=False,
) -> List[Tuple[pywikibot.page.BasePage, pywikibot.Timestamp]]:
    """Convert talk pages to subject pages, dropping redirects and deleted
    pages unless requested."""
    filtered = []
    for page, ts in pages:
        if page.isTalkPage():
            page = page.toggleTalkPage()
        if not redirects:
            if page.isRedirectPage():
                continue
        if not deleted:
            if not page.exists():
                continue
        filtered.append((page, ts))
    return filtered


def get_article_metadata(page: pywikibot.page.BasePage) -> Article:
    """Return the article creator and content assessment class as a NamedTuple."""
    return Article(
        title=page.title(),
        quality=get_article_quality(page),
        author=page.oldest_revision.user,
    )


def get_article_quality(page: pywikibot.page.BasePage) -> str:
    """Get the content assessment class from a WikiProject template."""
    if not page.isTalkPage():
        page = page.toggleTalkPage()
    classes = []
    # Known assessment classes, ordered from highest (FA/FL) to lowest quality
    valid_classes = {
        "fa": 1,
        "fl": 1,
        "a": 2,
        "ga": 3,
        "b": 4,
        "c": 5,
        "start": 6,
        "stub": 7,
        "list": 8,
        "draft": 8,
    }
    # templatesWithParams() yields (template, parameters) tuples; look for a
    # class= parameter with a recognised value
    for value in page.templatesWithParams():
        for para in value[1]:
            key, sep, value = para.partition("=")
            if sep and key.lower() == "class" and value.lower() in valid_classes:
                classes.append(value.lower())
    # If multiple projects assess the article differently, use the highest rating
    classes = sorted(classes, key=valid_classes.__getitem__)
    if classes:
        return classes[0]
    else:
        return "unassessed"


def main(cat_name: str):
    """Collect metadata for new category members, grouped by the date added."""
    category = pywikibot.Category(site, cat_name)
    raw_pages = get_new_category_pages(category)
    pages = filter_pages(raw_pages)
    output: Dict[str, List[Article]] = defaultdict(list)
    for page, ts in pages:
        data = get_article_metadata(page)
        date = ts.date().isoformat()
        output[date].append(data)
    return output
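
# The returned mapping looks roughly like (illustrative values):
#     {"2020-06-01": [Article(title="Some article", quality="ga", author="Example")]}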
@bp.route("/api/json/<category>")
def api_json(category):
pages = main(category)
# convert namedtuples to dicts, since jsonify would convert them to tuples
return flask.jsonify(
{key: [item._asdict() for item in value] for key, value in pages.items()}
)
@bp.route("/api/wikitext/<category>")
def api_wikitext(category):
pages = main(category)
wikitext = ""
for date, values in pages.items():
wikitext += f"'''{date}'''\n"
for article in values:
wikitext += (
"* {{Article status"
f"|{article.quality}|{article.title}|{article.author}"
"}}\n"
)
return wikitext
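
# Minimal sketch of wiring this blueprint into a Flask app (the module and app
# names below are assumptions, not defined in this file):
#
#     import flask
#     import projectnew
#
#     app = flask.Flask(__name__)
#     app.register_blueprint(projectnew.bp)
#
# GET /projectnew/api/json/<category> then returns the new articles grouped by
# the date they were added to the category, and /projectnew/api/wikitext/<category>
# returns the same data as {{Article status}} list entries.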