/
generateSponsors.js
122 lines (110 loc) · 3.36 KB
/
generateSponsors.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
require('dotenv').config()
const keys = require('lodash/keys')
const db = require('monk')(process.env.MONGO_DB)
const posts = db.get('posts')
const rp = require('request-promise')
const moment = require('moment')
const async = require('async')
const request = require('request')
const jsdom = require('jsdom');
const { JSDOM } = jsdom
// Age threshold in days: a post with no detectable sponsor section that is
// older than this gets its `sponsorsContent` field set to null.
const SET_NULL_AFTER_DAYS = 30
// Passed to async.queue as its concurrency argument.
const CONCURRENCY = 5
// Incremented after each successful database update; used to compute the
// progress percentage that is logged as tasks finish.
let counter = 0
/**
 * Cuts the sponsor section out of a post's HTML.
 *
 * The page is split on an `<h3>Sponsors</h3>` heading, falling back to
 * `<h2>Sponsors</h2>` when the h3 form is absent. The heading must occur
 * exactly once; otherwise the post is treated as having no sponsor section.
 *
 * @param {string} body - Raw HTML of the post page.
 * @returns {?string} The sponsor markup with whitespace between tags removed,
 *   truncated at the layout marker that follows it, or null when not found.
 */
function extractSponsorSection(body) {
  // Markup that follows the sponsor section; everything from here on is dropped.
  const sectionEnd = '</div></div><div class="col-xs-12 col-md-6 col-lg-3">';

  let parts = body.split('<h3>Sponsors</h3>');
  if (parts.length === 1) {
    parts = body.split('<h2>Sponsors</h2>');
  }
  if (parts.length !== 2) {
    return null;
  }

  // Collapse whitespace between tags so the end marker matches reliably.
  const collapsed = parts[1].trim().replace(/>\s+</g, '><');
  return collapsed.split(sectionEnd)[0];
}

/**
 * Builds the sponsor list from the sponsor section markup.
 *
 * @param {string} sectionHtml - Markup returned by extractSponsorSection.
 * @returns {Array<{image: string, url: string}>} One entry per `<img>`:
 *   `image` is the `src` without its query string, `url` is the `href` of the
 *   image's parent element (empty string when the attribute is missing).
 */
function parseSponsors(sectionHtml) {
  const { document } = new JSDOM(sectionHtml).window;
  return Array.from(document.querySelectorAll('img'), (img) => {
    const imageSrc = img.getAttribute('src') || '';
    return {
      image: imageSrc.split('?')[0],
      url: img.parentElement.getAttribute('href') || '',
    };
  });
}

/**
 * Work queue that fetches each post's page and stores its sponsor data.
 *
 * For every pushed post the worker downloads `post.link` and then either:
 *  - stores `sponsorsContent` and `sponsors` when a sponsor section is found,
 *  - sets `sponsorsContent` to null when none is found and the post is older
 *    than SET_NULL_AFTER_DAYS days, or
 *  - skips the post when none is found and the post is still recent.
 *
 * Request, parsing and database errors are handed to the task callback
 * instead of being reported as a successful task.
 */
const q = async.queue(function(post, callback) {
  // Captured before the request so the age check is not skewed by latency.
  const startedAt = moment.utc();

  request({
    uri: post.link,
  }, function(error, response, body) {
    if (error) {
      callback(error);
      return;
    }

    let fieldsToSet = null;
    let logPrefix = '';
    try {
      const sponsorsContent = extractSponsorSection(body);
      if (sponsorsContent !== null) {
        fieldsToSet = {
          sponsorsContent,
          sponsors: parseSponsors(sponsorsContent),
        };
        logPrefix = 'success updating';
      } else if (startedAt.diff(moment(post.date), 'days') > SET_NULL_AFTER_DAYS) {
        fieldsToSet = { sponsorsContent: null };
        logPrefix = 'Update null to: ';
      }
    } catch (e) {
      callback(e);
      return;
    }

    if (fieldsToSet === null) {
      // Posts without sponsors but younger than 30 days:
      console.log('Skipped post without sponsors, younger than 30 days:', post.title["rendered"]);
      callback();
      return;
    }

    posts.update({ id: post.id }, { $set: fieldsToSet })
      .then(() => {
        counter++;
        console.log(logPrefix, post.title["rendered"]);
      })
      // Separate step so the callback runs exactly once: update/logging
      // failures go to the rejection handler, and a throw from the success
      // callback cannot trigger a second invocation.
      .then(() => callback(), callback);
  });
}, CONCURRENCY);
// Runs when the queue has drained: report completion and close the database
// connection.
// NOTE(review): assigning `drain` as a property assumes an `async` release
// that supports that form — confirm against the installed version.
q.drain = () => {
  console.log('all items have been processed');
  db.close();
};
// Entry point: count the posts that have no `sponsors` field yet and push
// each of them onto the queue. Progress is logged as tasks complete.
const pendingQuery = { sponsors: { $exists: false } };

posts.count(pendingQuery)
  .then((c) => {
    if (c > 0) {
      // Returned so a cursor failure reaches the rejection handler below.
      return posts.find(pendingQuery)
        .each((post) => {
          q.push(post, (err) => {
            if (err) {
              console.log(err);
            } else {
              const progress = Math.round((counter / c) * 100);
              console.log(`PROGRESS: ${progress}%`);
            }
          });
        });
    }
    console.log("All posts updated.");
    process.exit();
  })
  .catch((err) => {
    // Without this handler a failed count/find is an unhandled rejection and
    // the open connection keeps the process alive.
    console.error(err);
    process.exitCode = 1;
    // If tasks are still running, the drain handler closes the connection.
    if (q.idle()) {
      db.close();
    }
  });