# Run scraper #1
# NOTE: GitHub flagged this file as containing bidirectional Unicode text that
# may be interpreted or compiled differently than it appears below. To review,
# open the file in an editor that reveals hidden Unicode characters.
# Learn more: see GitHub's documentation on bidirectional Unicode characters.
# This is a copy of the workflow file run by GitHub Actions.
#
# Purpose: manually triggered job that installs the scraper's Python
# dependencies and NLP resources, runs `scraper.py`, then commits the
# contents of `scraper/tools/` and `scraper/data/` back to `main`.
name: Run scraper

on:
  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:

jobs:
  build:
    # cf. https://docs.github.com/en/github/setting-up-and-managing-billing-and-payments-on-github/about-billing-for-github-actions
    runs-on: ubuntu-latest
    # Raised above the 360-minute (6 h) default, which is not sufficient for
    # NLP preprocessing.
    # NOTE(review): GitHub-hosted runners are documented to cap a job at 6 h
    # regardless of this value -- confirm, or move to a self-hosted runner.
    timeout-minutes: 600
    defaults:
      run:
        # Applies to `run:` steps only; `uses:` steps are not affected.
        working-directory: ./scraper
    steps:
      - name: Checkout the repository
        # Pinned to a release tag rather than the moving `main` branch so an
        # upstream change to the action cannot silently alter this workflow.
        uses: actions/checkout@v4

      - name: Increase git buffer size
        run: |
          git config --global http.postBuffer 1048576000

      - name: Pull latest changes
        run: |
          git config pull.rebase true
          git pull origin main

      - name: Set up Python 3.9
        uses: actions/setup-python@v4
        with:
          # Quoted so YAML keeps the version a string (an unquoted 3.10 would
          # be read as the float 3.1).
          python-version: "3.9"

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Download spacy models
        run: |
          python -m spacy download en_core_web_sm
          python -m spacy download fr_core_news_sm
          python -m spacy download de_core_news_sm
          python -m spacy download it_core_news_sm

      # NOTE(review): despite the step name this fetches every NLTK package
      # (`all`), not only the stopwords corpus -- confirm whether the scraper
      # needs more than `stopwords` before narrowing it.
      - name: Download nltk stopwords
        run: |
          python -m nltk.downloader all

      - name: Run Python script
        run: |
          python scraper.py
        # Step-level limit; see the note on the job-level timeout above.
        timeout-minutes: 600
        # Set the secrets as env var
        env:
          CLIENT_SECRET: ${{ secrets.CLIENT_SECRET }}

      # If main is protected (in repo settings) pushing will fail
      - name: Push log files to repo
        uses: github-actions-x/commit@v2.9
        with:
          push-branch: "main"
          commit-message: "Scraper done"
          force-add: "true"
          # Path is relative to the repository root, not `working-directory`.
          files: scraper/tools/
          name: autoupdate

      # If main is protected (in repo settings) pushing will fail
      # Subsequent "on push" Actions are only triggered if this
      # push is done with a Personal Access Token (PAT)
      - name: Push contents of data folder to repo
        uses: github-actions-x/commit@v2.9
        with:
          push-branch: "main"
          commit-message: "Scraper done"
          force-add: "true"
          # Path is relative to the repository root, not `working-directory`.
          files: scraper/data/
          name: autoupdate