# Run scraper
#
# Run scraper #1

# This is a copy of the workflow file run by GitHub Actions.
name: Run scraper

on:
  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:

jobs:
  # Single job: set up Python and the NLP resources, run the scraper, then
  # commit the resulting files under scraper/tools/ and scraper/data/ to main.
  build:
    # cf. https://docs.github.com/en/github/setting-up-and-managing-billing-and-payments-on-github/about-billing-for-github-actions
    runs-on: ubuntu-latest
    # The default job timeout is 360 minutes (6 hrs), which is not sufficient
    # for NLP preprocessing, so it is raised to 600 minutes here.
    # NOTE(review): GitHub-hosted runners are documented to cap job execution
    # time at 6 hours regardless of this value; confirm against the current
    # usage-limits documentation, or move to a self-hosted runner if needed.
    timeout-minutes: 600
    # The commit steps below push to the repository, so the workflow token
    # needs write access to repository contents (and nothing else).
    permissions:
      contents: write
    defaults:
      run:
        # Applies to `run:` steps only; `uses:` steps still operate relative
        # to the repository root.
        working-directory: ./scraper
    steps:
      - name: Checkout the repository
        # Pinned to a release tag rather than the moving `main` branch so
        # that runs are reproducible and not exposed to unreviewed changes.
        uses: actions/checkout@v4

      - name: Increase git buffer size
        run: |
          git config --global http.postBuffer 1048576000

      - name: Pull latest changes
        run: |
          git config pull.rebase true
          git pull origin main

      - name: Set up Python 3.9
        uses: actions/setup-python@v4
        with:
          # Quoted so the version is always read as a string, never a float.
          python-version: "3.9"

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Download spacy models
        run: |
          python -m spacy download en_core_web_sm
          python -m spacy download fr_core_news_sm
          python -m spacy download de_core_news_sm
          python -m spacy download it_core_news_sm

      # NOTE(review): the step name mentions stopwords only, but the command
      # downloads every NLTK package (`all`). If the scraper needs only
      # stopwords, `python -m nltk.downloader stopwords` would be much
      # faster; verify against scraper.py before narrowing.
      - name: Download nltk stopwords
        run: |
          python -m nltk.downloader all

      - name: Run Python script
        run: |
          python scraper.py
        # Same limit as the job-level timeout, applied to the long-running
        # scrape step.
        timeout-minutes: 600
        # Set the secrets as env var
        env:
          CLIENT_SECRET: ${{ secrets.CLIENT_SECRET }}

      # If main is protected (in repo settings) pushing will fail
      - name: Push log files to repo
        uses: github-actions-x/commit@v2.9
        with:
          push-branch: "main"
          commit-message: "Scraper done"
          force-add: "true"
          files: scraper/tools/
          name: autoupdate

      # If main is protected (in repo settings) pushing will fail
      # Subsequent "on push" Actions are only triggered if this
      # push is done with a Personal Access Token (PAT)
      - name: Push contents of data folder to repo
        uses: github-actions-x/commit@v2.9
        with:
          push-branch: "main"
          commit-message: "Scraper done"
          force-add: "true"
          files: scraper/data/
          name: autoupdate