# Run scraper and preprocessing #35
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# This is a copy of the workflow file run by GitHub Actions.
#
# Job chain: scrape -> translate_de -> translate_en -> translate_fr
# -> translate_it -> merge. Jobs run on separate runners and hand files to
# each other through uploaded/downloaded artifacts.
#
# Note on paths: `defaults.run.working-directory` only applies to `run:`
# steps. Paths given to `uses:` steps (upload/download-artifact, commit) are
# relative to the repository root, hence the `./scraper/...` prefix there.
name: Run scraper and preprocessing

on:
  schedule:
    # Weekly: every Sunday at 00:00 UTC.
    - cron: "0 0 * * 0"
  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:

jobs:
  scrape:
    # cf. https://docs.github.com/en/github/setting-up-and-managing-billing-and-payments-on-github/about-billing-for-github-actions
    runs-on: ubuntu-latest
    defaults:
      run:
        working-directory: ./scraper
    steps:
      - name: Checkout the repository
        # Pinned to a major version tag instead of the moving `main` branch.
        uses: actions/checkout@v4
      - name: Increase git buffer size
        run: |
          git config --global http.postBuffer 1048576000
      - name: Pull latest changes
        run: |
          git config pull.rebase true
          git pull origin main
      - name: Set up Python 3.9
        uses: actions/setup-python@v4
        with:
          # Quoted so YAML keeps it a string rather than a float.
          python-version: "3.9"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
      - name: Download spacy models
        run: |
          python -m spacy download en_core_web_sm
          python -m spacy download fr_core_news_sm
          python -m spacy download de_core_news_sm
          python -m spacy download it_core_news_sm
      - name: Download nltk stopwords
        # NOTE(review): this fetches every NLTK package, not only the
        # stopwords corpus the step name suggests. Narrow it once the
        # scraper's actual NLTK requirements are confirmed.
        run: |
          python -m nltk.downloader all
      - name: Run Python script
        run: |
          python scraper.py
        env:  # Set the secrets as env var
          CLIENT_SECRET: ${{ secrets.CLIENT_SECRET }}
      - uses: actions/upload-artifact@v4
        with:
          name: de_preprd_data
          path: ./scraper/artifacts/de_preprd_data.pkl  # From artifacts for 2nd stage
      - uses: actions/upload-artifact@v4
        with:
          name: en_preprd_data
          path: ./scraper/artifacts/en_preprd_data.pkl  # From artifacts for 2nd stage
      - uses: actions/upload-artifact@v4
        with:
          name: it_preprd_data
          path: ./scraper/artifacts/it_preprd_data.pkl  # From artifacts for 2nd stage
      - uses: actions/upload-artifact@v4
        with:
          name: fr_preprd_data
          path: ./scraper/artifacts/fr_preprd_data.pkl  # From artifacts for 2nd stage
      - uses: actions/upload-artifact@v4
        with:
          name: preprd_data_csv
          path: ./scraper/artifacts/preprd_data.csv
      - uses: actions/upload-artifact@v4
        with:
          name: data_to_keep
          path: ./scraper/data/data_to_keep.pkl  # From data, for last stage
      - uses: actions/upload-artifact@v4
        with:
          name: data_to_keep_csv
          path: ./scraper/data/data_to_keep.csv
      - name: Push log files to repo
        uses: github-actions-x/commit@v2.9
        with:
          push-branch: "main"
          commit-message: "Push logs"
          force-add: "true"
          files: ./scraper/tools/
          name: autoupdate

  translate_de:
    needs: scrape  # Wait on the scrape job to complete
    runs-on: ubuntu-latest
    defaults:
      run:
        working-directory: ./scraper
    steps:
      - name: Checkout the repository
        # Pinned to a major version tag instead of the moving `main` branch.
        uses: actions/checkout@v4
      - name: Increase git buffer size
        run: |
          git config --global http.postBuffer 1048576000
      - name: Pull latest changes
        run: |
          git config pull.rebase true
          git pull origin main
      - name: Set up Python 3.9
        uses: actions/setup-python@v4
        with:
          python-version: "3.9"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
      # No `name:` input, so every artifact uploaded so far in this run is
      # downloaded; `merge-multiple` places them all directly in `path`.
      - uses: actions/download-artifact@v4
        with:
          path: ./scraper/artifacts/
          merge-multiple: true
      - name: Translate DE
        # The language code is handed to the script via the environment.
        env:
          LANG_FROM_PIPELINE: "de"
        run: |
          python ./workflow/translate_job.py
      - uses: actions/upload-artifact@v4
        with:
          name: de_translated
          path: ./scraper/artifacts/de_translated.pkl
      - uses: actions/upload-artifact@v4
        with:
          name: de_translated_csv
          path: ./scraper/artifacts/de_translated.csv

  translate_en:
    needs: translate_de  # Wait on the translate job to complete
    runs-on: ubuntu-latest
    defaults:
      run:
        working-directory: ./scraper
    steps:
      - name: Checkout the repository
        # Pinned to a major version tag instead of the moving `main` branch.
        uses: actions/checkout@v4
      - name: Increase git buffer size
        run: |
          git config --global http.postBuffer 1048576000
      - name: Pull latest changes
        run: |
          git config pull.rebase true
          git pull origin main
      - name: Set up Python 3.9
        uses: actions/setup-python@v4
        with:
          python-version: "3.9"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
      # No `name:` input, so every artifact uploaded so far in this run is
      # downloaded; `merge-multiple` places them all directly in `path`.
      - uses: actions/download-artifact@v4
        with:
          path: ./scraper/artifacts/
          merge-multiple: true
      - name: Translate EN
        # The language code is handed to the script via the environment.
        env:
          LANG_FROM_PIPELINE: "en"
        run: |
          python ./workflow/translate_job.py
      # Upload only the EN pickle under the same `<lang>_translated` naming
      # used by the DE/FR/IT jobs (previously the whole artifacts directory
      # was uploaded under the name `en_translated.pkl`).
      - uses: actions/upload-artifact@v4
        with:
          name: en_translated
          path: ./scraper/artifacts/en_translated.pkl
      - uses: actions/upload-artifact@v4
        with:
          name: en_translated_csv
          path: ./scraper/artifacts/en_translated.csv

  translate_fr:
    needs: translate_en  # Wait on the translate job to complete
    runs-on: ubuntu-latest
    defaults:
      run:
        working-directory: ./scraper
    steps:
      - name: Checkout the repository
        # Pinned to a major version tag instead of the moving `main` branch.
        uses: actions/checkout@v4
      - name: Increase git buffer size
        run: |
          git config --global http.postBuffer 1048576000
      - name: Pull latest changes
        run: |
          git config pull.rebase true
          git pull origin main
      - name: Set up Python 3.9
        uses: actions/setup-python@v4
        with:
          python-version: "3.9"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
      # No `name:` input, so every artifact uploaded so far in this run is
      # downloaded; `merge-multiple` places them all directly in `path`.
      - uses: actions/download-artifact@v4
        with:
          path: ./scraper/artifacts/
          merge-multiple: true
      - name: Translate FR
        # The language code is handed to the script via the environment.
        env:
          LANG_FROM_PIPELINE: "fr"
        run: |
          python ./workflow/translate_job.py
      - uses: actions/upload-artifact@v4
        with:
          name: fr_translated
          path: ./scraper/artifacts/fr_translated.pkl
      - uses: actions/upload-artifact@v4
        with:
          name: fr_translated_csv
          path: ./scraper/artifacts/fr_translated.csv

  translate_it:
    needs: translate_fr  # Wait on the translate job to complete
    runs-on: ubuntu-latest
    defaults:
      run:
        working-directory: ./scraper
    steps:
      - name: Checkout the repository
        # Pinned to a major version tag instead of the moving `main` branch.
        uses: actions/checkout@v4
      - name: Increase git buffer size
        run: |
          git config --global http.postBuffer 1048576000
      - name: Pull latest changes
        run: |
          git config pull.rebase true
          git pull origin main
      - name: Set up Python 3.9
        uses: actions/setup-python@v4
        with:
          python-version: "3.9"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
      # No `name:` input, so every artifact uploaded so far in this run is
      # downloaded; `merge-multiple` places them all directly in `path`.
      - uses: actions/download-artifact@v4
        with:
          path: ./scraper/artifacts/
          merge-multiple: true
      - name: Translate IT
        # The language code is handed to the script via the environment.
        env:
          LANG_FROM_PIPELINE: "it"
        run: |
          python ./workflow/translate_job.py
      - uses: actions/upload-artifact@v4
        with:
          name: it_translated
          path: ./scraper/artifacts/it_translated.pkl
      - uses: actions/upload-artifact@v4
        with:
          name: it_translated_csv
          path: ./scraper/artifacts/it_translated.csv

  merge:
    needs:  # Disable any languages/translate-jobs you don't need
      - translate_de
      - translate_en
      - translate_it
      - translate_fr
    runs-on: ubuntu-latest
    defaults:
      run:
        working-directory: ./scraper
    steps:
      - name: Checkout the repository
        # Pinned to a major version tag instead of the moving `main` branch.
        uses: actions/checkout@v4
      - name: Increase git buffer size
        run: |
          git config --global http.postBuffer 1048576000
      - name: Pull latest changes
        run: |
          git config pull.rebase true
          git pull origin main
      - name: Set up Python 3.9
        uses: actions/setup-python@v4
        with:
          python-version: "3.9"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
      # All artifacts of this run are downloaded twice: once into artifacts/
      # and once into data/.
      # NOTE(review): presumably so data_to_keep.* ends up in data/ where
      # merge_job.py looks for it - confirm against the script.
      - uses: actions/download-artifact@v4
        with:
          path: ./scraper/artifacts/
          merge-multiple: true
      - uses: actions/download-artifact@v4
        with:
          path: ./scraper/data/
          merge-multiple: true
      - name: Merge Databases
        run: |
          python ./workflow/merge_job.py
      - uses: actions/upload-artifact@v4
        with:
          name: merged database
          path: ./scraper/data/merged_data.pkl
      - uses: actions/upload-artifact@v4
        with:
          name: merged database_csv
          path: ./scraper/data/merged_data.csv
      - name: Push contents of data folder to repo
        uses: github-actions-x/commit@v2.9
        with:
          push-branch: "main"
          commit-message: "Scraper done"
          force-add: "true"
          files: ./scraper/data/
          name: autoupdate