Skip to content

Run scraper and preprocessing #35

Run scraper and preprocessing

Run scraper and preprocessing #35

Workflow file for this run

# This is a copy of the workflow file run by GitHub Actions.
name: Run scraper and preprocessing
on:
  schedule:
    # Weekly run: minute 0, hour 0, every Sunday (GitHub evaluates cron in UTC).
    - cron: "0 0 * * 0"
  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:
jobs:
scrape:
  # First stage: runs scraper.py and publishes its outputs as artifacts for
  # the translate and merge jobs, then commits ./scraper/tools/ back to main.
  # cf. https://docs.github.com/en/github/setting-up-and-managing-billing-and-payments-on-github/about-billing-for-github-actions
  runs-on: ubuntu-latest
  defaults:
    run:
      # Applies to `run:` steps only; `uses:` steps still resolve paths from
      # the workspace root, hence the ./scraper/ prefix in the upload paths.
      working-directory: ./scraper
  steps:
    - name: Checkout the repository
      # Pinned to a release tag instead of the moving `main` branch so the
      # workflow does not silently pick up breaking changes.
      uses: actions/checkout@v4
    - name: Increase git buffer size
      run: |
        git config --global http.postBuffer 1048576000
    - name: Pull latest changes
      run: |
        git config pull.rebase true
        git pull origin main
    - name: Set up Python 3.9
      uses: actions/setup-python@v4
      with:
        # Quoted so the version is always a string, never a YAML float.
        python-version: "3.9"
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install -r requirements.txt
    - name: Download spacy models
      run: |
        python -m spacy download en_core_web_sm
        python -m spacy download fr_core_news_sm
        python -m spacy download de_core_news_sm
        python -m spacy download it_core_news_sm
    - name: Download nltk stopwords
      # NOTE(review): the step name says "stopwords" but `all` fetches every
      # NLTK package; confirm what scraper.py needs before narrowing this.
      run: |
        python -m nltk.downloader all
    - name: Run Python script
      run: |
        python scraper.py
      env:  # Set the secrets as env var
        CLIENT_SECRET: ${{ secrets.CLIENT_SECRET }}
    # Per-language preprocessed data, consumed by the translate jobs.
    - uses: actions/upload-artifact@v4
      with:
        name: de_preprd_data
        path: ./scraper/artifacts/de_preprd_data.pkl  # From artifacts for 2nd stage
    - uses: actions/upload-artifact@v4
      with:
        name: en_preprd_data
        path: ./scraper/artifacts/en_preprd_data.pkl  # From artifacts for 2nd stage
    - uses: actions/upload-artifact@v4
      with:
        name: it_preprd_data
        path: ./scraper/artifacts/it_preprd_data.pkl  # From artifacts for 2nd stage
    - uses: actions/upload-artifact@v4
      with:
        name: fr_preprd_data
        path: ./scraper/artifacts/fr_preprd_data.pkl  # From artifacts for 2nd stage
    - uses: actions/upload-artifact@v4
      with:
        name: preprd_data_csv
        path: ./scraper/artifacts/preprd_data.csv
    - uses: actions/upload-artifact@v4
      with:
        name: data_to_keep
        path: ./scraper/data/data_to_keep.pkl  # From data, for last stage
    - uses: actions/upload-artifact@v4
      with:
        name: data_to_keep_csv
        path: ./scraper/data/data_to_keep.csv
    - name: Push log files to repo
      uses: github-actions-x/commit@v2.9
      with:
        push-branch: "main"
        commit-message: "Push logs"
        force-add: "true"
        files: ./scraper/tools/
        name: autoupdate
translate_de:
  # Second stage (German): downloads the artifacts of this run, runs
  # translate_job.py with LANG_FROM_PIPELINE=de and uploads the results.
  needs: scrape  # Wait on the scrape job to complete
  runs-on: ubuntu-latest
  defaults:
    run:
      # Applies to `run:` steps only; `uses:` steps resolve paths from the
      # workspace root, hence the ./scraper/ prefix in artifact paths.
      working-directory: ./scraper
  steps:
    - name: Checkout the repository
      # Pinned to a release tag instead of the moving `main` branch.
      uses: actions/checkout@v4
    - name: Increase git buffer size
      run: |
        git config --global http.postBuffer 1048576000
    - name: Pull latest changes
      run: |
        git config pull.rebase true
        git pull origin main
    - name: Set up Python 3.9
      uses: actions/setup-python@v4
      with:
        # Quoted so the version is always a string, never a YAML float.
        python-version: "3.9"
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install -r requirements.txt
    # No artifact name is given, so every artifact uploaded so far in this
    # run is fetched; merge-multiple flattens them into one directory.
    - uses: actions/download-artifact@v4
      with:
        path: ./scraper/artifacts/
        merge-multiple: true
    - name: Translate DE
      # The language code is handed to the script through the environment.
      env:
        LANG_FROM_PIPELINE: "de"
      run: |
        python ./workflow/translate_job.py
    - uses: actions/upload-artifact@v4
      with:
        name: de_translated
        path: ./scraper/artifacts/de_translated.pkl
    - uses: actions/upload-artifact@v4
      with:
        name: de_translated_csv
        path: ./scraper/artifacts/de_translated.csv
translate_en:
  # Second stage (English): downloads the artifacts of this run, runs
  # translate_job.py with LANG_FROM_PIPELINE=en and uploads the results.
  needs: translate_de  # Wait on the previous translate job to complete
  runs-on: ubuntu-latest
  defaults:
    run:
      # Applies to `run:` steps only; `uses:` steps resolve paths from the
      # workspace root, hence the ./scraper/ prefix in artifact paths.
      working-directory: ./scraper
  steps:
    - name: Checkout the repository
      # Pinned to a release tag instead of the moving `main` branch.
      uses: actions/checkout@v4
    - name: Increase git buffer size
      run: |
        git config --global http.postBuffer 1048576000
    - name: Pull latest changes
      run: |
        git config pull.rebase true
        git pull origin main
    - name: Set up Python 3.9
      uses: actions/setup-python@v4
      with:
        # Quoted so the version is always a string, never a YAML float.
        python-version: "3.9"
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install -r requirements.txt
    # No artifact name is given, so every artifact uploaded so far in this
    # run is fetched; merge-multiple flattens them into one directory.
    - uses: actions/download-artifact@v4
      with:
        path: ./scraper/artifacts/
        merge-multiple: true
    - name: Translate EN
      # The language code is handed to the script through the environment.
      env:
        LANG_FROM_PIPELINE: "en"
      run: |
        python ./workflow/translate_job.py
    # Upload only the translated pickle under a name matching the other
    # translate jobs; previously this step re-uploaded the whole artifacts
    # directory under the name "en_translated.pkl".
    - uses: actions/upload-artifact@v4
      with:
        name: en_translated
        path: ./scraper/artifacts/en_translated.pkl
    - uses: actions/upload-artifact@v4
      with:
        name: en_translated_csv
        path: ./scraper/artifacts/en_translated.csv
translate_fr:
  # Second stage (French): downloads the artifacts of this run, runs
  # translate_job.py with LANG_FROM_PIPELINE=fr and uploads the results.
  needs: translate_en  # Wait on the previous translate job to complete
  runs-on: ubuntu-latest
  defaults:
    run:
      # Applies to `run:` steps only; `uses:` steps resolve paths from the
      # workspace root, hence the ./scraper/ prefix in artifact paths.
      working-directory: ./scraper
  steps:
    - name: Checkout the repository
      # Pinned to a release tag instead of the moving `main` branch.
      uses: actions/checkout@v4
    - name: Increase git buffer size
      run: |
        git config --global http.postBuffer 1048576000
    - name: Pull latest changes
      run: |
        git config pull.rebase true
        git pull origin main
    - name: Set up Python 3.9
      uses: actions/setup-python@v4
      with:
        # Quoted so the version is always a string, never a YAML float.
        python-version: "3.9"
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install -r requirements.txt
    # No artifact name is given, so every artifact uploaded so far in this
    # run is fetched; merge-multiple flattens them into one directory.
    - uses: actions/download-artifact@v4
      with:
        path: ./scraper/artifacts/
        merge-multiple: true
    - name: Translate FR
      # The language code is handed to the script through the environment.
      env:
        LANG_FROM_PIPELINE: "fr"
      run: |
        python ./workflow/translate_job.py
    - uses: actions/upload-artifact@v4
      with:
        name: fr_translated
        path: ./scraper/artifacts/fr_translated.pkl
    - uses: actions/upload-artifact@v4
      with:
        name: fr_translated_csv
        path: ./scraper/artifacts/fr_translated.csv
translate_it:
  # Second stage (Italian): downloads the artifacts of this run, runs
  # translate_job.py with LANG_FROM_PIPELINE=it and uploads the results.
  needs: translate_fr  # Wait on the previous translate job to complete
  runs-on: ubuntu-latest
  defaults:
    run:
      # Applies to `run:` steps only; `uses:` steps resolve paths from the
      # workspace root, hence the ./scraper/ prefix in artifact paths.
      working-directory: ./scraper
  steps:
    - name: Checkout the repository
      # Pinned to a release tag instead of the moving `main` branch.
      uses: actions/checkout@v4
    - name: Increase git buffer size
      run: |
        git config --global http.postBuffer 1048576000
    - name: Pull latest changes
      run: |
        git config pull.rebase true
        git pull origin main
    - name: Set up Python 3.9
      uses: actions/setup-python@v4
      with:
        # Quoted so the version is always a string, never a YAML float.
        python-version: "3.9"
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install -r requirements.txt
    # No artifact name is given, so every artifact uploaded so far in this
    # run is fetched; merge-multiple flattens them into one directory.
    - uses: actions/download-artifact@v4
      with:
        path: ./scraper/artifacts/
        merge-multiple: true
    - name: Translate IT
      # The language code is handed to the script through the environment.
      env:
        LANG_FROM_PIPELINE: "it"
      run: |
        python ./workflow/translate_job.py
    - uses: actions/upload-artifact@v4
      with:
        name: it_translated
        path: ./scraper/artifacts/it_translated.pkl
    - uses: actions/upload-artifact@v4
      with:
        name: it_translated_csv
        path: ./scraper/artifacts/it_translated.csv
merge:
  # Final stage: downloads the artifacts of this run, runs merge_job.py,
  # uploads the merged database and commits ./scraper/data/ back to main.
  needs:  # Disable any languages/translate-jobs you don't need
    - translate_de
    - translate_en
    - translate_it
    - translate_fr
  runs-on: ubuntu-latest
  defaults:
    run:
      # Applies to `run:` steps only; `uses:` steps resolve paths from the
      # workspace root, hence the ./scraper/ prefix in artifact paths.
      working-directory: ./scraper
  steps:
    - name: Checkout the repository
      # Pinned to a release tag instead of the moving `main` branch.
      uses: actions/checkout@v4
    - name: Increase git buffer size
      run: |
        git config --global http.postBuffer 1048576000
    - name: Pull latest changes
      run: |
        git config pull.rebase true
        git pull origin main
    - name: Set up Python 3.9
      uses: actions/setup-python@v4
      with:
        # Quoted so the version is always a string, never a YAML float.
        python-version: "3.9"
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install -r requirements.txt
    # All artifacts of this run, flattened into the artifacts directory.
    - uses: actions/download-artifact@v4
      with:
        path: ./scraper/artifacts/
        merge-multiple: true
    # NOTE(review): no name/pattern filter is set, so every artifact is also
    # extracted into data/, which the last step commits; confirm this is
    # intended rather than only the data_to_keep files.
    - uses: actions/download-artifact@v4
      with:
        path: ./scraper/data/
        merge-multiple: true
    - name: Merge Databases
      run: |
        python ./workflow/merge_job.py
    - uses: actions/upload-artifact@v4
      with:
        name: merged database
        path: ./scraper/data/merged_data.pkl
    - uses: actions/upload-artifact@v4
      with:
        name: merged database_csv
        path: ./scraper/data/merged_data.csv
    - name: Push contents of data folder to repo
      uses: github-actions-x/commit@v2.9
      with:
        push-branch: "main"
        commit-message: "Scraper done"
        force-add: "true"
        files: ./scraper/data/
        name: autoupdate