Liftingthedata committed 1e322c6 on Jun 28, 2023 (1 parent: bb5e5d9).
Showing 8 changed files with 540 additions and 314 deletions.
New file, 35 lines: the upload-and-load bash script (referenced by the DAG below as /git/repo/airflow/dags/scripts/twitter_gcp_script.sh).
#!/bin/bash
# This script performs the following tasks:
#   1. Copies the Parquet Twitter data files to a GCS bucket.
#   2. Loads the data from the Parquet files into BigQuery.
#
# LOCAL_DIR, DATA_BUCKET, and TWITTER_DATASET are expected in the
# environment (the DAG passes them in via the job's env).

echo "$LOCAL_DIR - $DATA_BUCKET"
echo "uploading files to GCP bucket..."
gsutil -m cp "$LOCAL_DIR"/tweets-*.parquet "gs://${DATA_BUCKET}/twitter/"

echo "Loading twitter data"
for file in "$LOCAL_DIR"/tweets-*.parquet; do
    # Extract the table name from the filename.
    table=$(basename "$file" .parquet)
    # Check whether the table already exists in BigQuery.
    exists=$(bq query --use_legacy_sql=false \
        --format=json \
        --max_rows=1 \
        "SELECT COUNT(*) AS table_exists \
         FROM \`$TWITTER_DATASET.INFORMATION_SCHEMA.TABLES\` \
         WHERE table_name = '$table' \
         AND table_type IN ('TABLE', 'BASE TABLE')" \
        | sed -n 's/.*"table_exists":"\([^"]*\)".*/\1/p')
    echo "$exists"
    if [ "$exists" -eq 0 ]; then
        # Load the file into a new BigQuery table named after it.
        bq load --autodetect --source_format=PARQUET "$TWITTER_DATASET.$table" "$file" >/dev/null 2>&1
    else
        echo "Table $table already exists, skipping"
    fi
done
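
The sed parsing of bq's JSON output is fragile. For reference, a minimal Python sketch of the same check-then-load flow using the google-cloud-bigquery client; the function name and the URI argument are illustrative, not from the repo:

from google.cloud import bigquery
from google.cloud.exceptions import NotFound

client = bigquery.Client()

def load_if_missing(dataset: str, table: str, uri: str) -> None:
    """Load a Parquet file into dataset.table unless the table already exists."""
    table_id = f"{client.project}.{dataset}.{table}"
    try:
        client.get_table(table_id)  # raises NotFound when the table is absent
        print(f"Table {table} already exists, skipping")
    except NotFound:
        job = client.load_table_from_uri(
            uri,  # e.g. a gs://.../tweets-YYYY-MM.parquet object
            table_id,
            job_config=bigquery.LoadJobConfig(
                source_format=bigquery.SourceFormat.PARQUET,
                autodetect=True,
            ),
        )
        job.result()  # wait for the load job to finish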
New file, 108 lines: the twitter_scraper DAG definition.
""" | ||
This DAG utilizes the KubernetesJobOperator to execute scripts as Kubernetes jobs. The | ||
primary purpose of these jobs is to perform scraping tasks. | ||
The DAG follows the sequence: | ||
twitter_task >> backfill_first >> metacritic_tg >> vgchartz_tg >> gcp_task | ||
Task 'twitter_task': This task involves scraping tweets from Twitter for the previous | ||
month and performing sentiment analysis on them. | ||
Task 'backfill_first': This task ensures that the Twitter data is backfilled before | ||
scraping other sites that do not require backfilling. | ||
Task group 'metacritic_tg': This task group consists of multiple tasks that scrape data | ||
from Metacritic. It scrapes the data for each game as well as the user and critic reviews. | ||
Task group 'vgchartz_tg': This task group consists of 2 tasks that scrape data | ||
from Vgchartz. | ||
Task 'gcp_task': This final task saves the scraped data to a Google Cloud Storage (GCS) | ||
bucket and subsequently loads it into a BigQuery table. | ||
The DAG is scheduled to run on a cron schedule, specifically on the first day of each | ||
month. The Twitter data is appended during each run, while the other data is replaced with | ||
the latest version. | ||
""" | ||
# pylint: disable=pointless-statement | ||
# pylint: disable=wrong-import-order | ||
|
||
import os | ||
import sys | ||
from datetime import datetime, timedelta | ||
|
||
from airflow import DAG | ||
from airflow.operators.latest_only import LatestOnlyOperator | ||
from airflow_kubernetes_job_operator.kubernetes_job_operator import ( | ||
KubernetesJobOperator, | ||
) | ||
|
||
sys.path.insert(0, os.path.abspath(os.path.dirname(__file__))) | ||
|
||
default_args = { | ||
"owner": "airflow", | ||
"start_date": datetime(2022, 12, 1), | ||
"depends_on_past": False, | ||
"retries": 1, | ||
"retry_delay": timedelta(seconds=60), | ||
"concurrency": 0, | ||
# "max_active_runs": 1, | ||
"in_cluster": True, | ||
"random_name_postfix_length": 3, | ||
"name_prefix": "", | ||
# "max_active_tasks_per_dag": 4, | ||
} | ||
|
||
|
||
today = datetime.today().strftime("%Y-%m-%d") | ||
POD_TEMPALTE = os.path.join(os.path.dirname(__file__), "templates", "pod_template.yaml") | ||
BASE = "/git/repo/scrapers" | ||
GOOGLE_CLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT") | ||
COMMON_VOLUME_CONFIG = { | ||
"name": "persistent-volume", | ||
"type": "persistentVolumeClaim", | ||
"reference": "data-pv-claim", | ||
"mountPath": "/etc/scraped_data/", | ||
} | ||
LOCAL_PATH = "/etc/scraped_data/" | ||
|
||
with DAG( | ||
dag_id="twitter_scraper", | ||
schedule_interval="0 0 1 * *", | ||
default_args=default_args, | ||
catchup=True, | ||
tags=["scraping", "twitter"], | ||
) as dag: | ||
twitter_task = KubernetesJobOperator( | ||
task_id="scrape-tweets", | ||
body_filepath=POD_TEMPALTE, | ||
command=["python", f"{BASE}/twitter/sentiment_analysis.py"], | ||
jinja_job_args={ | ||
"image": f"eu.gcr.io/{GOOGLE_CLOUD_PROJECT}/scraper:latest", | ||
"name": "scrape-tweets", | ||
"gitsync": True, | ||
"volumes": [COMMON_VOLUME_CONFIG], | ||
}, | ||
envs={"start_date": "{{ ds }}", "local_path": LOCAL_PATH, "num_tweets": 10000}, | ||
) | ||
|
||
backfill_first = LatestOnlyOperator(task_id="ensure_backfill_complete") | ||
|
||
gcp_task = KubernetesJobOperator( | ||
task_id="load_to_gcp", | ||
body_filepath=POD_TEMPALTE, | ||
command=["/bin/bash", "/git/repo/airflow/dags/scripts/twitter_gcp_script.sh"], | ||
jinja_job_args={ | ||
"image": "google/cloud-sdk:alpine", | ||
"name": "ingest-and-load-to-bq", | ||
"gitsync": True, | ||
"volumes": [COMMON_VOLUME_CONFIG], | ||
}, | ||
envs={ | ||
"LOCAL_DIR": LOCAL_PATH, | ||
"TWITTER_DATASET": os.getenv("TWITTER_DATASET"), | ||
"DATA_BUCKET": os.getenv("DATA_BUCKET"), | ||
"PROJECT": GOOGLE_CLOUD_PROJECT, | ||
}, | ||
) | ||
twitter_task >> backfill_first >> gcp_task |
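
A quick wiring check for this DAG could look like the sketch below. It assumes a test environment where Airflow can import the DAG folder; the test function name is hypothetical:

from airflow.models import DagBag

def test_twitter_scraper_wiring():
    dag = DagBag(include_examples=False).get_dag("twitter_scraper")
    assert dag is not None
    scrape = dag.get_task("scrape-tweets")
    gate = dag.get_task("ensure_backfill_complete")
    load = dag.get_task("load_to_gcp")
    # Mirrors twitter_task >> backfill_first >> gcp_task
    assert gate.task_id in scrape.downstream_task_ids
    assert load.task_id in gate.downstream_task_ids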
New file, 8 lines: SQL that consolidates the sharded tweet tables into bq_tweets.
CREATE OR REPLACE TABLE `stellarismusv5.twitter_data.bq_tweets`
AS
WITH TweetData AS (
    SELECT *
    FROM `stellarismusv5.twitter_data.tweets-*`
)
SELECT *
FROM TweetData;
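
Each run of the load script creates one shard table per tweets-<date>.parquet file; the tweets-* wildcard unions them all into a single bq_tweets table. A minimal sketch of issuing the same statement from Python, assuming it is saved as bq_tweets.sql (hypothetical filename):

from google.cloud import bigquery

client = bigquery.Client(project="stellarismusv5")
with open("bq_tweets.sql") as f:  # hypothetical filename for the statement above
    client.query(f.read()).result()  # block until bq_tweets is (re)created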
New file, 11 lines: SQL that aggregates Metacritic scores by genre.
"CREATE OR REPLACE TABLE \`${PROJECT}.${METACRITIC_DATASET}.bq_metacritic_genre_data\` AS | ||
WITH GenreData AS ( | ||
SELECT TRIM(genre) AS genre, | ||
AVG(meta_score) AS average_meta_score, | ||
AVG(user_score) AS average_user_score, | ||
COUNT(*) AS game_count | ||
FROM \`${PROJECT}.${METACRITIC_DATASET}.bq_metacritic_gamedata\`, UNNEST(SPLIT(genre, ',')) AS genre | ||
GROUP BY genre | ||
) | ||
SELECT * | ||
FROM GenreData;" |
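
For intuition: UNNEST(SPLIT(genre, ',')) emits one row per comma-separated genre and TRIM strips the stray spaces, so a game tagged "Action, RPG" counts toward both genres. A rough pandas equivalent, with made-up data and column names mirroring the BigQuery table:

import pandas as pd

games = pd.DataFrame({
    "genre": ["Action, RPG", "Action"],
    "meta_score": [90, 80],
    "user_score": [8.5, 7.0],
})
# Split the comma-separated genre string into one row per genre.
exploded = games.assign(genre=games["genre"].str.split(",")).explode("genre")
exploded["genre"] = exploded["genre"].str.strip()
print(
    exploded.groupby("genre").agg(
        average_meta_score=("meta_score", "mean"),
        average_user_score=("user_score", "mean"),
        game_count=("genre", "size"),
    )
)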