Skip to content

Commit

Permalink
cb62672109
Browse files Browse the repository at this point in the history
  • Loading branch information
Liftingthedata committed Jun 29, 2023
1 parent ba8278b commit 6878878
Show file tree
Hide file tree
Showing 9 changed files with 75 additions and 30 deletions.
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,6 @@ style:

git:
@git add . && git commit -m "$$(openssl rand -hex 5)" && git push -u origin main

xd:
@cat manifests/airflow.yaml | envsubst | kubectl apply -f -
4 changes: 2 additions & 2 deletions airflow/dags/scrape_dag.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@
"owner": "airflow",
"start_date": datetime(2022, 12, 1),
"depends_on_past": False,
"retries": 1,
"retry_delay": timedelta(seconds=60),
"retries": 3,
"retry_delay": timedelta(minutes=30),
"concurrency": 2,
"max_active_runs": 1,
"in_cluster": True,
Expand Down
2 changes: 1 addition & 1 deletion airflow/dags/tweet_scrape_dag.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@
"gitsync": True,
"volumes": [COMMON_VOLUME_CONFIG],
},
envs={"start_date": "{{ ds }}", "local_path": LOCAL_PATH, "num_tweets": 10000},
envs={"start_date": "{{ ds }}", "local_path": LOCAL_PATH, "num_tweets": 20000},
)

backfill_first = LatestOnlyOperator(task_id="ensure_backfill_complete")
Expand Down
1 change: 1 addition & 0 deletions manifests/airflow.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ apiVersion: apps/v1
metadata:
name: airflow
spec:
replicas: 2
selector:
matchLabels:
app: airflow
Expand Down
3 changes: 2 additions & 1 deletion readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -154,4 +154,5 @@ At the moment there are 506 games in gamepass ultimate, however the table we cre


sentiment analysis per game
data analysis
data analysis
nlp
5 changes: 4 additions & 1 deletion scrapers/twitter/sentiment_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,9 @@ def scrape_tweets(
)
tweets_list = []
logger.info(f"processing tweets from {since_date} until {until_date}.")
for i, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
for i, tweet in enumerate(
sntwitter.TwitterSearchScraper(query, maxEmptyPages=100).get_items()
):
if i >= num_tweets:
break
tweet_dict = {
Expand Down Expand Up @@ -314,6 +316,7 @@ def main(
"shopify",
"playstation",
"ps5",
"ps4",
"nintendo",
"nintendoswitch",
]
Expand Down
28 changes: 25 additions & 3 deletions sql/all_tweets.sql
Original file line number Diff line number Diff line change
@@ -1,8 +1,30 @@
CREATE OR REPLACE TABLE \`stellarismusv5.twitter_data.bq_tweets\`
{# 1 Table for all tweets #}
CREATE OR REPLACE TABLE `stellarismusv5.twitter_data.bq_tweets`
AS
WITH TweetData AS (
SELECT *
FROM \`stellarismusv5.twitter_data.tweets-*\`
FROM `stellarismusv5.twitter_data.tweets-*`
)
SELECT *
FROM TweetData;
FROM TweetData;

{# Hash tag analysis: one row per hashtag with the number of times it occurs
   across all tweets. The statement is terminated with a semicolon so the
   next CREATE statement in this script parses as a separate statement. #}
CREATE OR REPLACE TABLE `stellarismusv5.twitter_data.top_hashtags` AS
SELECT
    h.item AS hashtag,
    COUNT(*) AS frequency
FROM `stellarismusv5.twitter_data.bq_tweets`, UNNEST(Extra_Hashtags.list) AS h
GROUP BY hashtag
ORDER BY frequency DESC;


{# User Influence Analysis #}
CREATE OR REPLACE TABLE `stellarismusv5.twitter_data.user_analysis` AS
SELECT
Username,
COUNT(*) AS tweet_count,
SUM(Likes) AS total_likes,
SUM(Retweets) AS total_retweets,
SUM(Followers) AS total_followers
FROM `stellarismusv5.twitter_data.bq_tweets`
GROUP BY Username;

37 changes: 37 additions & 0 deletions sql/tweets.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
-- Hashtag frequency: flattens the Extra_Hashtags.list array of every tweet
-- and counts how often each hashtag occurs.
-- The trailing semicolon is required: another statement follows in this file.
CREATE OR REPLACE TABLE `stellarismusv5.twitter_data.top_hashtags` AS
SELECT
    h.item AS hashtag,
    COUNT(*) AS frequency
FROM `stellarismusv5.twitter_data.bq_tweets`, UNNEST(Extra_Hashtags.list) AS h
GROUP BY hashtag
ORDER BY frequency DESC;

-- Per-user aggregates: tweet count plus summed likes, retweets and followers.
-- The source must be the raw tweets table (bq_tweets); reading from
-- user_analysis itself would query the very table being replaced, which only
-- holds the aggregated columns and not Likes / Retweets / Followers.
-- NOTE(review): the other user_analysis statements group by Username;
-- confirm that bq_tweets exposes a User_Id column.
CREATE OR REPLACE TABLE `stellarismusv5.twitter_data.user_analysis` AS
SELECT
    User_Id,
    COUNT(*) AS tweet_count,
    SUM(Likes) AS total_likes,
    SUM(Retweets) AS total_retweets,
    SUM(Followers) AS total_followers
FROM `stellarismusv5.twitter_data.bq_tweets`
GROUP BY User_Id;


# Runs both CREATE TABLE statements as one script through the bq CLI.
# The script sits inside a double-quoted shell string, so every backtick is
# escaped; a bare backtick there would start shell command substitution.
# Statements are separated by exactly one semicolon (no empty statement).
bq query --nouse_legacy_sql \
"CREATE OR REPLACE TABLE \`stellarismusv5.twitter_data.top_hashtags\` AS
SELECT
    h.item AS hashtag,
    COUNT(*) AS frequency
FROM \`stellarismusv5.twitter_data.bq_tweets\`, UNNEST(Extra_Hashtags.list) AS h
GROUP BY hashtag
ORDER BY frequency DESC;
CREATE OR REPLACE TABLE \`stellarismusv5.twitter_data.user_analysis\` AS
SELECT
    Username,
    COUNT(*) AS tweet_count,
    SUM(Likes) AS total_likes,
    SUM(Retweets) AS total_retweets,
    SUM(Followers) AS total_followers
FROM \`stellarismusv5.twitter_data.bq_tweets\`
GROUP BY Username;
"
22 changes: 0 additions & 22 deletions xd.json

This file was deleted.

0 comments on commit 6878878

Please sign in to comment.