# Twitter Crawler

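This notebook was used to develop the Twitter Crawler. It collects tweets (by profile and by toxic keyword), scores them with the Perspective API, anonymizes the text, and uploads the result to S3.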

In [1]:
import sys
from pathlib import Path

# Make the directory two levels above the working directory importable
# (presumably the repository root that holds the `src` package — confirm).
# The membership test must use the very same path that gets appended;
# testing a different directory means the guard never matches and the path
# is appended again every time this cell is re-run.
project_root = str(Path(".").absolute().parent.parent)

if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
from dotenv import load_dotenv

# Load the variables declared in the .env file before any settings are read
load_dotenv()

True

In [3]:
import datetime
import pandas as pd
from tqdm import tqdm
from ast import literal_eval
from kaggle.api.kaggle_api_extended import KaggleApi
from src.logs import setup_logger
from src.s3 import Uploader
from src.anonymize import Anonymizer
from src.settings import AppSettings
from src.perspective import PerspectiveAPI
from src.socials.twitter import TwitterCrawler
from src.utils import (
    read_yaml,
    get_toxic_substrings,
    label_studio_fmt
)

# Notebook-wide logger; its level comes from the LOG_LEVEL application setting.
_logger = setup_logger(AppSettings().LOG_LEVEL)


In [4]:
# Application settings: the cells below read API credentials, limits,
# thresholds and AWS options from this object.
args = AppSettings()
# Crawler parameters from the YAML properties file (path is relative to the
# notebook's working directory); its "twitter" key lists the profiles to crawl.
params = read_yaml("../../properties/application.yaml")

In [5]:
# Twitter API credentials, all taken from the application settings.
twitter_credentials = {
    "consumer_key": args.TWITTER_CONSUMER_KEY,
    "consumer_secret": args.TWITTER_CONSUMER_SECRET,
    "access_token": args.TWITTER_ACCESS_TOKEN,
    "access_token_secret": args.TWITTER_ACCESS_TOKEN_SECRET,
}

# Crawler client used by every collection cell below.
twitter = TwitterCrawler(**twitter_credentials)

## Collect tweets and replies

### by user

In this section, we collect tweets from each profile listed in the application properties, together with the replies to those tweets.

In [6]:
# Profiles to crawl, as listed under the "twitter" key of the YAML parameters.
# Each entry is read below through its "name" and "category" keys.
profiles = params["twitter"]
print(f"Profiles count: {len(profiles)}")

Profiles count: 61


In [36]:
# Accumulator for every crawled tweet and reply.
# NOTE(review): re-running this cell discards everything collected so far.
data = []

In [7]:
# Crawl every configured profile: fetch its tweets, then the replies to each
# tweet, tagging all of them with the profile's category. Everything is
# appended to the shared `data` list. Failures are logged and skipped so that
# one bad profile or tweet does not abort the whole crawl.
_logger.info("Getting tweets by profiles.")

for profile in profiles:
    _logger.info(f"Crawling {profile.get('name')} from Twitter.")

    try:
        tweets = twitter.get_tweets(profile["name"],
                                    max_count=args.TWITTER_MAX_TWEETS)

        _logger.debug({"tweets type": type(tweets),
                       "tweets size": len(tweets)})

        # Add Publisher Category
        for tweet in tweets:
            tweet.publisher_category = profile.get("category")

        # Tweets are stored before any reply is fetched, so they are kept
        # even when the replies lookup fails later on.
        data += tweets

        for tweet in tweets:
            _logger.debug(f"{profile.get('name')} - Getting replies for tweet: {tweet.text}")

            try:
                replies = twitter.get_replies(username=profile["name"],
                                              tweet_id=tweet.id,
                                              max_count=args.TWITTER_MAX_TWEETS)

                # Report the replies that were just fetched (the previous
                # version logged the parent tweets' type and size here).
                _logger.debug({"replies type": type(replies),
                               "replies size": len(replies)})

                # Add Publisher Category
                for reply in replies:
                    reply.publisher_category = profile.get("category")

                data += replies

            except Exception as e:
                # Best effort: move on to the next tweet.
                _logger.error(f"Error getting replies from Twitter: {tweet} - Exception: {e}")

    except Exception as e:
        # Best effort: move on to the next profile.
        _logger.error(f"Error getting tweets from Twitter: {profile.get('name')} - Exception: {e}")

_logger.info(f"Twitter crawler finished. {len(data)} tweets found.")


2022-03-26 20:29:50,291 :: INFO :: 2413634140 :: <module> :: Crawling jairbolsonaro from Twitter.
2022-03-26 20:29:54,710 :: DEBUG :: 2413634140 :: <module> :: {'tweets type': <class 'list'>, 'tweets size': 500}
2022-03-26 20:29:54,710 :: DEBUG :: 2413634140 :: <module> :: jairbolsonaro - Getting replies for tweet: @rogeriosmarinho 🇧🇷, Ministro!
2022-03-26 20:30:01,118 :: DEBUG :: 2413634140 :: <module> :: {'replies type': <class 'list'>, 'replies size': 500}
2022-03-26 20:30:01,119 :: DEBUG :: 2413634140 :: <module> :: jairbolsonaro - Getting replies for tweet: - Elas possibilitam a prática de 13 modalidades olímpicas e seis paralímpicas, estimulando principalmente os jovens, oferecendo-lhes novos objetivos, além de possibilitar que não caiam em mundos errados e sem futuro.

- Detalhes: https://t.co/eVxKAQhS9M


### by keyword

In [7]:
def _parse_toxic_spans(raw):
    """Evaluate the textual span list of one row; missing values pass through unchanged."""
    return literal_eval(raw) if pd.notnull(raw) else raw


print("Downloading data from Kaggle")

kaggle = KaggleApi()
kaggle.authenticate()
kaggle.dataset_download_file(dataset="olidbr", file_name="olidbr.csv")

# The downloaded CSV is only needed until it has been loaded into memory.
olidbr_csv = Path("olidbr.csv")
olidbr = pd.read_csv(olidbr_csv)

# `toxic_spans` is stored as text in the CSV; turn it back into Python lists.
olidbr.toxic_spans = olidbr.toxic_spans.apply(_parse_toxic_spans)

olidbr_csv.unlink()

print(f"Shape: {olidbr.shape}")
olidbr.head()

Downloading data from Kaggle
Shape: (706, 17)


Unnamed: 0,id,text,is_offensive,is_targeted,targeted_type,toxic_spans,health,ideology,insult,lgbtqphobia,other_lifestyle,physical_aspects,profanity_obscene,racism,religious_intolerance,sexism,xenophobia
0,6391e3dbb12848ac8ef6131edf2d69f9,USER Canalha URL,OFF,TIN,IND,"[5, 6, 7, 8, 9, 10, 11, 12]",False,False,True,False,False,False,False,False,False,False,False
1,5a21508c2afc4c3295137451baf8b981,USER VTNSC FDP,OFF,TIN,IND,"[5, 6, 7, 8, 9, 10, 11, 12, 13]",False,False,True,False,False,False,True,False,False,False,False
2,060b0390f99041cd8cd38ca5a2b66907,USER O partido do Kim votou à favor do fundão....,OFF,TIN,IND,,False,False,True,False,False,False,False,False,False,False,False
3,3264fc0b06974c2daf423a9e5998d003,USER Grande homem. Em um país de covardes. Ete...,OFF,TIN,GRP,"[34, 35, 36, 37, 38, 39, 40, 41]",False,False,True,False,False,False,False,False,False,False,False
4,002bced1b7444833bbe5e054c5aa6512,USER USER QAnon Brasil é um bicho estranho. Ob...,OFF,TIN,IND,"[28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 3...",False,False,True,False,False,False,True,False,False,False,False


In [16]:
# Collect the toxic substrings of every OLID-BR row that has a non-empty
# list of toxic spans, with surrounding whitespace removed.
keywords = []

for record in olidbr.to_dict(orient="records"):
    spans = record.get("toxic_spans")
    if not isinstance(spans, list) or not spans:
        continue
    keywords += [
        substring.strip()
        for substring in get_toxic_substrings(record["text"], spans)
    ]

In [17]:
# Get toxic keywords from GitHub
## https://github.com/JAugusto97/ToLD-Br/blob/main/experiments/toxic_keywords.txt
import requests

TOLD_KEYWORDS_URL = "https://raw.githubusercontent.com/JAugusto97/ToLD-Br/main/experiments/toxic_keywords.txt"

# Bound the request time and fail loudly on HTTP errors; without this an
# error page would be parsed as if it were the keyword list, and a stalled
# connection would block the notebook forever.
told_response = requests.get(TOLD_KEYWORDS_URL, timeout=30)
told_response.raise_for_status()
told_kwds = told_response.text.split("\n")

# Set mirror of `keywords` for constant-time duplicate checks.
known_keywords = set(keywords)

for line in told_kwds:
    # Lines are expected to look like "<category>:<word>,<word>,...".
    # Anything without a ':' (blank or malformed lines) is skipped instead of
    # raising IndexError.
    parts = line.split(":")
    if len(parts) < 2:
        continue
    for word in parts[1].split(","):
        word = word.strip()
        # Empty tokens (e.g. from a trailing comma) are not keywords.
        if word and word not in known_keywords:
            keywords.append(word)
            known_keywords.add(word)

told_kwds

['homofobico:viado,veado,viadinho,veadinho,viadao,veadao,bicha,bixa,bichinha,bixinha,bichona,bixona,baitola,sapatão,sapatao,traveco,bambi,biba,boiola,marica,gayzão,gayzao,flor,florzinha',
 'obsceno:vagabundo,vagaba,desgraçada,desgraçado,desgracado,arrombado,arrombada,foder,fuder,fudido,fodido,cú,cu,pinto,pau,pal,caralho,caraio,carai,pica,cacete,rola,porra,escroto,buceta,fdp,pqp,vsf,tnc,vtnc,puto,putinho',
 'insulto:acéfalo,acefalo,burro,idiota,trouxa,estúpido,estupido,estúpida,canalha,demente,retardado,retardada,verme,maldito,maldita,ridículo,ridiculo,ridícula,ridicula,morfético,morfetico,morfética,morfetica,lazarento,lazarenta,lixo,mongolóide,mongoloide,mongol,asqueroso,asquerosa,cretino,cretina,babaca,pilantra',
 'racismo:neguinho,neguinha,pretinho,pretinha,escurinho,escurinha,pretinha,pretinho,crioulo,criolo,crioula,criola,macaco,macaca,gorila',
 'misogino:puta,vagabunda,vagaba,mulherzinha,piranha,feminazi,putinha,piriguete,vaca,putinha',
 'xenofobia:bahiano,baiano,baianagem,xinglin

In [19]:
# Size of the merged keyword list (OLID-BR toxic substrings plus ToLD-Br keywords)
print(f"Toxic keywords: {len(keywords)}")

Toxic keywords: 471


In [20]:
# Tweets found through keyword search.
data_kwd = []
# Keywords that were already crawled; the crawl cell skips anything listed
# here, so keeping this state in its own cell lets the crawl be re-run
# without repeating finished keywords (presumably to allow resuming — confirm).
collected_keywords = []

In [30]:
def contains_alpha_chars(s):
    """Tell whether ``s`` holds at least one alphabetic character."""
    for char in s:
        if char.isalpha():
            return True
    return False
    
# Search Twitter for every keyword that has not been crawled yet and append
# the results to `data_kwd`. A keyword is recorded in `collected_keywords`
# only after its tweets were stored, so failed keywords are retried the next
# time this cell runs.
_logger.info("Crawling tweets from keywords.")

with tqdm(total=len(keywords)) as pbar:
    for keyword in keywords:
        # Skip keywords already crawled and tokens without any letter.
        if keyword not in collected_keywords and contains_alpha_chars(keyword):
            pbar.set_description(f"Crawling {keyword}")
            try:
                keyword_tweets = twitter.get_tweets_by_keyword(
                    keyword=keyword,
                    max_count=args.TWITTER_MAX_TWEETS)
                data_kwd += keyword_tweets
                collected_keywords.append(keyword)
            except Exception as e:
                # Still best effort, but the failure is now visible in the
                # log, and KeyboardInterrupt is no longer swallowed (a bare
                # `except:` made the loop impossible to interrupt).
                _logger.error(f"Error getting tweets from Twitter: {keyword} - Exception: {e}")
        pbar.update(1)

_logger.info(f"{len(data_kwd)} tweets found.")

2022-03-27 10:29:15,012 :: INFO :: 3671207448 :: <module> :: Crawling tweets from keywords.
100%|██████████| 471/471 [00:00<00:00, 231759.41it/s]
2022-03-27 10:29:15,019 :: INFO :: 3671207448 :: <module> :: 155968 tweets found.


## Perspective API

In [12]:
# Merge the keyword-search tweets into the main collection.
# NOTE(review): `+=` extends the `data` list in place, so running this cell
# twice appends `data_kwd` twice — run it once per crawl.
data += data_kwd

_logger.info(f"Total tweets: {len(data)}")

2022-03-28 11:33:41,591 :: INFO :: 859693945 :: <module> :: Total tweets: 155968


In [9]:
# Score every collected item with the Perspective API, flag it as toxic when
# the score is above the configured threshold, report the toxic rate and
# optionally keep only the toxic items.
if not isinstance(args.PERSPECTIVE_API_KEY, str):
    # Reported at error level, like the missing AWS credentials in the upload cell.
    _logger.error("Perspective API key not set.")
    raise AttributeError("Perspective API key not set.")

_logger.info("Starting toxicity prediction.")

perspective = PerspectiveAPI(apikey=args.PERSPECTIVE_API_KEY)

with tqdm(total=len(data)) as pbar:
    pbar.set_description("Toxicity prediction")
    for item in data:
        # Items that already carry a score are not sent again, which makes
        # re-running this cell cheap.
        if item.toxicity_score is None:
            response = perspective.predict(text=item.text)
            score = response.get("TOXICITY")
            # Anything that is not a float is ignored and the item stays unscored.
            if isinstance(score, float):
                item.toxicity_score = score
                item.is_toxic = score > args.PERSPECTIVE_THRESHOLD
        pbar.update(1)

# Guard the division: with nothing collected, len(data) is 0 and the rate
# computation used to raise ZeroDivisionError.
toxic_count = sum(1 for scored in data if scored.is_toxic)
toxic_rate = toxic_count / len(data) if data else 0.0

_logger.info(f"Toxic comments rate: {toxic_rate:.4f}")

if args.FILTER_TOXIC_COMMENTS:
    data = [item for item in data if item.is_toxic]

_logger.info(f"Total comments: {len(data)}")

2022-03-28 11:32:20,598 :: INFO :: 3371012580 :: <module> :: Starting toxicity prediction.
2022-03-28 11:32:20,599 :: INFO :: 3371012580 :: <module> :: Toxic comments rate: 0.7429
2022-03-28 11:32:20,600 :: INFO :: 3371012580 :: <module> :: Total comments: 115871


## Anonymization

In [10]:
_logger.info("Starting anonymization.")

anonymizer = Anonymizer()

# Replace each item's text, in place, with the output of `apply_all`;
# the original text is not kept anywhere in `data`.
for item in data:
    item.text = anonymizer.apply_all(item.text)

_logger.info("Anonymization finished.")

2022-03-28 11:33:05,638 :: INFO :: 495907151 :: <module> :: Starting anonymization.
2022-03-28 11:33:05,639 :: INFO :: 495907151 :: <module> :: Anonymization finished.


## Upload to S3

In [11]:
_logger.info("Uploading data to S3.")

# Object key: the current local date followed by the .json extension.
date_stamp = datetime.datetime.now().strftime("%Y-%m-%d")
key = f"{date_stamp}.json"

# Convert our data to dicts in Label Studio format
data = list(map(label_studio_fmt, data))

uploader = Uploader(bucket=args.AWS_S3_BUCKET,
                    bucket_prefix=args.AWS_S3_BUCKET_PREFIX)

# Pick the credential source: a role ARN wins over an access-key pair.
use_role = isinstance(args.AWS_ROLE_ARN, str)
use_keys = (
    not use_role
    and isinstance(args.AWS_ACCESS_KEY_ID, str)
    and isinstance(args.AWS_SECRET_ACCESS_KEY, str)
)

if not (use_role or use_keys):
    _logger.error("AWS credentials not set.")
    raise AttributeError("AWS credentials not set.")

if use_role:
    uploader.upload_sts(role_arn=args.AWS_ROLE_ARN,
                        session_name="ToxicityDetectionCrawler",
                        key=key, data=data)
else:
    uploader.upload_aksk(access_key=args.AWS_ACCESS_KEY_ID,
                         secret_key=args.AWS_SECRET_ACCESS_KEY,
                         key=key, data=data)

_logger.info("Data uploaded to S3.")

2022-03-28 11:33:14,789 :: INFO :: 3455279744 :: <module> :: Uploading data to S3.
2022-03-28 11:33:14,791 :: INFO :: 3455279744 :: <module> :: Data uploaded to S3.
