# YouTube Crawler

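In this notebook, we collect comments from YouTube videos, score them with the Perspective API, keep the toxic ones, anonymize them, and upload the result to S3 in Label Studio format.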

In [1]:
import sys
from pathlib import Path

# Make the directory two levels above the working directory importable, so the
# `src.*` imports below can resolve (assumes the notebook lives two levels
# below the project root, as the "../../properties" path used later suggests).
# The membership test checks the same path that is appended, so re-running
# this cell never adds a duplicate entry to sys.path.
project_root = str(Path(".").absolute().parent.parent)
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
from dotenv import load_dotenv

# Load the .env file before any settings object is created.
# NOTE(review): assumes AppSettings reads its values from the environment — confirm.
load_dotenv()

True

In [3]:
import datetime
from tqdm import tqdm
from typing import List
from src.logs import setup_logger
from src.s3 import Uploader
from src.anonymize import Anonymizer
from src.checker import CommentChecker
from src.data_models import RawText
from src.settings import AppSettings
from src.perspective import PerspectiveAPI
from src.socials.youtube import YouTubeCrawler
from src.utils import (
    read_yaml,
    label_studio_fmt
)

# Notebook-wide logger, configured with the log level taken from the application settings.
_logger = setup_logger(AppSettings().LOG_LEVEL)

In [4]:
def calc_toxic_rate(data):
    """Print the number of toxic / non-toxic comments and the toxic rate.

    Only items that carry a toxicity score are counted. A score of 0.0 is a
    valid score, so the check is `is not None` rather than truthiness.

    Args:
    - data: iterable of objects exposing `is_toxic` and `toxicity_score`

    Returns:
    - None: the figures are printed, not returned
    """
    scored = [i for i in data if i.toxicity_score is not None]
    toxic_qty = sum(1 for i in scored if i.is_toxic)
    non_toxic_qty = sum(1 for i in scored if i.is_toxic is False)
    total_qty = toxic_qty + non_toxic_qty

    print(f"Toxic comments: {toxic_qty}")
    print(f"Non-toxic comments: {non_toxic_qty}")
    if total_qty:
        print(f"Toxic comments rate: {toxic_qty / total_qty:.4f}")
    else:
        # Nothing has been scored yet: avoid a ZeroDivisionError.
        print("Toxic comments rate: n/a (no scored comments)")

def remove_duplicates(data: List[RawText]) -> List[RawText]:
    """Remove RawText objects whose text was already seen.

    The first occurrence of each text is kept and the input order is
    preserved. Seen texts are tracked in a set, so the pass is linear in the
    number of items instead of quadratic.

    Args:
    - data: list of RawText objects

    Returns:
    - data: list of RawText objects without duplicates
    """
    new_data = []
    seen_texts = set()
    for raw in data:
        if raw.text not in seen_texts:
            seen_texts.add(raw.text)
            new_data.append(raw)
    return new_data

In [5]:
# Application settings used by every step below (API keys, thresholds, S3 target).
args = AppSettings()

# Override the per-video comment setting for this run; the value is passed as
# `max_comments` to the crawler when comments are collected.
args.YOUTUBE_MAX_COMMENTS_PER_VIDEO = 50000

# Crawler parameters, read from a path relative to the notebook's working directory.
params = read_yaml("../../properties/application.yaml")

In [6]:
# The "youtube" entry lists the videos to crawl; each one is later read
# through its "video" and "category" keys.
videos = params["youtube"]
_logger.info(f"{len(videos)} videos to process.")

2022-04-10 08:52:04,537 :: INFO :: 2285299911 :: <module> :: 22 videos to process.


## Collecting comments by video ID

In [7]:
# Crawler client built with the YouTube API key from the settings.
youtube = YouTubeCrawler(api_key=args.YOUTUBE_API_KEY)

In [22]:
# Accumulates the comments of every configured video.
data = []

_logger.info("Retrieving comments from Youtube.")

for video in videos:
    # NOTE(review): `_get_video_id` is a private helper of the crawler; it
    # presumably extracts the video ID from the configured "video" value — confirm.
    tmp_comments = youtube.get_video_comments(
        youtube._get_video_id(video["video"]),
        max_results_per_call=args.YOUTUBE_MAX_COMMENTS,
        max_comments=args.YOUTUBE_MAX_COMMENTS_PER_VIDEO)
        
    # Tag each comment with the category configured for its video.
    for comment in tmp_comments:
        comment.publisher_category = video["category"]

    data.extend(tmp_comments)

# Keep only the first comment for each distinct text.
data = remove_duplicates(data)

_logger.info(f"{len(data)} unique comments collected. YouTube API calls: {youtube.call_counter}.")

2022-04-09 08:02:55,803 :: INFO :: 2795302194 :: <module> :: Retrieving comments from Youtube.


Collected 42297 comments for video_id: yB_NsC5ZcLo
Collected 1604 comments for video_id: gcZD-FEwQXo
Collected 1943 comments for video_id: 016FNgvHSTU
Collected 5154 comments for video_id: TRU4v-iCTDA
Collected 968 comments for video_id: e3L9403q7GU
Collected 2827 comments for video_id: YpM-NVvNjUE
Collected 895 comments for video_id: 7PUdcYpt_ZY
Collected 4691 comments for video_id: MO3bKKEm1b0
Collected 9534 comments for video_id: 0nGSGh-PVZs
Collected 32271 comments for video_id: AS5PT-vCFts
Collected 9787 comments for video_id: vKnubjHodrk
Collected 3008 comments for video_id: 5taqYmap2kk
Collected 10653 comments for video_id: 3tfzd-D-6w0
Collected 50036 comments for video_id: 2Lp7XO6oWCM
Collected 39794 comments for video_id: -cO2D_ZHzPY
Collected 6579 comments for video_id: SDCtgJjnxGQ
Collected 5724 comments for video_id: RyV-R1l3erw
Collected 22643 comments for video_id: zgGHIirlBWU
Collected 1491 comments for video_id: FFtCvOy2jZE
Collected 1915 comments for video_id: A_zDek8P

2022-04-09 08:46:01,733 :: INFO :: 2795302194 :: <module> :: 315993 unique comments collected. YouTube API calls: 539.


## Remove unaccepted comments

- empty comments
- comments with more than 1000 characters

In [34]:
# Keep only the comments the checker accepts: not empty and of acceptable length.
cc = CommentChecker()

data = [
    comment
    for comment in data
    if not cc.is_empty(comment.text) and cc.has_acceptable_length(comment.text)
]

_logger.info(f"{len(data)} comments after removing empty.")

2022-04-09 09:05:58,451 :: INFO :: 3597144365 :: <module> :: 315991 comments after removing empty.


## Perspective API

In [14]:
# Fail fast when no Perspective API key is configured. This is logged as an
# error, like the missing-AWS-credentials case in the upload cell.
if not isinstance(args.PERSPECTIVE_API_KEY, str):
    _logger.error("Perspective API key not set.")
    raise AttributeError("Perspective API key not set.")

_logger.info("Starting toxicity prediction.")

perspective = PerspectiveAPI(apikey=args.PERSPECTIVE_API_KEY)

with tqdm(total=len(data)) as pbar:
    pbar.set_description("Predicting toxicity")
    for item in data:
        # Items that already have a score are skipped, so re-running this
        # cell only queries the API for the remaining ones.
        if item.toxicity_score is None:
            try:
                response = perspective.predict(text=item.text)
                score = response.get("TOXICITY")
                # Only a float score is stored; any other response leaves
                # the item unscored.
                if isinstance(score, float):
                    item.toxicity_score = score
                    item.is_toxic = score > args.PERSPECTIVE_THRESHOLD
            except Exception:
                _logger.error(f"Error predicting toxicity for {item.dict()}.")
                # Bare raise keeps the original traceback intact.
                raise

        # Advance once per item, scored now or previously, so the bar always
        # reaches 100% even when every item was already scored.
        pbar.update(1)

# Guard against an empty dataset to avoid a ZeroDivisionError.
toxic_rate = sum(1 for i in data if i.is_toxic) / len(data) if data else 0.0

_logger.info(f"Toxic comments rate: {toxic_rate:.4f}")

if args.FILTER_TOXIC_COMMENTS:
    data = [item for item in data if item.is_toxic]

_logger.info(f"Total comments: {len(data)}")

2022-04-10 00:36:13,671 :: INFO :: 745123290 :: <module> :: Starting toxicity prediction.
Predicting toxicity:   0%|          | 0/315991 [00:00<?, ?it/s]

In the cell below, we will extract some metrics from the toxicity of the comments.

## Filter comments by toxicity

In [10]:
# Keep only the comments that have a (truthy) toxicity score and are flagged as toxic.
data = [
    comment
    for comment in data
    if comment.toxicity_score and comment.is_toxic
]

_logger.info(f"Total toxic comments: {len(data)}")

2022-04-10 08:53:21,916 :: INFO :: 512101324 :: <module> :: Total toxic comments: 144958


## Anonymization

In [11]:
_logger.info("Starting anonymization.")

anonymizer = Anonymizer()

# Replace each comment's text with its anonymized version, in place,
# advancing the progress bar once per comment.
with tqdm(total=len(data), desc="Anonymization") as pbar:
    for item in data:
        item.text = anonymizer.apply_all(item.text)
        pbar.update(1)

_logger.info("Anonymization finished.")

2022-04-10 08:53:44,695 :: INFO :: 3130945675 :: <module> :: Starting anonymization.
Anonymization: 100%|██████████| 144958/144958 [19:51<00:00, 121.61it/s]
2022-04-10 09:13:41,336 :: INFO :: 3130945675 :: <module> :: Anonymization finished.


## Upload to S3

In [12]:
_logger.info("Uploading data to S3.")

# One object per day: the key is the current date with a .json extension.
key = datetime.datetime.now().strftime("%Y-%m-%d") + ".json"

# Convert our data to dicts in Label Studio format.
# The result gets its own name instead of rebinding `data`, so re-running
# this cell (e.g. after fixing credentials) does not format items twice.
label_studio_tasks = [label_studio_fmt(i) for i in data]

uploader = Uploader(bucket=args.AWS_S3_BUCKET,
                    bucket_prefix=args.AWS_S3_BUCKET_PREFIX)

# Prefer the role ARN when one is configured; otherwise fall back to an
# access key / secret key pair.
if isinstance(args.AWS_ROLE_ARN, str):
    uploader.upload_sts(role_arn=args.AWS_ROLE_ARN,
                        session_name="ToxicityDetectionCrawler",
                        key=key, data=label_studio_tasks)
elif isinstance(args.AWS_ACCESS_KEY_ID, str) and isinstance(args.AWS_SECRET_ACCESS_KEY, str):
    uploader.upload_aksk(access_key=args.AWS_ACCESS_KEY_ID,
                         secret_key=args.AWS_SECRET_ACCESS_KEY,
                         key=key, data=label_studio_tasks)
else:
    _logger.error("AWS credentials not set.")
    raise AttributeError("AWS credentials not set.")

_logger.info("Data uploaded to S3.")

2022-04-10 09:14:00,599 :: INFO :: 887248544 :: <module> :: Uploading data to S3.
2022-04-10 09:15:30,441 :: INFO :: 887248544 :: <module> :: Data uploaded to S3.
