# YouTube Crawler

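In this notebook, we collect comments from YouTube videos, score them with the Perspective API, anonymize the toxic ones, and upload them to S3 in Label Studio format.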

In [1]:
import sys
from pathlib import Path

# Make the directory two levels above the working directory importable so the
# `src.*` imports in the next cells resolve (presumably the repository root —
# the YAML config is also read from "../../properties").
# The membership test uses the same path that gets appended, so re-running
# this cell never adds a duplicate entry.
PROJECT_ROOT = str(Path(".").absolute().parent.parent)
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [2]:
from dotenv import load_dotenv

# Load variables from a .env file into the process environment (python-dotenv).
# NOTE(review): presumably this is where AppSettings picks up API keys and
# AWS credentials — confirm against src.settings.
load_dotenv()

True

In [3]:
import datetime
from tqdm import tqdm
from src.logs import setup_logger
from src.s3 import Uploader
from src.anonymize import Anonymizer
from src.data_models import RawText
from src.settings import AppSettings
from src.perspective import PerspectiveAPI
from src.socials.youtube import YouTubeCrawler
from src.utils import (
    read_yaml,
    read_json,
    save_json,
    label_studio_fmt,
    normalize_raw_text
)

# Notebook-wide logger; its level comes from the LOG_LEVEL application setting.
_logger = setup_logger(AppSettings().LOG_LEVEL)

In [4]:
def calc_toxic_rate(data):
    """Print how many scored comments are toxic vs. non-toxic and the toxic share.

    Only items that already carry a toxicity score are counted, so the function
    can be called on a partially scored dataset.

    Args:
        data: Iterable of objects exposing ``is_toxic`` and ``toxicity_score``.

    Returns:
        None. The three summary lines are written to stdout.
    """
    # Compare against None instead of relying on truthiness: a score of exactly
    # 0.0 is a valid prediction and must still be counted.
    scored = [item for item in data if item.toxicity_score is not None]

    toxic_qty = sum(1 for item in scored if item.is_toxic)
    # An unset (None) flag is neither toxic nor non-toxic.
    non_toxic_qty = sum(
        1 for item in scored if item.is_toxic is not None and not item.is_toxic
    )

    total_qty = toxic_qty + non_toxic_qty
    # Guard the division so an empty or still-unscored dataset does not raise.
    toxic_rate = toxic_qty / total_qty if total_qty else 0.0

    print(f"Toxic comments: {toxic_qty}")
    print(f"Non-toxic comments: {non_toxic_qty}")
    print(f"Toxic comments rate: {toxic_rate:.4f}")

In [5]:
# Application settings object; later cells read API keys, the toxicity
# threshold and the AWS options from it.
args = AppSettings()

# Override the per-video cap that the collection loop passes to the crawler as
# `max_comments`.
args.YOUTUBE_MAX_COMMENTS_PER_VIDEO = 50000

# Crawler configuration; its "youtube" entry lists the videos to process.
params = read_yaml("../../properties/application.yaml")

In [6]:
# Each entry is read by the collection loop through its "video" and "category" keys.
videos = params["youtube"]
_logger.info(f"{len(videos)} videos to process.")

2022-04-07 00:08:51,712 :: INFO :: 2285299911 :: <module> :: 22 videos to process.


## Collecting comments by video ID

In [7]:
# Crawler client built with the YouTube API key from the application settings.
youtube = YouTubeCrawler(api_key=args.YOUTUBE_API_KEY)

In [11]:
# Fetch the comments of every configured video and tag each comment with the
# category declared for that video in the configuration.
_logger.info("Retrieving comments from Youtube.")

data = []
for video in videos:
    video_id = youtube._get_video_id(video["video"])
    video_comments = youtube.get_video_comments(
        video_id,
        max_results=args.YOUTUBE_MAX_COMMENTS,
        max_comments=args.YOUTUBE_MAX_COMMENTS_PER_VIDEO,
    )

    for comment in video_comments:
        comment.publisher_category = video["category"]

    data += video_comments

_logger.info(f"{len(data)} comments collected.")

2022-04-03 11:44:46,218 :: INFO :: 84881482 :: <module> :: Retrieving comments from Youtube.
2022-04-03 11:44:46,220 :: INFO :: 84881482 :: <module> :: 548365 comments collected.


## Perspective API

In [8]:
# Resume from the local checkpoint (the same file the `save_json` cell further
# down writes) and rebuild RawText objects from the stored records.
data = [
    RawText(**record)
    for record in read_json("temp_2022-04-03-youtube.json")
]

len(data)

548365

In [10]:
# Score every comment that has no toxicity score yet with the Perspective API.
# Items are updated in place and already-scored items are skipped, so if
# `predict` raises (e.g. a network error) the scores gathered so far remain in
# `data`; checkpoint them and re-run this cell to resume where it stopped.
if not isinstance(args.PERSPECTIVE_API_KEY, str):
    # Logged at error level, like the missing-AWS-credentials case in the
    # upload step.
    _logger.error("Perspective API key not set.")
    raise AttributeError("Perspective API key not set.")

_logger.info("Starting toxicity prediction.")

perspective = PerspectiveAPI(apikey=args.PERSPECTIVE_API_KEY)

# Iterating through tqdm advances the bar once per item, so it always reaches
# 100% even when the trailing items were already scored in a previous run.
for item in tqdm(data, desc="Toxicity prediction"):
    if item.toxicity_score is not None:
        continue

    response = perspective.predict(text=item.text)
    score = response.get("TOXICITY")
    # Anything other than a float is treated as "no prediction": the item stays
    # unscored and is retried on the next run of this cell.
    if isinstance(score, float):
        item.toxicity_score = score
        item.is_toxic = score > args.PERSPECTIVE_THRESHOLD

# Share of toxic comments over the whole dataset (unscored items count as
# non-toxic here); guarded so an empty dataset does not raise ZeroDivisionError.
toxic_qty = sum(1 for item in data if item.is_toxic)
toxic_rate = (toxic_qty / len(data)) if data else 0.0

_logger.info(f"Toxic comments rate: {toxic_rate:.4f}")

if args.FILTER_TOXIC_COMMENTS:
    data = [item for item in data if item.is_toxic]

_logger.info(f"Total comments: {len(data)}")

2022-04-07 00:11:55,377 :: INFO :: 2540684772 :: <module> :: Starting toxicity prediction.
Toxicity prediction:  58%|█████▊    | 320531/548365 [8:30:34<6:02:55, 10.46it/s]  


ServerNotFoundError: Unable to find the server at commentanalyzer.googleapis.com

In [11]:
# Print the toxic / non-toxic counts and the toxic share among the comments
# that have a toxicity score so far.
calc_toxic_rate(data)

Toxic comments: 169736
Non-toxic comments: 150795
Toxic comments rate: 0.5295


In [12]:
# Checkpoint the comments with their scores. The "Perspective API" section
# reloads this same file, and the scoring loop skips items that already have a
# score, so an interrupted run can resume without re-requesting them.
save_json(normalize_raw_text(data), path="temp_2022-04-03-youtube.json")

## Anonymization

In [17]:
# Keep only the comments that have a (non-zero) toxicity score and were
# flagged as toxic; everything else is dropped from `data`.
data = [
    comment
    for comment in data
    if comment.toxicity_score and comment.is_toxic
]
len(data)

169736

In [18]:
# Replace each comment's text, in place, with the result of the anonymizer.
_logger.info("Starting anonymization.")

anonymizer = Anonymizer()

# Let tqdm drive the loop instead of updating a manually created bar.
for item in tqdm(data, desc="Anonymization"):
    item.text = anonymizer.apply_all(item.text)

_logger.info("Anonymization finished.")

2022-04-07 08:44:48,339 :: INFO :: 1352788321 :: <module> :: Starting anonymization.
2022-04-07 09:10:04,930 :: INFO :: 1352788321 :: <module> :: Anonymization finished.


## Upload to S3

In [19]:
_logger.info("Uploading data to S3.")

# The object key is today's date, e.g. "2022-04-07.json".
key = datetime.datetime.now().strftime("%Y-%m-%d") + ".json"

# Convert our data to dicts in Label Studio format. The result gets its own
# name instead of overwriting `data`, so re-running this cell (for example
# after missing credentials or a failed upload) does not convert twice.
label_studio_tasks = [label_studio_fmt(item) for item in data]

uploader = Uploader(bucket=args.AWS_S3_BUCKET,
                    bucket_prefix=args.AWS_S3_BUCKET_PREFIX)

# Prefer the role ARN when it is configured; otherwise fall back to an
# access key / secret key pair.
if isinstance(args.AWS_ROLE_ARN, str):
    uploader.upload_sts(role_arn=args.AWS_ROLE_ARN,
                        session_name="ToxicityDetectionCrawler",
                        key=key, data=label_studio_tasks)
elif isinstance(args.AWS_ACCESS_KEY_ID, str) and isinstance(args.AWS_SECRET_ACCESS_KEY, str):
    uploader.upload_aksk(access_key=args.AWS_ACCESS_KEY_ID,
                         secret_key=args.AWS_SECRET_ACCESS_KEY,
                         key=key, data=label_studio_tasks)
else:
    _logger.error("AWS credentials not set.")
    raise AttributeError("AWS credentials not set.")

_logger.info("Data uploaded to S3.")

2022-04-07 09:10:50,044 :: INFO :: 887248544 :: <module> :: Uploading data to S3.
2022-04-07 09:11:14,504 :: INFO :: 887248544 :: <module> :: Data uploaded to S3.
