# My plans
1. Text -> textProcessing -> modelPrediction -> good/bad score

2. YouTube link -> commentExtract -> textProcessing -> modelPrediction -> good/bad score
3. Twitter link -> commentExtract -> textProcessing -> modelPrediction -> good/bad score


# Data Ingestion

## youtube

In [None]:
import os
import certifi

# Export the path of certifi's CA bundle as SSL_CERT_FILE for this process.
# NOTE(review): presumably a workaround for certificate-verification errors
# on the HTTPS requests made later in the notebook — confirm it is still needed.
os.environ['SSL_CERT_FILE'] = certifi.where()

In [None]:
!pip install youtube-comment-downloader



In [None]:
from urllib.parse import urlparse, parse_qs

def extract_video_id(url):
    """Return the YouTube video ID embedded in ``url``, or ``None``.

    Recognised forms:

    * short links      -- ``https://youtu.be/<id>?si=...``
    * watch pages      -- ``https://www.youtube.com/watch?v=<id>&t=10s``
    * path-style links -- ``https://www.youtube.com/shorts/<id>`` (also
      ``/embed/<id>``, ``/live/<id>`` and ``/v/<id>``)

    Args:
        url: Video URL as a string.

    Returns:
        The video ID string, or ``None`` when ``url`` is not a recognised
        YouTube link or carries no ID.
    """
    parsed_url = urlparse(url)
    host = parsed_url.netloc.lower()

    if 'youtu.be' in host:
        # Short link: the ID is the path; an empty path means "no ID".
        return parsed_url.path.strip('/') or None

    if 'youtube.com' in host or 'youtube-nocookie.com' in host:
        # Watch page: the ID is the ``v`` query parameter.
        video_ids = parse_qs(parsed_url.query).get('v')
        if video_ids:
            return video_ids[0]

        # Path-style link: /<kind>/<id>[/...]
        path_parts = [part for part in parsed_url.path.split('/') if part]
        if len(path_parts) >= 2 and path_parts[0] in ('embed', 'shorts', 'live', 'v'):
            return path_parts[1]

    return None


In [None]:
# Quick check: a short link with a tracking parameter should yield the bare ID.
v = extract_video_id('https://youtu.be/OLPwT05kYjw?si=vlkgvHWyquJVddHS')
v

'OLPwT05kYjw'

In [None]:
from youtube_comment_downloader import YoutubeCommentDownloader

def get_youtube_comments(video_url, max_comments=100):
    """Download the text of up to ``max_comments`` comments for a YouTube video.

    Args:
        video_url: A ``youtu.be`` short link, a ``youtube.com/watch?v=...``
            URL, or a bare video ID.
        max_comments: Upper bound on the number of comments returned. Zero or
            a negative value yields an empty list.

    Returns:
        A list of comment text strings, in the order the downloader yields
        them.
    """
    # Function-scope imports keep this cell runnable on its own.
    from itertools import islice
    from urllib.parse import parse_qs, urlparse

    downloader = YoutubeCommentDownloader()
    if 'youtu.be' in video_url:
        video_id = extract_video_id(video_url)
    else:
        # Read the ``v`` query parameter rather than splitting on the raw
        # substring 'v=', which also matches parameters such as ``&rv=`` and
        # keeps trailing ``&...`` junk attached to the ID.
        query_ids = parse_qs(urlparse(video_url).query).get('v')
        # No ``v`` parameter: keep the old split-based fallback so that a bare
        # video ID passed as ``video_url`` still works.
        video_id = query_ids[0] if query_ids else video_url.split('v=')[-1]

    comment_stream = downloader.get_comments_from_url(
        f"https://www.youtube.com/watch?v={video_id}"
    )
    # islice stops after exactly ``max_comments`` items. The previous
    # append-then-check loop returned one comment even for max_comments=0.
    return [
        comment['text']
        for comment in islice(comment_stream, max(max_comments, 0))
    ]

In [None]:
# Fetch comments (default cap: 100) for a sample video.
# NOTE: `yt_comments` is reassigned to a hard-coded sample list in the
# "Data Preprocessing" section below, so these downloaded comments are not
# the ones that end up being scored.
yt_comments = get_youtube_comments("https://www.youtube.com/watch?v=INxnoCQxfsI")

## twitter

In [None]:
# import requests

# headers = {
#     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
# }

# response = requests.get("https://twitter.com", headers=headers)
# print(response.status_code)


In [None]:
# !pip install snscrape

In [None]:
# import snscrape.modules.twitter as sntwitter
# def get_twitter_replies(video_url, max_comments=100):
#   tweet_id = video_url.split('/')[-1]
#   query = f'conversation_id:{tweet_id}'
#   replies=[]
#   for i,tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
#     if i >=max_comments:
#       break
#     if tweet.inReplyToTweetId==int(tweet_id):
#       replies.append(tweet.content)
#   return replies

In [None]:
# get_twitter_replies('https://x.com/IRIran_Military/status/1933590328017121748')

# Data Preprocessing

In [None]:
!pip install text_prettifier



In [None]:
# yt_comments
from text_prettifier import TextPrettifier

prettifier = TextPrettifier()

def text_cleaner(text):
    """Lower-case ``text`` and pass it through the TextPrettifier clean-up steps.

    The steps run in this order: contractions, emojis, HTML tags, URLs,
    special characters, stopwords, numbers. The result of the last step is
    returned.
    """
    cleaned = text.lower()
    cleaning_steps = (
        prettifier.remove_contractions,
        prettifier.remove_emojis,
        prettifier.remove_html_tags,
        prettifier.remove_urls,
        prettifier.remove_special_chars,
        prettifier.remove_stopwords,
        prettifier.remove_numbers,
    )
    # Each step consumes the output of the previous one.
    for apply_step in cleaning_steps:
        cleaned = apply_step(cleaned)
    return cleaned

In [None]:
# Hand-written sample comments used for the rest of the walkthrough. The
# trailing tag on each entry is the label the author expects. Both
# `yt_comments` and `comments` are bound to this same list.
yt_comments = comments = [
    "You're a complete waste of oxygen. Just disappear already.",  # toxic
    "I really enjoyed this video, thanks for sharing!",  # non-toxic
    "How dumb can you be and still breathe? Unbelievable.",  # toxic
    "Great explanation, made everything so much clearer.",  # non-toxic
    "Nobody asked for your opinion, loser.",  # toxic
    "This was super helpful, keep up the good work!",  # non-toxic
    "Get off the internet, you pathetic excuse for a human.",  # toxic
    "I didn’t know stupidity was contagious until I watched this.",  # toxic
    "Loved the positive energy here, really uplifting!",  # non-toxic
    "Typical garbage from your kind. Stay in your lane.",  # toxic
]


In [None]:
# Clean every comment, preserving the order of `yt_comments`.
cleaned_text = list(map(text_cleaner, yt_comments))

In [None]:
!pip install langdetect



In [None]:
from langdetect import detect

def is_english(text):
    """Return True when langdetect classifies ``text`` as English.

    Args:
        text: The string to classify.

    Returns:
        ``True`` if the detected language code is ``'en'``; ``False`` for
        empty, whitespace-only or non-string input and whenever detection
        fails.
    """
    # Nothing to detect: answer directly instead of relying on the detector
    # raising for empty input.
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return detect(text) == 'en'
    except Exception:
        # Detection failures count as "not English". Catching Exception
        # (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        return False

# Keep only the cleaned comments whose *original* text is English.
# Detection runs on the raw comment because the cleaned strings have had
# stopwords and punctuation removed and are very short; the recorded run
# dropped one of the ten English sample comments at this step when the
# cleaned text was used for detection.
english_comments = [
    cleaned
    for original, cleaned in zip(yt_comments, cleaned_text)
    if is_english(original)
]


In [None]:
# Discard comments that are empty or whitespace-only after cleaning.
comments = [kept for kept in english_comments if kept.strip()]
comments

['complete waste oxygen disappear already',
 'really enjoyed video thanks sharing',
 'dumb still breathe unbelievable',
 'great explanation made everything much clearer',
 'nobody asked opinion loser',
 'get internet pathetic excuse human',
 'know stupidity contagious watched',
 'loved positive energy really uplifting',
 'typical garbage kind stay lane']

# Model Prediction




In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TextClassificationPipeline

# Hugging Face Hub id of the pretrained toxicity classifier used for scoring.
model_path = "martin-ha/toxic-comment-model"
# Load the tokenizer and the sequence-classification model from that id.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Bundle model + tokenizer into a callable text-classification pipeline.
pipeline =  TextClassificationPipeline(model=model, tokenizer=tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Device set to use cpu


In [None]:
# Score each comment separately; every call returns a one-element list.
# truncation=True clips inputs longer than the model's maximum sequence
# length instead of letting the forward pass fail on very long comments.
result = [pipeline(c, truncation=True) for c in comments]

In [None]:
# Unwrap the single prediction dict from each per-comment result list.
flat_preds = [per_comment[0] for per_comment in result]

In [None]:
flat_preds

[{'label': 'non-toxic', 'score': 0.9864069819450378},
 {'label': 'non-toxic', 'score': 0.9990127086639404},
 {'label': 'toxic', 'score': 0.9481373429298401},
 {'label': 'non-toxic', 'score': 0.9987552165985107},
 {'label': 'toxic', 'score': 0.878703236579895},
 {'label': 'toxic', 'score': 0.950354814529419},
 {'label': 'toxic', 'score': 0.9561506509780884},
 {'label': 'non-toxic', 'score': 0.9982788562774658},
 {'label': 'toxic', 'score': 0.9434476494789124}]

In [None]:
# Split the prediction dicts into parallel label / score lists.
labels = [pred['label'] for pred in flat_preds]
scores = [pred['score'] for pred in flat_preds]

In [None]:
from collections import Counter
import numpy as np

# Report the majority label, its share of all scored comments, and the mean
# of the per-comment scores.
if labels:
    # Count once and unpack (label, count) of the most frequent label.
    most_common_label, most_common_label_count = Counter(labels).most_common(1)[0]
    percentage = (most_common_label_count/len(labels))*100
    print(f"Your comments are {percentage:.1f}% {most_common_label} with an average model confidence of {np.mean(scores):.2f}.")
else:
    # No comment survived preprocessing: avoid IndexError / ZeroDivisionError
    # and still define the names later cells may read.
    most_common_label, most_common_label_count, percentage = None, 0, 0.0
    print("No comments left to score after preprocessing.")
# percentage

Your comments are 55.6% toxic with an average model confidence of 0.96.


In [None]:
# Save the model and the tokenizer to local directories (two separate ones).
# NOTE(review): the '=' in both directory names looks like a typo for '-';
# confirm before renaming, since anything that reloads these artifacts must
# use the same paths.
model.save_pretrained('distilbert=toxic-model-v1')
tokenizer.save_pretrained('distilbert=toxic-tokenizer-v1')

In [None]:
# Spot-check the pipeline on a single already-cleaned comment.
pipeline('nobody asked opinion loser')

[{'label': 'toxic', 'score': 0.878703236579895}]