In [1]:
!pip install textstat
!pip install yt-dlp

Collecting textstat
  Downloading textstat-0.7.4-py3-none-any.whl.metadata (14 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.16.0-py3-none-any.whl.metadata (3.2 kB)
Downloading textstat-0.7.4-py3-none-any.whl (105 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-0.16.0-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.16.0 textstat-0.7.4
Collecting yt-dlp
  Downloading yt_dlp-2024.8.6-py3-none-any.whl.metadata (170 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m170.1/170.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting brotli (from yt-dlp)
  Downloading Brotli-1.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.wh

In [2]:
import requests
import json
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
import pandas as pd
from googleapiclient.discovery import build
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from transformers import RobertaTokenizerFast, TFRobertaForSequenceClassification, pipeline
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import torch
import textstat
import subprocess
import warnings
import os
import requests

In [3]:
warnings.filterwarnings("ignore")

In [4]:
# Extract video ID from YouTube URL
def extract_video_id(url):
    return url.split('=')[1]

# Fetch subtitles from YouTube video
def get_subtitles(video_id):
    url = f"https://www.youtube.com/watch?v={video_id}"
    command = 'yt-dlp -x --audio-format mp3 --output "/content/custom_filename.mp3" ' + url
    res = subprocess.run(command, shell=True, capture_output=True, text=True)

    pipevoive = pipeline("automatic-speech-recognition", model="openai/whisper-small")
    result = pipevoive('/content/custom_filename.mp3')

    file_path = '/content/custom_filename.mp3'
    if os.path.exists(file_path):
        os.remove(file_path)
    return result['text']

# Get YouTube comments
def get_replies(youtube, parent_id, video_id):
    replies = []
    next_page_token = None

    while True:
        reply_request = youtube.comments().list(
            part="snippet",
            parentId=parent_id,
            textFormat="plainText",
            maxResults=100,
            pageToken=next_page_token
        )
        reply_response = reply_request.execute()

        for item in reply_response['items']:
            comment = item['snippet']
            replies.append({
                'Timestamp': comment['publishedAt'],
                'Username': comment['authorDisplayName'],
                'VideoID': video_id,
                'Comment': comment['textDisplay'],
                'Date': comment['updatedAt'] if 'updatedAt' in comment else comment['publishedAt']
            })

        next_page_token = reply_response.get('nextPageToken')
        if not next_page_token:
            break

    return replies

def get_comments_for_video(youtube, video_id):
    all_comments = []
    next_page_token = None

    while True:
        comment_request = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            pageToken=next_page_token,
            textFormat="plainText",
            maxResults=100
        )
        comment_response = comment_request.execute()

        for item in comment_response['items']:
            top_comment = item['snippet']['topLevelComment']['snippet']
            all_comments.append({
                'Timestamp': top_comment['publishedAt'],
                'Username': top_comment['authorDisplayName'],
                'VideoID': video_id,
                'Comment': top_comment['textDisplay'],
                'Date': top_comment['updatedAt'] if 'updatedAt' in top_comment else top_comment['publishedAt']
            })

            if item['snippet']['totalReplyCount'] > 0:
                all_comments.extend(get_replies(youtube, item['snippet']['topLevelComment']['id'], video_id))

        next_page_token = comment_response.get('nextPageToken')
        if not next_page_token:
            break

    return all_comments

def summarize_text(text):
    model_name = "google/pegasus-xsum"
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name)

    # Split text into chunks for summarization
    max_chunk = 1024
    chunks = [text[i:i + max_chunk] for i in range(0, len(text), max_chunk)]
    batch = tokenizer.prepare_seq2seq_batch(chunks, truncation=True, padding='longest', max_length=60, return_tensors="pt")
    translated = model.generate(**batch)
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)

    summary = " ".join(tgt_text).replace(' .', '.').replace('( ', '(').replace(' )', ')')
    return summary

def analyze_sentiment(comments):
    pipe = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment-latest")

    labels = []
    for comment in comments:
        try:
            result = pipe(comment)
            labels.append(result[0]['label'])
        except:
            continue
    return {label: labels.count(label) for label in set(labels)}

def evaluate_text_difficulty(text):
    fre = textstat.flesch_reading_ease(text)
    fkgl = textstat.flesch_kincaid_grade(text)
    def classify_text(fre):
        if fre > 60:
            return "Easy"
        elif fre > 30:
            return "Medium"
        else:
            return "Hard"
    text_difficulty = classify_text(fre)
    return fre, fkgl, text_difficulty

def get_general_data(video_id):
    base_url = f"https://returnyoutubedislikeapi.com"
    endpoint = f"/votes?videoId={video_id}"
    urla = base_url + endpoint
    response = requests.get(urla)
    data = response.json()
    return data

In [5]:
url = input("Enter the YouTube URL: ")
video_id = extract_video_id(url)

api_key = 'AIzaSyC8yrYCNV4zgp4R0IGiSIqv-JrLPRu-JFc'
youtube = build('youtube', 'v3', developerKey=api_key)

Enter the YouTube URL: https://www.youtube.com/watch?v=69Tzh_0lHJ8


In [6]:
subtitles = get_subtitles(video_id)

config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [7]:
video_comments = get_comments_for_video(youtube, video_id)
comments_df = pd.DataFrame(video_comments)
comments = comments_df['Comment'].tolist()

In [8]:
summary = summarize_text(subtitles)

tokenizer_config.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

In [9]:
sentiment_analysis = analyze_sentiment(comments)

config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [10]:
fre, fkgl, text_difficulty = evaluate_text_difficulty(subtitles)

In [11]:
data = get_general_data(video_id)

In [12]:
print(f"Summary:\n{summary}")
print(f"Sentiment Analysis:\n{sentiment_analysis}")
print(f"Flesch Reading Ease: {fre}")
print(f"Flesch-Kincaid Grade Level: {fkgl}")
print(f"Text Difficulty: {text_difficulty}")
print(f"Likes: {data['likes']}")
print(f"Estimated Dislikes: {data['dislikes']}")
print(f"Total Views: {data['viewCount']}")
print(f"rating: {data['rating']}")

Summary:
TimescaleDB is an open source database built on top of workloads that can query time-based workloads exponentially faster. Timescale's database-as-a-service (DB-as-a-service) gives you the power of a private cloud with the flexibility of a public one. Let's say we want to show how much money is paid for a taxi each day.
Sentiment Analysis:
{'negative': 120, 'neutral': 314, 'positive': 110}
Flesch Reading Ease: 67.38
Flesch-Kincaid Grade Level: 9.0
Text Difficulty: Easy
Likes: 14827
Estimated Dislikes: 450
Total Views: 288927
rating: 4.8821758198599206


In [None]:
print(subtitles)

 The timescale DB, an open source time series database built on top of Postgres that can query time-based workloads exponentially faster. Imagine building an application that needs to collect terabytes of timestamped data points every day, like a website analytics platform or an IoT smart toilet. A general-purpose SQL database like MySQL or Postgres is not ideal. It doesn't ingest data fast enough and isn't optimized to scale queries of time series data. A timescale DB fixes this by storing data in hyper tables, which are abstract tables composed of many smaller postgres tables called chunks. This can increase ingest speeds up to 40% and make queries a staggering 350 times faster. In addition, time buckets can automatically aggregate data into set intervals, while hyper functions allow you to analyze this data like a mad scientist. They can compute time-weighted averages for financial data and build candlestick charts, or aggregate the daily water consumption of your IoT smart toilet. 