In [1]:
!pip install youtube_transcript_api



In [2]:
import youtube_transcript_api
from youtube_transcript_api import YouTubeTranscriptApi
import nltk
import re
from nltk.corpus import stopwords
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
link = "https://www.youtube.com/watch?v=W6NZfCO5SIk" 
unique_id = link.split("=")[-1]
sub = YouTubeTranscriptApi.get_transcript(unique_id)  
subtitle = " ".join([x['text'] for x in sub])


Using TF-IDF

In [4]:
from nltk.tokenize import sent_tokenize

In [5]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ankit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:

sentences = sent_tokenize(subtitle)

In [12]:
subtitle

"In this 3-minute introduction, I'm going\nto answer four frequently asked questions about JavaScript. \nWhat is JavaScript, what can you do with it, where does JavaScript code run and\nwhat is the difference between JavaScript and ECMAScript. So let's\nstart with the first question. What is JavaScript? JavaScript is one of the most\npopular and widely used programming languages in the world right now. It's\ngrowing faster than any other programming languages and big companies\nlike Netflix, Walmart, and PayPal build entire applications around JavaScript.\nAnd here's the average salary of a JavaScript developer in the United\nStates. That is $72,000 a year. So it's a great opportunity to get a good\njob out of learning JavaScript. You can work as a front-end developer or a\nback-end developer or a full stack developer who knows both the front end\nand the back end. Now, the second question. What can you do with JavaScript? For a\nlong time, javascript was only used in browsers to\nbuil

In [13]:
organized_sent = {k:v for v,k in enumerate(sentences)}

In [14]:
tf_idf = TfidfVectorizer(min_df=2, 
                                    strip_accents='unicode',
                                    max_features=None,
                                    lowercase = True,
                                    token_pattern=r'w{1,}',
                                    ngram_range=(1, 3), 
                                    use_idf=1,
                                    smooth_idf=1,
                                    sublinear_tf=1,
                                    stop_words = 'english')

In [15]:
import numpy as np


In [16]:
from numpy import *

In [17]:
sentence_vectors = tf_idf.fit_transform(sentences)
sent_scores = np.array(sentence_vectors.sum(axis=1)).ravel()



In [18]:
N = 3
top_n_sentences = [sentences[index] for index in np.argsort(sent_scores, axis=0)[::-1][:N]]

In [19]:
# mapping the scored sentences with their indexes as in the subtitle
mapped_sentences = [(sentence,organized_sent[sentence]) for sentence in top_n_sentences]
# Ordering the top-n sentences in their original order
mapped_sentences = sorted(mapped_sentences, key = lambda x: x[1])
ordered_sentences = [element[0] for element in mapped_sentences]
# joining the ordered sentence
summary = " ".join(ordered_sentences)

In [20]:
summary

'What is JavaScript, what can you do with it, where does JavaScript code run and\nwhat is the difference between JavaScript and ECMAScript. JavaScript is one of the most\npopular and widely used programming languages in the world right now. You can work as a front-end developer or a\nback-end developer or a full stack developer who knows both the front end\nand the back end.'

Summarization using BART

In [16]:
!pip install transformers

Collecting transformers
  Using cached transformers-4.18.0-py3-none-any.whl (4.0 MB)
Collecting huggingface-hub<1.0,>=0.1.0
  Using cached huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Using cached tokenizers-0.11.6-cp39-cp39-win_amd64.whl (3.2 MB)
Collecting sacremoses
  Using cached sacremoses-0.0.49-py3-none-any.whl (895 kB)
Installing collected packages: tokenizers, sacremoses, huggingface-hub, transformers
Successfully installed huggingface-hub-0.5.1 sacremoses-0.0.49 tokenizers-0.11.6 transformers-4.18.0


In [15]:
import transformers
from transformers import BartTokenizer, BartForConditionalGeneration

In [16]:
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

In [29]:
input_tensor = tokenizer.encode( subtitle, return_tensors="pt", max_length=512,truncation=True)

In [30]:
outputs_tensor = model.generate(input_tensor, max_length=160, min_length=120, length_penalty=2.0, num_beams=4, early_stopping=True)
outputs_tensor

tensor([[    2,     0, 18434,    16,    65,     9,     5,   144,  3924,   341,
           586,   119,  1023,   784, 11993,  3443,   939,     5,   232,     4,
            85,    18,   341,     7,  1119,   455,    12, 42671,  3748,    50,
          1830,  3798,    25,   157,    25,   588,    86,  4400,  6014,  1023,
           518,     4,    20,   674,  5391,     9,    10, 18434,  6596,   939,
             5,   121,  4560,   532,    16,    68,  4956,     6,   151,    10,
            76,     4,   370,    64,   120,    10,   205,   633,    66,     9,
          2239, 18434,    19,    84,  4704,     7,     5,  2777,     8,   141,
            24,    18,   341,    11,     5,   588,   232,     4,   166,   581,
            67,  1067,    59,     5,  5550,   227, 18434,     8, 11270, 32804,
         26083,     4,   166,  1034,    42,  4704,    40,   244,    47,   120,
             5,   144,    66,     9,   110,  2655,     9,     5,  2777,     4,
             2]])

In [32]:
print(tokenizer.decode(outputs_tensor[0]))

</s><s> JavaScript is one of the most widely used programmig laguages i the world. It's used to build full-blow web or mobile apps as well as real time etworkig services. The average salary of a JavaScript developer i the Uited States is $72,000 a year. You can get a good job out of learning JavaScript with our guide to the language and how it's used in the real world. We'll also talk about the differences between JavaScript and ECMAScript. We hope this guide will help you get the most out of your knowledge of the language.</s>


In [20]:
from transformers import pipeline

In [None]:
summarizer = pipeline('summarization')

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 (https://huggingface.co/sshleifer/distilbart-cnn-12-6)


In [25]:
print(subtitle)

I this 3-miute itroductio, I'm goig
to aswer four frequetly asked questios about JavaScript. 
What is JavaScript, what ca you do with it, where does JavaScript code ru ad
what is the differece betwee JavaScript ad ECMAScript. So let's
start with the first questio. What is JavaScript? JavaScript is oe of the most
popular ad widely used programmig laguages i the world right ow. It's
growig faster tha ay other programmig laguages ad big compaies
like Netflix, Walmart, ad PayPal build etire applicatios aroud JavaScript.
Ad here's the average salary of a JavaScript developer i the Uited
States. That is $72,000 a year. So it's a great opportuity to get a good
job out of learig JavaScript. You ca work as a frot-ed developer or a
back-ed developer or a full stack developer who kows both the frot ed
ad the back ed. Now, the secod questio. What ca you do with JavaScript? For a
log time, javascript was oly used i browsers to
build iteractive web pages some developers refer to javascript as a toy


In [31]:
summary = summarizer(subtitle, max_length = 180, min_length = 30)

IndexError: index out of range in self