In [3]:
import nltk
import spacy
from youtube_transcript_api import YouTubeTranscriptApi
import numpy as np 
import re
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
nlp= spacy.load("en_core_web_sm")

In [5]:
def get_transcript(video_id):
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)  #transcript stores in dictionaries inside list , where key is 'text'
        transcript_text = " ".join([item['text'] for item in transcript]) #to get values of all the dictionaries and combine them to form a string text.
        return transcript_text
    except Exception as e:
        print("Error fetching transcript:",e)
        return None


In [6]:
def clean_text(text):
   
    text = re.sub(r'\[.*?\]', '', text)  # Remove non- verbal sounds like [music]
    
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    
    text = text.strip()
    return text
    

In [7]:
def preprocess_text(text):
    text = clean_text(text)
    text = text.lower()
    doc=nlp(text)
    
    stemmer=PorterStemmer()
    stemmed_text = [ stemmer.stem(token.text) for token in doc ] #nltk

    lemmatized_text = [token.lemma_ for token in doc]  #spacy

    stop_words = set(stopwords.words('english'))
    filtered_text=[]
    
    for word in lemmatized_text:
        if word not in stop_words:
            filtered_text.append(word)

    text=" ".join(filtered_text)
    text = re.sub(r'[^\w\s]+', '', text)# Remove punctuation and music symbol or any symbol
    
    return text
    

In [27]:
def summarizer1(text, n=3):
    doc=nlp(text)
    
    processed_text = preprocess_text(text).split()
    
    pos_tags= pos_tag(processed_text) #stores as list of tuples 
    
    word_freq = {}
    for word,pos in pos_tags:
            if pos=="NN" or pos=="VBP":
                word_freq[word] = word_freq.get(word,0)+ 1 #default value is 0

    sentence_score={}
    for sentence in doc.sents:
        sentence_str = clean_text(sentence.text)
        for token in sentence:
            if token.text in word_freq:
                sentence_score[sentence_str] = sentence_score.get(sentence_str,0)+ word_freq[token.text]
                
    summarized_text= sorted(sentence_score, key=sentence_score.get, reverse=True )[:n]
    
    return " ".join(s.capitalize() for s in summarized_text)

In [25]:
def summarizer2(text, n=3):
    doc = nlp(text)
    sentences= []
    for sentence in doc.sents:
        sentences.append(sentence.text)
    
    tfidf_vectorizer = TfidfVectorizer(stop_words="english")
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
    
    sentence_score = np.array(tfidf_matrix.sum(axis=1)).flatten()
    
    top_sent_indices= sentence_score.argsort()[-n:][::-1]
    
    top_sents= [sentences[i] for i in top_sent_indices]
    
    return " ".join(s.capitalize() for s in top_sents)

In [29]:
video_url = input("Enter a youtube video url:")
n= int(input("Enter no. of lines of summary needed:"))

if "v=" in video_url:
    video_id = video_url.split("v=")[-1]
else:
    video_id = video_url.split("be/")[-1]     #returns a list
    
text = get_transcript(video_id)

print("\nSUMMARY OF THE TRANSCRIPT: OPTION 1")
summarized_text1= summarizer1(text, n)
print(summarized_text1)

print("\nSUMMARY OF THE TRANSCRIPT: OPTION 2")
summarized_text2= summarizer2(text, n)
print(summarized_text2)


Enter a youtube video url: https://www.youtube.com/watch?v=WHYMGNbPv2U
Enter no. of lines of summary needed: 5



SUMMARY OF THE TRANSCRIPT: OPTION 1
Maybe you want to read a book, maybe you want to watch tv or if we look over here you’ll see that this living room also has a tv, a television, where you could sit just to have a quiet relaxing evening where you are going to entertain yourself by reading a book, maybe sitting and talking with friends, or watching some television. The living room is the room in the home where you will usually find chairs and sofas or what we also call couches. So especially in the winter when you come into a home you might want to hang your coat in the closet if there’s not enough room on the coat hangers. It’s the room where, when you just want to relax, this is the room that you would generally sit in. Before we go any further though, i want to thank barb the canadian for letting me use her house today to make this video, to help you learn english.

SUMMARY OF THE TRANSCRIPT: OPTION 2
Maybe you want to read a book, maybe you want
to watch tv or if we look over here

*USING YOUTUBE DATA API* (not working)


In [1]:
import requests
import json

In [3]:
def fetch_video_captions(video_id, api_key):
    url = f'https://www.googleapis.com/youtube/v3/captions'
    parameters = {'part': 'snippet', 'videoId': video_id, 'key': api_key}
    response = requests.get(url, params=parameters)
    print(response)
    if response.status_code == 200:
        captions = response.json() #converts json to dictionary
        print(captions,"\n\n")
        if 'items' in captions:
            return captions['items']
        else:
            print(f"No captions found for video ID: {video_id}")
            return None
    else:
        print(f"Error: {response.status_code}")
        return None

In [4]:
def fetch_caption_data(caption_id,api_key):
    url = f'https://www.googleapis.com/youtube/v3/captions/{caption_id}'
    print(url)
    parameters = {'key': api_key}
    response= requests.get(url, params=parameters)
    print(response)
    
    if response.status_code == 200:
        caption_data = response.json() #converts json to dictionary
        print(caption_data,"\n")
        return caption_data.get('body', 'No caption text available.') #(key, default value when key is not found)
    else:
        print(f"Error fetching caption data: {response.status_code}")
        return None

In [7]:
#main
api_key = "AIzaSyB5IcpVqIP1x654reS-wXc8sjFamGVRYkI"
video_url = input("Enter a youtube video url:")
n= int(input("Enter no. of lines of summary needed:"))

if "v=" in video_url:
    video_id1 = video_url.split("v=")[-1]
else:
    video_id1 = video_url.split("be/")[-1]     #returns a list
    

captions = fetch_video_captions(video_id1, api_key)

if captions:
    for caption in captions:
        caption_id = caption['id']
        print(caption_id)
        caption_text = fetch_caption_data(caption_id,api_key)
        print(caption_text)
    """
    print("\nSUMMARY OF THE TRANSCRIPT: OPTION 1")
    summarized_text1= summarizer1(caption_text, n)
    print(summarized_text1)
    
    print("\nSUMMARY OF THE TRANSCRIPT: OPTION 2")
    summarized_text2= summarizer2(caption_text, n)
    print(summarized_text2)
    """

Enter a youtube video url: https://www.youtube.com/watch?v=P6FORpg0KVo
Enter no. of lines of summary needed: 2


<Response [200]>
{'kind': 'youtube#captionListResponse', 'etag': 'Di1QnkspdD7_J8S2YL_lTSZ3r6M', 'items': [{'kind': 'youtube#caption', 'etag': 'RC7-Z6ud2vsgqdICS5zO-HLlI20', 'id': 'AUieDabollJgQWFrqGaLbo_zSpRXy2dHA9Ctpg1NgIvuQyFp', 'snippet': {'videoId': 'P6FORpg0KVo', 'lastUpdated': '2023-12-14T15:33:29.752106Z', 'trackKind': 'standard', 'language': 'pt-BR', 'name': '', 'audioTrackType': 'unknown', 'isCC': False, 'isLarge': False, 'isEasyReader': False, 'isDraft': False, 'isAutoSynced': False, 'status': 'serving'}}, {'kind': 'youtube#caption', 'etag': 'Lzx14N9weBZkFpA00-4tbGnbQjc', 'id': 'AUieDaakGrNfJfPCdkhEl41fKWPbzCijOPGf6TFtmFyo', 'snippet': {'videoId': 'P6FORpg0KVo', 'lastUpdated': '2024-04-08T13:03:34.408052Z', 'trackKind': 'standard', 'language': 'id', 'name': '', 'audioTrackType': 'unknown', 'isCC': False, 'isLarge': False, 'isEasyReader': False, 'isDraft': False, 'isAutoSynced': False, 'status': 'serving'}}, {'kind': 'youtube#caption', 'etag': 'qrwG2AAqA3sI9JqoSUe-1oFmBOQ', 'i

In [None]:
"""from google_auth_oauthlib.flow import InstalledAppFlow

# Define the scopes for YouTube API access
SCOPES = ["https://www.googleapis.com/auth/youtube.force-ssl"]

# Load the client_secret.json file and start the OAuth process
flow = InstalledAppFlow.from_client_secrets_file("client_secret1.json", SCOPES)

# Use the run_local_server() method for authentication
credentials = flow.run_local_server(port=0)

# Print the access token to confirm authentication
print("Access Token:", credentials.token)"""


Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=317625184281-t802o0985a9vg41hfevtehn602v4lv7b.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A50617%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fyoutube.force-ssl&state=k1MpEhIFIwIQgRSFfjzlqMe6gzAWdm&access_type=offline
