<a href="https://colab.research.google.com/github/AnanyaShetty21/YTgist/blob/main/YTgist.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import getpass
# ID of the YouTube video whose transcript and description are processed below.
video_id = "6mgkoqcm6Sg"
# Prompt for the API key at run time instead of hardcoding it; it is later
# passed as `developerKey` when building the YouTube Data API client.
api_key = getpass.getpass("Enter the API key: ")

# Installations and imports

In [None]:
!pip install youtube_transcript_api
!pip install pytextrank
!pip install plotly nltk
!pip install wordcloud matplotlib
!pip install sklearn

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import plotly.express as px
import nltk
from youtube_transcript_api import YouTubeTranscriptApi
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# Selecting the compute device (GPU if available)

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

# Getting the transcript through youtube-transcript-api

In [None]:
# Download the transcript as a list of dicts; later cells read the
# 'text' and 'start' keys of each entry.
ytt_api = YouTubeTranscriptApi()
transcript = ytt_api.fetch(video_id).to_raw_data()

# Flatten all caption snippets into one string, replacing embedded newlines
# with spaces and dropping backslashes.
text = (
    " ".join(entry["text"] for entry in transcript)
    .replace("\n", " ")
    .replace("\\", "")
)

# Summarizing using pre-trained model

In [None]:
# Tokenizer and seq2seq model are loaded from the same pretrained checkpoint;
# the model is moved to the device selected earlier.
checkpoint = "patrickvonplaten/led-large-16384-pubmed"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
model = model.to(device)

In [None]:
# Encoder input limit of the checkpoint (the 16384 in its name).
MAX_INPUT_TOKENS = 16384

# Truncate over-long transcripts to the encoder limit instead of feeding the
# model more tokens than it can accept. A single sequence needs no padding, so
# the previous padding="max_length" (which padded every input up to the
# tokenizer's maximum length) is dropped to avoid wasted computation.
tokens = tokenizer(
    text,
    truncation=True,
    max_length=MAX_INPUT_TOKENS,
    return_tensors="pt",
)
# Move every tensor (input_ids, attention_mask, ...) to the model's device.
tokens = {key: value.to(device) for key, value in tokens.items()}

summary = model.generate(**tokens)

# Skip special tokens so markers such as <s>, </s> and <pad> do not appear in
# the printed summary.
summary_decoded = tokenizer.decode(summary[0], skip_special_tokens=True)
print(summary_decoded)

# Extracting key phrases using TextRank

In [None]:
import spacy
import pytextrank

# Build a spaCy pipeline and register the "textrank" component on it.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textrank")

doc = nlp(text)

# Keep the text of the first ten entries of `doc._.phrases`.
top_phrases = []
for ranked_phrase in doc._.phrases[:10]:
    top_phrases.append(ranked_phrase.text)

for phrase in top_phrases:
    print(phrase)


# Creating a timeline of key moments in the video

In [None]:
# Getting key moments from the video description
from googleapiclient.discovery import build
from IPython.display import JSON

def get_video_description(video_id):
    """Fetch the description text of a YouTube video.

    Builds a YouTube Data API v3 client with the module-level ``api_key`` and
    requests the ``snippet`` part of the given video.

    Args:
        video_id: ID of the video to look up.

    Returns:
        The ``description`` field of the first returned item's snippet, or
        ``None`` when the response contains no items.
    """
    client = build("youtube", "v3", developerKey=api_key)
    response = client.videos().list(part="snippet", id=video_id).execute()

    # Nothing to return when the API reports no matching video.
    if "items" not in response or len(response["items"]) == 0:
        return None

    first_item = response["items"][0]
    return first_item["snippet"]["description"]

# Description of the configured video; this is None when the API response
# contains no items.
description = get_video_description(video_id)

def _timestamp_to_seconds(timestamp):
    """Convert an "MM:SS" or "HH:MM:SS" string to a total number of seconds."""
    total = 0
    for part in timestamp.split(":"):
        total = total * 60 + int(part)
    return total


def get_lines_with_timestamps(description):
    """Extract (timestamp, label) pairs from a video description.

    Every line is scanned for timestamps of the form ``M:SS``, ``MM:SS`` or
    ``H:MM:SS``. For each timestamp found, the label is the line with that
    timestamp removed and surrounding whitespace stripped.

    Args:
        description: Description text, or ``None``/empty when unavailable.

    Returns:
        A list of unique ``(timestamp, label)`` tuples in chronological order.
        An empty list is returned when there is no description.
    """
    # A missing description (e.g. video not found) simply has no key moments.
    if not description:
        return []

    unique_pairs = set()
    for line in description.split("\n"):
        for timestamp in re.findall(r'\b\d{1,2}:\d{2}(?::\d{2})?\b', line):
            unique_pairs.add((timestamp, line.replace(timestamp, "").strip()))

    # Sort by elapsed time rather than by the raw string, which would place
    # "10:15" before "2:30". The pair itself breaks ties deterministically.
    return sorted(
        unique_pairs,
        key=lambda pair: (_timestamp_to_seconds(pair[0]), pair),
    )

# (timestamp string, remaining line text) pairs found in the video description.
key_moments_description = get_lines_with_timestamps(description)

In [None]:
# Getting key moments from the transcript
from sklearn.feature_extraction.text import TfidfVectorizer
# Treat every transcript entry as its own document for TF-IDF.
corpus = [entry['text'] for entry in transcript]

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(corpus)

# Score each entry by the sum of its TF-IDF weights and take the five
# highest-scoring entries, best first.
importance = X.sum(axis=1).A1
important_indices = importance.argsort()[::-1][:5]

key_moments = []
for idx in important_indices:
    entry = transcript[idx]
    key_moments.append((entry['start'], entry['text']))


In [None]:
import plotly.express as px
import pandas as pd


def _to_seconds(timestamp):
    """Convert an "SS", "MM:SS" or "HH:MM:SS" string into a number of seconds."""
    seconds = 0
    for part in str(timestamp).split(":"):
        seconds = seconds * 60 + int(part)
    return seconds


# Key moments picked from the transcript; "Timestamp" holds each entry's `start`.
df = pd.DataFrame(key_moments, columns=["Timestamp", "Key Moment"])

# Key moments listed in the video description, with clock-style timestamps.
df_description = pd.DataFrame(key_moments_description, columns=["Timestamp", "Key Moment"])

# Convert the clock-style strings to seconds in one vectorized step, so the
# column ends up numeric instead of having ints written row by row into a
# string column.
df_description["Timestamp"] = df_description["Timestamp"].astype(str).map(_to_seconds)

fig = px.scatter(df, x="Timestamp", y="Key Moment", text="Key Moment", title="Timeline of Key Moments in Video")

fig.add_scatter(x = df_description["Timestamp"], y = df_description["Key Moment"], mode="markers+text", marker=dict(color="red", size=10), name="Description Moments", text=df_description["Key Moment"], textposition="top center")

fig.update_traces(textposition="top center")
# Both traces are plotted in seconds: the description timestamps are converted
# above, and youtube-transcript-api reports `start` in seconds.
fig.update_layout(xaxis_title="Timestamp in seconds")

fig.show()


# Word cloud representation of the video

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

# Join the captions with a space so that the last word of one caption and the
# first word of the next are not fused into a single bogus token.
corpus = " ".join(line['text'] for line in transcript)

# Only one document is fitted, so the 50 retained terms are weighted by their
# normalized frequency within the whole transcript.
vectorizer = TfidfVectorizer(stop_words='english', max_features=50)
X = vectorizer.fit_transform([corpus])
important_words = dict(zip(vectorizer.get_feature_names_out(), X.toarray()[0]))

wordcloud = WordCloud(width=800, height=400).generate_from_frequencies(important_words)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
