# Step 1: Creating a Corpus:

## We start from a YouTube playlist of SNL Weekend Update episodes and traverse it to extract the video ID of every video in the playlist.

In [1]:
pip install google-api-python-client



In [44]:
from googleapiclient.discovery import build
import re

# Function to extract the playlist ID from a YouTube playlist URL
def extract_playlist_id(url):
    """Return the playlist ID found after ``list=`` in ``url``.

    Args:
        url: a YouTube URL expected to contain a ``list=<playlist id>`` part.

    Returns:
        The run of letters, digits, ``_`` and ``-`` that follows the first
        ``list=`` occurrence.

    Raises:
        ValueError: if no ``list=<id>`` part is present in ``url``.
    """
    found = re.search(r'list=([a-zA-Z0-9_-]+)', url)
    if found is None:
        raise ValueError('Invalid YouTube playlist URL')
    return found.group(1)

# Function to get the list of video links from a playlist
def get_video_links_from_playlist(api_key, playlist_url):
    """Collect the watch URL of every video in a YouTube playlist.

    Args:
        api_key: developer key passed to the YouTube Data API v3 client.
        playlist_url: playlist URL; its ID is pulled out with
            ``extract_playlist_id``.

    Returns:
        A list of ``https://www.youtube.com/watch?v=<video id>`` strings, in
        the order the API responses list the playlist items.

    Raises:
        ValueError: propagated from ``extract_playlist_id`` when the URL
            carries no playlist ID.
    """
    playlist_id = extract_playlist_id(playlist_url)
    youtube = build('youtube', 'v3', developerKey=api_key)

    video_links = []
    page_token = None  # no token is sent with the first request
    while True:
        # Ask for the next page of playlist items (up to 50 per request)
        response = youtube.playlistItems().list(
            part='contentDetails',
            maxResults=50,
            playlistId=playlist_id,
            pageToken=page_token
        ).execute()

        # Turn every returned video ID into a full watch URL
        video_links.extend(
            f"https://www.youtube.com/watch?v={item['contentDetails']['videoId']}"
            for item in response['items']
        )

        # Stop as soon as the response carries no token for a further page
        page_token = response.get('nextPageToken')
        if not page_token:
            return video_links
# Links collected from the playlist. The list is re-created on every run of
# this cell, so re-executing it does not accumulate duplicates.
list_of_links = []

# Example usage
if __name__ == '__main__':
    import os
    from getpass import getpass

    # Never hard-code credentials in a notebook: read the key from the
    # YOUTUBE_API_KEY environment variable, or prompt for it (without echo)
    # when the variable is not set. Any key that was previously committed in
    # this notebook should be revoked.
    api_key = os.environ.get('YOUTUBE_API_KEY') or getpass('YouTube Data API key: ')
    playlist_url = 'https://www.youtube.com/playlist?list=PLS_gQd8UB-hJZaYQLoQJ4XowUpXXjFxaV'  # Replace with the YouTube playlist URL

    try:
        video_links = get_video_links_from_playlist(api_key, playlist_url)
        for link in video_links:
            print(link)
            list_of_links.append(link)
    except ValueError as e:
        # Raised when the playlist URL does not contain a playlist ID
        print(f"Error: {e}")


https://www.youtube.com/watch?v=IPnHneSH1sg
https://www.youtube.com/watch?v=2BdgF_Fr6QU
https://www.youtube.com/watch?v=y_V1K9uTrH4
https://www.youtube.com/watch?v=aaBLGa2UIzc
https://www.youtube.com/watch?v=Vpq7Ax_rPds
https://www.youtube.com/watch?v=SgT8FBuMx7A
https://www.youtube.com/watch?v=yfRkdbw-cOM
https://www.youtube.com/watch?v=XEXWPykXJ5w
https://www.youtube.com/watch?v=wbk-QGDaM7Q
https://www.youtube.com/watch?v=5RmEKh83Etg
https://www.youtube.com/watch?v=EiHMVX0iV0A
https://www.youtube.com/watch?v=kq3NsBBYSUA
https://www.youtube.com/watch?v=Clfeis1avy0
https://www.youtube.com/watch?v=5J4651_oo3Y
https://www.youtube.com/watch?v=VsSTA8X0E-c
https://www.youtube.com/watch?v=6euomDxdHsY
https://www.youtube.com/watch?v=e1uFNh8LXAg
https://www.youtube.com/watch?v=s8vecsTUDzU
https://www.youtube.com/watch?v=HWsNgDF07h4
https://www.youtube.com/watch?v=NCfrDa5qSSs
https://www.youtube.com/watch?v=HoU95Sdmag4
https://www.youtube.com/watch?v=IysivVkJ_ig
https://www.youtube.com/watch?v=

## The characters after 'v=' in a YouTube video link identify the video ID. We extract this ID as a substring. We then use the `youtube_transcript_api` package to get the video's subtitle transcript, which we save as a JSON file.

In [45]:
pip install youtube_transcript_api



In [46]:
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import JSONFormatter
import json

In [47]:
# Sanity check: number of video links collected from the playlist
len(list_of_links)

52

In [55]:
# Start from a clean slate: opening a file in 'w' mode creates it if needed
# and truncates it, so both output files are empty before the download loop.
open('videorawtranscript.json', 'w').close()
open('transcriptedtext.txt', 'w', encoding='utf-8').close()

## This JSON file contains the text along with the duration and timestamp of each line of dialogue. We remove the timestamps and save the text as a .txt file. This forms our Corpus.

In [60]:
# Fetch the transcript of every collected video and append its caption text,
# one caption per line, to the corpus file 'transcriptedtext.txt'.
formatter = JSONFormatter()  # built once and reused for every video

for link in list_of_links:
    # The video ID is everything after 'v=' in the watch URL
    video_id = link.split("v=", 1)[1]
    print("Vid ID: ", video_id)
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
    except Exception as e:
        # Best effort: report the failure and carry on with the other videos
        print(f"No transcript found for video ID {video_id}: {e}")
        continue  # Skip to the next video

    json_formatted = formatter.format_transcript(transcript)

    # Save the raw JSON. 'w' mode overwrites the file, so after the loop it
    # holds the transcript of the last video that was fetched successfully.
    with open('videorawtranscript.json', 'w', encoding='utf-8') as json_file:
        json_file.write(json_formatted)

    # Parse the JSON text we just produced rather than re-reading the file
    # (which was previously read back with the platform default encoding).
    data = json.loads(json_formatted)

    # Append only the 'text' field of each entry, dropping timestamps/durations
    with open('transcriptedtext.txt', 'a', encoding='utf-8') as file:
        for entry in data:
            file.write(entry['text'] + '\n')


Vid ID:  IPnHneSH1sg
Vid ID:  2BdgF_Fr6QU
Vid ID:  y_V1K9uTrH4
Vid ID:  aaBLGa2UIzc
Vid ID:  Vpq7Ax_rPds
Vid ID:  SgT8FBuMx7A
Vid ID:  yfRkdbw-cOM
Vid ID:  XEXWPykXJ5w
Vid ID:  wbk-QGDaM7Q
Vid ID:  5RmEKh83Etg
Vid ID:  EiHMVX0iV0A
Vid ID:  kq3NsBBYSUA
Vid ID:  Clfeis1avy0
Vid ID:  5J4651_oo3Y
Vid ID:  VsSTA8X0E-c
Vid ID:  6euomDxdHsY
Vid ID:  e1uFNh8LXAg
Vid ID:  s8vecsTUDzU
Vid ID:  HWsNgDF07h4
Vid ID:  NCfrDa5qSSs
Vid ID:  HoU95Sdmag4
Vid ID:  IysivVkJ_ig
Vid ID:  if3CC51JiSA
Vid ID:  5FUfNQ3yGmU
Vid ID:  k0YJ-nxLdjY
Vid ID:  sJLPMDG8UBc
Vid ID:  pm5UACgH3TA
Vid ID:  KGSdOg36pWI
Vid ID:  I4xd_R_tyo8
Vid ID:  k8ou872fyjI
Vid ID:  v2WQr1z7puA
Vid ID:  A8bvySo1UnI
Vid ID:  uSHfgmf-CRE
Vid ID:  soR7LbGu390
Vid ID:  qxyt12dFq9E
Vid ID:  XPcEzzScN9E
Vid ID:  0E_fcrq52ws
Vid ID:  nzNL0b4d_WY
Vid ID:  e0xgWk563G8
Vid ID:  DB9KHoBPEyU
Vid ID:  zrlL-TsRn_I
Vid ID:  L0JdIqO9MzY
Vid ID:  BTiv0jCsZqk
Vid ID:  eyfWIO2hA7M
Vid ID:  RLucCCdaX0c
Vid ID:  vE00tOXtlTw
Vid ID:  atC3XHP7lgM
Vid ID:  O_iL

## We then clean the data of unnecessary characters and empty lines.


In [61]:
import re

def clean_text(text):
    """Strip caption markup from ``text`` and drop blank lines.

    Three passes are applied in order:
      1. every ``[...]`` span that opens and closes on the same line is removed;
      2. every ``>`` character is removed;
      3. lines that are empty or whitespace-only are discarded.

    Returns:
        The remaining lines joined by a single newline (no trailing newline).
    """
    without_brackets = re.sub(r'\[.*?\]', '', text)
    without_markers = without_brackets.replace('>', '')
    kept_lines = (line for line in without_markers.splitlines() if line.strip())
    return "\n".join(kept_lines)

def process_file(input_file, output_file, encoding='utf-8'):
    """Clean the text in ``input_file`` and write the result to ``output_file``.

    Args:
        input_file: path of the text file to read.
        output_file: path of the file to (over)write with the cleaned text.
        encoding: text encoding used for both files. Defaults to UTF-8, which
            is how the transcript file is written and how the cleaned corpus
            is read elsewhere in this notebook; relying on the platform
            default could mis-decode the text on non-UTF-8 systems.
    """
    # Read the input file
    with open(input_file, 'r', encoding=encoding) as source:
        text = source.read()

    # Clean the text
    cleaned_text = clean_text(text)

    # Write the cleaned text to the output file
    with open(output_file, 'w', encoding=encoding) as target:
        target.write(cleaned_text)

# Raw transcript text goes in, cleaned corpus comes out
input_file, output_file = 'transcriptedtext.txt', 'cleantranscriptedtext.txt'

# Run the cleaning step on the files named above
process_file(input_file=input_file, output_file=output_file)


# Step 2: Tokenization of Corpus:

## We use the NLTK library to split the corpus into sentence tokens, based on sentence grammar and punctuation.

In [64]:
import nltk
from nltk.tokenize import sent_tokenize

# Make sure the 'punkt' tokenizer data is available locally
nltk.download('punkt')

# Read the whole cleaned corpus into memory as a single string
with open('cleantranscriptedtext.txt', 'r', encoding='utf-8') as corpus_file:
    corpus = corpus_file.read()

# Break the corpus up into a list of sentences
sentences = sent_tokenize(corpus)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Step 3: Using Hugging Face Transformers:

## We use AutoModelForQuestionAnswering and AutoTokenizer from transformers to load the pre-trained model distilbert-base-uncased-distilled-squad and apply it to our context. Our context here is the Corpus.

In [65]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

# Checkpoint used for both the tokenizer and the question-answering model
model_name = "distilbert-base-uncased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Wrap the loaded model and tokenizer in a question-answering pipeline
qa_pipeline = pipeline(task='question-answering', model=model, tokenizer=tokenizer)

In [66]:
def answer_question(question, context):
    """Answer ``question`` from ``context`` by decoding the model's best span.

    Uses the notebook-level ``tokenizer`` and ``model`` directly (without the
    pipeline): the pair is tokenized, the model scores every token as a
    possible answer start/end, and the tokens between the highest-scoring
    start and end are turned back into a string.

    Args:
        question: the question text.
        context: the text in which to look for the answer. It is truncated
            when question + context exceed the tokenizer's maximum length, so
            only the beginning of a long context is searched.

    Returns:
        The decoded answer string; empty when the predicted end position does
        not come after the predicted start position.
    """
    # Local import: torch is used here but is not imported at notebook level.
    import torch

    inputs = tokenizer(
        question,
        context,
        add_special_tokens=True,
        truncation="only_second",  # shorten the context, never the question
        return_tensors="pt",
    )

    # Inference only, so gradient tracking is not needed
    with torch.no_grad():
        outputs = model(**inputs)

    # Read the logits by attribute; unpacking the output object as a tuple
    # does not yield the start/end tensors on current transformers versions.
    answer_start = int(torch.argmax(outputs.start_logits))
    answer_end = int(torch.argmax(outputs.end_logits)) + 1  # end is inclusive

    answer_ids = inputs["input_ids"][0][answer_start:answer_end].tolist()
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(answer_ids))
    return answer

def chatbot(question, corpus):
    """Return the answer text the QA pipeline extracts from ``corpus``.

    Args:
        question: the question to ask.
        corpus: the text handed to the pipeline as the context.

    Returns:
        The ``'answer'`` field of the pipeline's response.
    """
    qa_input = {'question': question, 'context': corpus}
    prediction = qa_pipeline(qa_input)
    return prediction['answer']


# Step 4: Testing:

In [67]:
# Prompt for a question and print the answer extracted from the corpus
question = input("\nQuestion: ")
print("Answer: ", chatbot(question, corpus))


Question: How is the weather?
Answer:  ICE BREAKER


# Corpus Analysis/Overview

In [69]:
import nltk
from collections import Counter
from nltk.tokenize import word_tokenize

# Download the 'punkt' NLTK data package before tokenizing. It is also
# downloaded in the tokenization step, so this reports "already up-to-date"
# when the earlier cell has been run.
nltk.download('punkt')

def analyze_corpus(text):
    """Compute basic token statistics for ``text``.

    The text is split with ``word_tokenize``; tokens are counted exactly as
    produced (no lower-casing or filtering is applied here).

    Returns:
        A dict with:
          - ``'total_tokens'``: number of tokens;
          - ``'unique_words'``: number of distinct tokens;
          - ``'vocabulary'``: the set of distinct tokens;
          - ``'num_hapax_legomena'``: number of tokens occurring exactly once;
          - ``'word_frequencies'``: a ``Counter`` mapping token -> count.
    """
    tokens = word_tokenize(text)

    # One counting pass gives the frequencies; the vocabulary is its key set
    word_freq = Counter(tokens)
    vocabulary = set(word_freq)

    # Hapax legomena: tokens seen exactly once in the whole text
    num_hapax_legomena = sum(1 for count in word_freq.values() if count == 1)

    return {
        'total_tokens': len(tokens),
        'unique_words': len(vocabulary),
        'vocabulary': vocabulary,
        'num_hapax_legomena': num_hapax_legomena,
        'word_frequencies': word_freq
    }

# Read the cleaned corpus. The encoding is given explicitly so the file is
# decoded as UTF-8, matching how the tokenization step reads the same file,
# instead of depending on the platform default encoding.
input_file = 'cleantranscriptedtext.txt'
with open(input_file, 'r', encoding='utf-8') as file:
    text = file.read()

# Analyze the corpus
results = analyze_corpus(text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [72]:
# Assemble the report lines first, then print them in order. Entries ending
# in '\n' are followed by a blank line in the output.
report_lines = (
    f"Total Tokens: {results['total_tokens']}\n",
    f"Unique Words (Types): {results['unique_words']}\n",
    f"Number of Hapax Legomena: {results['num_hapax_legomena']}\n",
    f"Vocabulary: {results['vocabulary']}\n",
    f"Word Frequencies: {results['word_frequencies']}",
)
for report_line in report_lines:
    print(report_line)

Total Tokens: 101619

Unique Words (Types): 8783

Number of Hapax Legomena: 4225


