In [2]:
from googleapiclient.discovery import build
from youtube_transcript_api import YouTubeTranscriptApi
import pandas as pd
import re
import os

import numpy as np
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize
import nltk
import spacy


In [4]:
from dotenv import load_dotenv

# Load YouTube API key
load_dotenv()

api_key = os.getenv("API_KEY")

# Initialize YouTube API client
youtube = build('youtube', 'v3', developerKey=api_key)

In [5]:
# Extract the channel ID's from the channel names
def get_channel_ids(channel_names):
    channel_data = []
    
    for channel_name in channel_names:
        request = youtube.search().list(
            q=channel_name,
            type='channel',
            part='id',
            maxResults=1
        )
        response = request.execute()
        
        if 'items' in response:
            channel_id = response['items'][0]['id']['channelId']
            channel_data.append({'channel_name': channel_name, 'channel_id': channel_id})
        else:
            print(f"Could not find channel ID for '{channel_name}'")
    
    df_channels = pd.DataFrame(channel_data)
    return df_channels

In [7]:
channel_names = [
    "TheArmbandFPL",
    "elitefpl",
    "fantasyfootballfixYT",
    "FFScout_",
    "AboveAverageFPL",
    "fplbanger",
    "fplblackbox",
    "FPLFocal"
]

channel_ids = df_channels = get_channel_ids(channel_names)

print(channel_ids)

           channel_name                channel_id
0         TheArmbandFPL  UC4UdmU9tNnU5iQVmQB3Ngvg
1              elitefpl  UCOhHIQyQg4dNKvWg0tg12zg
2  fantasyfootballfixYT  UC0Oaf88gRGnNkncI8D_GO-Q
3              FFScout_  UCKxYKQ8pgJ7V8wwh4hLsSXQ
4       AboveAverageFPL  UCnaJiRMf5hju0TlaeGK5CDQ
5             fplbanger  UC1dzUZYYluvh8ktUYFYk8PA
6           fplblackbox  UCGJ8-xqhOLwyJNuPMsVoQWQ
7              FPLFocal  UC72QokPHXQ9r98ROfNZmaDw


In [8]:
# Function to get the most recent video IDs from a channel
def get_channel_videos(df_channels, published_after, max_results=10):
    all_videos = []
    
    for _, row in df_channels.iterrows():
        channel_id = row['channel_id']
        channel_name = row['channel_name']
        videos = []
        next_page_token = None
        
        while True:
            request = youtube.search().list(
                part="id,snippet",
                channelId=channel_id,
                type="video",
                order="date",
                publishedAfter=published_after,
                maxResults=max_results,
                pageToken=next_page_token
            )
            response = request.execute()
            
            for item in response['items']:
                video_id = item['id']['videoId']
                title = item['snippet']['title']
                published_at = item['snippet']['publishedAt']
                videos.append({
                    'channel_name': channel_name,
                    'id': video_id,
                    'title': title,
                    'published_at': published_at
                })
                
                if len(videos) >= max_results:
                    break
            
            if len(videos) >= max_results:
                break
            
            next_page_token = response.get('nextPageToken')
            if not next_page_token:
                break
        
        all_videos.extend(videos)
    
    return pd.DataFrame(all_videos)


In [9]:
# Extract the last n video ID's for each of the youtube accounts
published_after = "2023-09-01T00:00:00Z" 
df_videos = get_channel_videos(df_channels, published_after, max_results=5)

print(df_videos)

            channel_name           id  \
0          TheArmbandFPL  Ah-t8j0wmsQ   
1          TheArmbandFPL  hlWD3N7iohY   
2          TheArmbandFPL  RtkI_Txs8zk   
3          TheArmbandFPL  dAZs-gQkk8A   
4          TheArmbandFPL  xQVDXFka6e8   
5               elitefpl  r3VkzwaZnSE   
6               elitefpl  tZOBHSM-g5Q   
7               elitefpl  Fvgtyurnu8k   
8               elitefpl  Xy2_A7r1yKA   
9               elitefpl  qbcn_YvFbkI   
10  fantasyfootballfixYT  JcNY6U3xW7A   
11  fantasyfootballfixYT  4nszN8HLqtI   
12  fantasyfootballfixYT  yPDDg71oLxs   
13  fantasyfootballfixYT  EUwZIkxvLtA   
14  fantasyfootballfixYT  9H5v1nH6pAY   
15              FFScout_  rJ6v57K7sgQ   
16              FFScout_  W9xLUbv_-_o   
17              FFScout_  sqQX_ZPoc0g   
18              FFScout_  zf44PnRKGig   
19              FFScout_  ZNU7iPNPGMg   
20       AboveAverageFPL  mEm14WbHwXc   
21       AboveAverageFPL  rVXFYd-BB98   
22       AboveAverageFPL  oFHBu_Hyn4I   
23       AboveAv

In [10]:
# Function to grab the transcripts if they exist
def get_transcripts(df_videos):
    transcripts = []
    for _, row in df_videos.iterrows():
        video_id = row['id']
        try:
            transcript = YouTubeTranscriptApi.get_transcript(video_id)
            full_transcript = ' '.join([entry['text'] for entry in transcript])
            transcripts.append(full_transcript)
        except Exception as e:
            transcripts.append(None)
            print(f"Error getting transcript for video {video_id}: {str(e)}")
    return transcripts

# Function to calculate word count
def word_count(transcript):
    if transcript:
        return len(transcript.split())  # Split by whitespace to count words
    return 0

In [11]:
# Fetch transcripts for df_videos
transcripts = get_transcripts(df_videos)

# Add transcripts to the DataFrame
df_videos['transcript'] = transcripts

# Add word count of the transcripts
df_videos['word_count'] = df_videos['transcript'].apply(word_count)

# Only consider transripts with more than 1000 words
df_videos = df_videos[df_videos['word_count'] >= 1000]

# Display the first few rows to verify
print(df_videos[['channel_name', 'id', 'title', 'word_count', 'transcript']])

Error getting transcript for video mEm14WbHwXc: 
Could not retrieve a transcript for the video https://www.youtube.com/watch?v=mEm14WbHwXc! This is most likely caused by:

Subtitles are disabled for this video

If you are sure that the described cause is not responsible for this error and that a transcript should be retrievable, please create an issue at https://github.com/jdepoix/youtube-transcript-api/issues. Please add which version of youtube_transcript_api you are using and provide the information needed to replicate the error. Also make sure that there are no open issues which already describe your problem!
            channel_name           id  \
0          TheArmbandFPL  Ah-t8j0wmsQ   
1          TheArmbandFPL  hlWD3N7iohY   
2          TheArmbandFPL  RtkI_Txs8zk   
3          TheArmbandFPL  dAZs-gQkk8A   
4          TheArmbandFPL  xQVDXFka6e8   
5               elitefpl  r3VkzwaZnSE   
6               elitefpl  tZOBHSM-g5Q   
7               elitefpl  Fvgtyurnu8k   
8         

In [12]:
# Function to split transcript into manageable chunks based on token count
def split_into_chunks(transcript, max_tokens=4000):
    # Tokenize the transcript
    doc = nlp(transcript)
    chunks = []
    current_chunk = []
    current_tokens = 0

    for sentence in doc.sents:
        sentence_tokens = len(sentence.orth_.split())

        if current_tokens + sentence_tokens > max_tokens:
            # If the current chunk exceeds the limit, start a new one
            chunks.append(' '.join([str(sent) for sent in current_chunk]))
            current_chunk = [sentence.text]
            current_tokens = sentence_tokens
        else:
            current_chunk.append(sentence.text)
            current_tokens += sentence_tokens

    # Append any remaining chunk
    if current_chunk:
        chunks.append(' '.join([str(sent) for sent in current_chunk]))

    return chunks

# Preprocess transcripts to reduce token count
def preprocess_transcript(text, max_tokens = 30000):
    # Convert to lowercase
    text = text.lower()
    
    # Remove text in square brackets
    text = re.sub(r'\[.*?\]', '', text)
    
    # Remove filler words
    filler_words = r'\b(um|uh|like|you know|i mean|sort of|kind of)\b'
    text = re.sub(filler_words, '', text)
    
    # Remove repeated words
    text = re.sub(r'\b(\w+)( \1\b)+', r'\1', text)
    
    # Remove speaker identifications (assuming format "Name:")
    text = re.sub(r'\w+:', '', text)
    
    # Simplify large numbers
    text = re.sub(r'\b(\d+) thousand\b', r'\1k', text)
    text = re.sub(r'\b(\d+) million\b', r'\1m', text)
    
    # Remove unnecessary punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # Remove extra whitespace
    text = ' '.join(text.split())
    
    # Use abbreviations for common terms
    abbreviations = {
        'fantasy premier league': 'fpl',
        'gameweek': 'gw',
        'manchester united': 'man utd',
        'manchester city': 'man city'
    }
    for full, abbr in abbreviations.items():
        text = re.sub(r'\b' + full + r'\b', abbr, text)
    
    # Simplify season references
    text = re.sub(r'\d{4}/\d{4}\s+season', 'last season', text)
    
    # Simplify player names (example for Haaland)
    text = re.sub(r'\berling haaland\b', 'haaland', text)
    
    # Remove or simplify time references
    text = re.sub(r'\b\d+ minutes left\b', 'almost out of time', text)
    
    # Remove references to external content
    text = re.sub(r'(check out|tune in to|watch) .+', '', text)
    
    return text #split_into_chunks(text, max_tokens)

In [15]:
df_videos['transcript_chunks'] = df_videos['transcript'].apply(preprocess_transcript)
df_videos['word_count_process'] = df_videos['transcript_chunks'].apply(word_count)

In [16]:
# Summary of processing reduction effeciency
summary = df_videos.groupby('channel_name').agg({
    'word_count': ['mean', 'median', 'min', 'max', 'sum', 'count'],
    'word_count_process': ['mean', 'median', 'min', 'max', 'sum']
}).reset_index()

# Flatten the column names
summary.columns = ['_'.join(col).strip() for col in summary.columns.values]

# Rename the count column to 'num_videos'
summary = summary.rename(columns={'word_count_count': 'num_videos'})

# Calculate the percentage reduction in word count
summary['percent_reduction'] = (1 - summary['word_count_process_sum'] / summary['word_count_sum']) * 100

# Sort by number of videos, descending
summary = summary.sort_values('num_videos', ascending=False)

print(summary)

          channel_name_  word_count_mean  word_count_median  word_count_min  \
1              FFScout_      8891.400000             9225.0            5000   
3         TheArmbandFPL     10109.200000            10355.0            8926   
5  fantasyfootballfixYT      3505.200000             3451.0            2281   
7           fplblackbox     17130.800000            18913.0            3455   
2              FPLFocal      6768.500000             2152.0            2021   
0       AboveAverageFPL     14610.000000            16072.0            9968   
4              elitefpl     10251.333333             9695.0            9184   
6             fplbanger      7693.000000             7693.0            3850   

   word_count_max  word_count_sum  num_videos  word_count_process_mean  \
1           12982           44457           5                  2130.40   
3           11488           50546           5                  6679.40   
5            5469           17526           5                  273

In [None]:
#df_videos.to_csv(f'../output/transcripts_{published_after}.csv')

In [17]:
def save_transcripts_as_csv(df, output_dir):
    """
    Save transcripts from DataFrame as individual CSV files
    
    Parameters:
    df (pandas.DataFrame): DataFrame containing the transcripts
    output_dir (str): Directory to save the CSV files
    """
    
    # Ensure the DataFrame has the necessary columns
    required_columns = ['channel_name', 'published_at', 'transcript_chunks']
    if not all(col in df.columns for col in required_columns):
        raise ValueError(f"DataFrame is missing one or more required columns: {required_columns}")
    
    # Iterate through the DataFrame and save each transcript as a CSV file
    for _, row in df.iterrows():
        channel_name = row['channel_name']
        episode_number = row['published_at']
        transcript = row['transcript_chunks']
        
        # Create the filename
        filename = f"{channel_name}_{episode_number}.csv"
        file_path = os.path.join(output_dir, filename)
        
        # Create a new DataFrame with just the transcript
        transcript_df = pd.DataFrame({'transcript': [transcript]})
        
        # Save the transcript as a CSV file
        transcript_df.to_csv(file_path, index=False)
        
        print(f"Saved: {filename}")

output_directory = "../output/"
save_transcripts_as_csv(df_videos, output_directory)

Saved: TheArmbandFPL_2024-08-29T05:03:00Z.csv
Saved: TheArmbandFPL_2024-08-22T05:11:46Z.csv
Saved: TheArmbandFPL_2024-08-16T05:41:11Z.csv
Saved: TheArmbandFPL_2024-08-09T06:04:17Z.csv
Saved: TheArmbandFPL_2024-07-18T05:02:30Z.csv
Saved: elitefpl_2024-09-05T21:10:41Z.csv
Saved: elitefpl_2024-09-01T20:40:54Z.csv
Saved: elitefpl_2024-08-31T20:33:06Z.csv
Saved: fantasyfootballfixYT_2024-09-06T12:00:31Z.csv
Saved: fantasyfootballfixYT_2024-09-05T10:00:04Z.csv
Saved: fantasyfootballfixYT_2024-08-30T12:01:10Z.csv
Saved: fantasyfootballfixYT_2024-08-28T11:00:21Z.csv
Saved: fantasyfootballfixYT_2024-08-26T11:00:12Z.csv
Saved: FFScout__2024-09-06T16:30:09Z.csv
Saved: FFScout__2024-09-05T18:35:50Z.csv
Saved: FFScout__2024-09-04T12:27:42Z.csv
Saved: FFScout__2024-09-03T21:33:35Z.csv
Saved: FFScout__2024-09-03T12:40:00Z.csv
Saved: AboveAverageFPL_2024-09-04T18:27:04Z.csv
Saved: AboveAverageFPL_2024-09-01T21:27:37Z.csv
Saved: AboveAverageFPL_2024-08-29T18:16:03Z.csv
Saved: fplbanger_2024-08-28T13:14

In [None]:
import torch
from transformers import BartForConditionalGeneration, BartTokenizer

def summarize_with_bart(text, max_length=150, min_length=50):
    # Check if MPS is available and set the device
    if torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
        device = torch.device("cpu")
    print(f"Using device: {device}")

    try:
        # Load pre-trained model and tokenizer
        model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn').to(device)
        tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

        # Tokenize the input text
        inputs = tokenizer([text], max_length=1024, return_tensors='pt', truncation=True)
        
        # Move input tensors to the correct device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Generate summary
        summary_ids = model.generate(inputs['input_ids'],
                                     num_beams=4,
                                     max_length=max_length,
                                     min_length=min_length,
                                     length_penalty=2.0,
                                     early_stopping=True)

        # Decode the generated summary
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        return summary

    except Exception as e:
        print(f"An error occurred during summarization: {str(e)}")
        return None

In [None]:
df = df_videos
summary = summarize_with_bart(df['transcript_chunks'])
if summary:
    print(summary)
else:
    print("Summarization failed. Please check the error message above.")

In [18]:
import os
import json
from collections import defaultdict
from anthropic import Anthropic

# Initialize the Anthropic client
anthropic = Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))

def summarize_transcript(transcript):
    """
    Summarize a single transcript using Claude API
    """
    prompt = f"""
    Please summarize the following Fantasy Premier League podcast transcript. 
    Focus on the key points, player recommendations, and strategy advice.
    Limit the summary to 3-5 bullet points.

    Transcript:
    {transcript}

    Summary:
    """

    response = anthropic.messages.create(
        model="claude-3-5-sonnet-20240620",
        max_tokens=1000,
        temperature=0.7,
        system="You are an expert in Fantasy Premier League and podcast summarization.",
        messages=[
            {"role": "user", "content": prompt}
        ]
    )

    return response.content

def process_transcripts(transcripts_dir):
    """
    Process all transcripts in the given directory and group summaries by channel name
    """
    summaries = defaultdict(list)

    for filename in os.listdir(transcripts_dir):
        if filename.endswith(".csv"):
            channel_name = filename.split("_")[0]  # Assuming filename format: channelname_episode.txt
            
            with open(os.path.join(transcripts_dir, filename), "r") as file:
                transcript = file.read()
            
            summary = summarize_transcript(transcript)
            summaries[channel_name].append(summary)

    return summaries

def save_summaries(summaries, output_file):
    """
    Save the grouped summaries to a JSON file
    """
    with open(output_file, "w") as f:
        json.dump(summaries, f, indent=2)

In [None]:
transcripts_dir = "../output/"
output_file = "fpl_podcast_summaries.json"

summaries = process_transcripts(transcripts_dir)

In [22]:
import json

def convert_to_serializable(obj):
    """
    Convert non-serializable objects to serializable format.
    """
    if isinstance(obj, (str, int, float, bool, type(None))):
        return obj
    elif isinstance(obj, (list, tuple)):
        return [convert_to_serializable(item) for item in obj]
    elif isinstance(obj, dict):
        return {key: convert_to_serializable(value) for key, value in obj.items()}
    else:
        # For any other type, convert to string
        return str(obj)

def save_summaries(summaries, output_file):
    """
    Save the grouped summaries to a JSON file
    """
    serializable_summaries = convert_to_serializable(summaries)
    with open(output_file, "w", encoding='utf-8') as f:
        json.dump(serializable_summaries, f, indent=2, ensure_ascii=False)
    print(f"Summaries saved to {output_file}")

In [23]:
save_summaries(summaries, "fpl_podcast_summaries.json")

Summaries saved to fpl_podcast_summaries.json
