In [1]:
from googleapiclient.discovery import build
from youtube_transcript_api import YouTubeTranscriptApi
import pandas as pd
import re
import os

import numpy as np
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize
import nltk
import spacy


In [2]:
from dotenv import load_dotenv

# Load YouTube API key
load_dotenv()

api_key = os.getenv("API_KEY")

# Initialize YouTube API client
youtube = build('youtube', 'v3', developerKey=api_key)

In [3]:
# Extract the channel ID's from the channel names
def get_channel_ids(channel_names):
    channel_data = []
    
    for channel_name in channel_names:
        request = youtube.search().list(
            q=channel_name,
            type='channel',
            part='id',
            maxResults=1
        )
        response = request.execute()
        
        if 'items' in response:
            channel_id = response['items'][0]['id']['channelId']
            channel_data.append({'channel_name': channel_name, 'channel_id': channel_id})
        else:
            print(f"Could not find channel ID for '{channel_name}'")
    
    df_channels = pd.DataFrame(channel_data)
    return df_channels

In [5]:
channel_names = [
    "TheArmbandFPL",
    "elitefpl",
    "fantasyfootballfixYT",
    "FFScout_",
    "AboveAverageFPL",
    "fplbanger",
    "fplblackbox",
    "FPLFocal",
    "alwayscheating",
    "FMLFPL",
    "FPLBlackBox"
]

channel_ids = df_channels = get_channel_ids(channel_names)

print(channel_ids)

            channel_name                channel_id
0          TheArmbandFPL  UC4UdmU9tNnU5iQVmQB3Ngvg
1               elitefpl  UCOhHIQyQg4dNKvWg0tg12zg
2   fantasyfootballfixYT  UC0Oaf88gRGnNkncI8D_GO-Q
3               FFScout_  UCKxYKQ8pgJ7V8wwh4hLsSXQ
4        AboveAverageFPL  UCnaJiRMf5hju0TlaeGK5CDQ
5              fplbanger  UC1dzUZYYluvh8ktUYFYk8PA
6            fplblackbox  UCGJ8-xqhOLwyJNuPMsVoQWQ
7               FPLFocal  UC72QokPHXQ9r98ROfNZmaDw
8         alwayscheating  UChLRgtHvvYCXWwJFDWmpv8Q
9                 FMLFPL  UCZikELJczbLKc_40syGKyxg
10           FPLBlackBox  UCGJ8-xqhOLwyJNuPMsVoQWQ


In [6]:
# Function to get the most recent video IDs from a channel
def get_channel_videos(df_channels, published_after, max_results=10):
    all_videos = []
    
    for _, row in df_channels.iterrows():
        channel_id = row['channel_id']
        channel_name = row['channel_name']
        videos = []
        next_page_token = None
        
        while True:
            request = youtube.search().list(
                part="id,snippet",
                channelId=channel_id,
                type="video",
                order="date",
                publishedAfter=published_after,
                maxResults=max_results,
                pageToken=next_page_token
            )
            response = request.execute()
            
            for item in response['items']:
                video_id = item['id']['videoId']
                title = item['snippet']['title']
                published_at = item['snippet']['publishedAt']
                videos.append({
                    'channel_name': channel_name,
                    'id': video_id,
                    'title': title,
                    'published_at': published_at
                })
                
                if len(videos) >= max_results:
                    break
            
            if len(videos) >= max_results:
                break
            
            next_page_token = response.get('nextPageToken')
            if not next_page_token:
                break
        
        all_videos.extend(videos)
    
    return pd.DataFrame(all_videos)


In [7]:
# Extract the last n video ID's for each of the youtube accounts
published_after = "2024-09-01T00:00:00Z" 
df_videos = get_channel_videos(df_channels, published_after, max_results=5)

print(df_videos)

            channel_name           id  \
0          TheArmbandFPL  kqgIuTxdrdo   
1          TheArmbandFPL  GUVhMwx6oMI   
2               elitefpl  pbpK3bPss9Q   
3               elitefpl  r3VkzwaZnSE   
4               elitefpl  tZOBHSM-g5Q   
5   fantasyfootballfixYT  dOewUlqqdAk   
6   fantasyfootballfixYT  OyxDarrEgJ8   
7   fantasyfootballfixYT  JqYYXLEJpd4   
8   fantasyfootballfixYT  JcNY6U3xW7A   
9   fantasyfootballfixYT  4nszN8HLqtI   
10              FFScout_  jdSTS01bMSs   
11              FFScout_  Lr9B4aNvLn0   
12              FFScout_  jfq0eHtXsOQ   
13              FFScout_  mq6v-RP0IlY   
14              FFScout_  16_aM49v5Y4   
15       AboveAverageFPL  vqDCHu7ctdE   
16       AboveAverageFPL  Z0ZTwV5hkUE   
17       AboveAverageFPL  UZmioRRUiBc   
18       AboveAverageFPL  mEm14WbHwXc   
19       AboveAverageFPL  rVXFYd-BB98   
20             fplbanger  3duJT4qFGsk   
21           fplblackbox  Ymiink3oVcQ   
22           fplblackbox  l0oClUyq9nk   
23              

In [8]:
# Function to grab the transcripts if they exist
def get_transcripts(df_videos):
    transcripts = []
    for _, row in df_videos.iterrows():
        video_id = row['id']
        try:
            transcript = YouTubeTranscriptApi.get_transcript(video_id, languages = ['en'])
            full_transcript = ' '.join([entry['text'] for entry in transcript])
            transcripts.append(full_transcript)
        except Exception as e:
            transcripts.append(None)
            print(f"Error getting transcript for video {video_id}: {str(e)}")
    return transcripts

# Function to calculate word count
def word_count(transcript):
    if transcript:
        return len(transcript.split())  # Split by whitespace to count words
    return 0

In [12]:
# Fetch transcripts for df_videos
transcripts = get_transcripts(df_videos)

# Add transcripts to the DataFrame
df_videos['transcript'] = transcripts

# Add word count of the transcripts
df_videos['word_count'] = df_videos['transcript'].apply(word_count)

# Only consider transripts with more than 1000 words
df_videos = df_videos[df_videos['word_count'] >= 1000]

# Display the first few rows to verify
print(df_videos[['channel_name', 'id', 'title', 'word_count', 'transcript']])

Error getting transcript for video jdSTS01bMSs: 
Could not retrieve a transcript for the video https://www.youtube.com/watch?v=jdSTS01bMSs! This is most likely caused by:

Subtitles are disabled for this video

If you are sure that the described cause is not responsible for this error and that a transcript should be retrievable, please create an issue at https://github.com/jdepoix/youtube-transcript-api/issues. Please add which version of youtube_transcript_api you are using and provide the information needed to replicate the error. Also make sure that there are no open issues which already describe your problem!
Error getting transcript for video mEm14WbHwXc: no element found: line 1, column 0
            channel_name           id  \
0          TheArmbandFPL  kqgIuTxdrdo   
1          TheArmbandFPL  GUVhMwx6oMI   
2               elitefpl  pbpK3bPss9Q   
3               elitefpl  r3VkzwaZnSE   
4               elitefpl  tZOBHSM-g5Q   
5   fantasyfootballfixYT  dOewUlqqdAk   
6   fanta

In [13]:
# Function to split transcript into manageable chunks based on token count
def split_into_chunks(transcript, max_tokens=4000):
    # Tokenize the transcript
    doc = nlp(transcript)
    chunks = []
    current_chunk = []
    current_tokens = 0

    for sentence in doc.sents:
        sentence_tokens = len(sentence.orth_.split())

        if current_tokens + sentence_tokens > max_tokens:
            # If the current chunk exceeds the limit, start a new one
            chunks.append(' '.join([str(sent) for sent in current_chunk]))
            current_chunk = [sentence.text]
            current_tokens = sentence_tokens
        else:
            current_chunk.append(sentence.text)
            current_tokens += sentence_tokens

    # Append any remaining chunk
    if current_chunk:
        chunks.append(' '.join([str(sent) for sent in current_chunk]))

    return chunks

# Preprocess transcripts to reduce token count
def preprocess_transcript(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove text in square brackets
    text = re.sub(r'\[.*?\]', '', text)
    
    # Remove filler words
    filler_words = r'\b(basically|um|umm|uh|oh|yeah|actually|literally|obviously|you know|I mean|I guess|but you know|I suppose|or something|really|very much|sort of|kind of)\b'
    text = re.sub(filler_words, '', text, flags=re.IGNORECASE)
    
    # Remove repeated words
    text = re.sub(r'\b(\w+)( \1\b)+', r'\1', text)
    
    # Simplify large numbers
    text = re.sub(r'\b(\d+) thousand\b', r'\1k', text)
    text = re.sub(r'\b(\d+) million\b', r'\1m', text)
    
    # Remove unnecessary punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # Remove extra whitespace
    text = ' '.join(text.split())
    
    # Use abbreviations for common terms
    abbreviations = {
        'fantasy premier league': 'fpl',
        'gameweek': 'gw',
        'manchester united': 'man utd',
        'manchester city': 'man city'
    }
    for full, abbr in abbreviations.items():
        text = re.sub(r'\b' + full + r'\b', abbr, text)
    
    # Simplify season references
    text = re.sub(r'\d{4}/\d{4}\s+season', 'last season', text)
    
    # Simplify player names (example for Haaland)
    text = re.sub(r'\berling haaland\b', 'haaland', text)
    
    return text

In [14]:
df_videos['transcript_chunks'] = df_videos['transcript'].apply(preprocess_transcript)
df_videos['word_count_process'] = df_videos['transcript_chunks'].apply(word_count)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_videos['transcript_chunks'] = df_videos['transcript'].apply(preprocess_transcript)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_videos['word_count_process'] = df_videos['transcript_chunks'].apply(word_count)


In [15]:
# Summary of processing reduction effeciency
summary = df_videos.groupby('channel_name').agg({
    'word_count': ['mean', 'median', 'min', 'max', 'sum', 'count'],
    'word_count_process': ['mean', 'median', 'min', 'max', 'sum']
}).reset_index()

# Flatten the column names
summary.columns = ['_'.join(col).strip() for col in summary.columns.values]

# Rename the count column to 'num_videos'
summary = summary.rename(columns={'word_count_count': 'num_videos'})

# Calculate the percentage reduction in word count
summary['percent_reduction'] = (1 - summary['word_count_process_sum'] / summary['word_count_sum']) * 100

# Sort by number of videos, descending
summary = summary.sort_values('num_videos', ascending=False)

print(summary)

           channel_name_  word_count_mean  word_count_median  word_count_min  \
8   fantasyfootballfixYT           3881.0             3025.0            2772   
6         alwayscheating          15940.5            15981.0           13413   
7               elitefpl          10317.0             9695.0            9184   
0        AboveAverageFPL          14401.0            14401.0           12730   
1               FFScout_           9945.5             9945.5            9890   
3            FPLBlackBox          15252.0            15252.0            2607   
4               FPLFocal           2209.0             2209.0            1990   
5          TheArmbandFPL          13227.0            13227.0           13160   
10           fplblackbox          15252.0            15252.0            2607   
2                 FMLFPL          18514.0            18514.0           18514   
9              fplbanger          10304.0            10304.0           10304   

    word_count_max  word_count_sum  num

In [16]:
df_videos.to_csv(f'../output/transcripts_{published_after}.csv')

In [17]:
def save_transcripts_as_csv(df, output_dir):
    """
    Save transcripts from DataFrame as individual CSV files
    
    Parameters:
    df (pandas.DataFrame): DataFrame containing the transcripts
    output_dir (str): Directory to save the CSV files
    """
    
    # Ensure the DataFrame has the necessary columns
    required_columns = ['channel_name', 'published_at', 'transcript_chunks']
    if not all(col in df.columns for col in required_columns):
        raise ValueError(f"DataFrame is missing one or more required columns: {required_columns}")
    
    # Iterate through the DataFrame and save each transcript as a CSV file
    for _, row in df.iterrows():
        channel_name = row['channel_name']
        episode_number = row['published_at']
        transcript = row['transcript_chunks']
        
        # Create the filename
        filename = f"{channel_name}_{episode_number}.csv"
        file_path = os.path.join(output_dir, filename)
        
        # Create a new DataFrame with just the transcript
        transcript_df = pd.DataFrame({'transcript': [transcript]})
        
        # Save the transcript as a CSV file
        transcript_df.to_csv(file_path, index=False)
        
        print(f"Saved: {filename}")

output_directory = "../output/transcripts/"
save_transcripts_as_csv(df_videos, output_directory)

Saved: TheArmbandFPL_2024-09-12T05:30:55Z.csv
Saved: TheArmbandFPL_2024-09-11T18:29:52Z.csv
Saved: elitefpl_2024-09-12T06:59:47Z.csv
Saved: elitefpl_2024-09-05T21:10:41Z.csv
Saved: elitefpl_2024-09-01T20:40:54Z.csv
Saved: fantasyfootballfixYT_2024-09-13T09:33:25Z.csv
Saved: fantasyfootballfixYT_2024-09-12T09:45:01Z.csv
Saved: fantasyfootballfixYT_2024-09-11T10:49:37Z.csv
Saved: fantasyfootballfixYT_2024-09-06T12:00:31Z.csv
Saved: fantasyfootballfixYT_2024-09-05T10:00:04Z.csv
Saved: FFScout__2024-09-13T09:53:21Z.csv
Saved: FFScout__2024-09-12T16:13:02Z.csv
Saved: AboveAverageFPL_2024-09-12T21:17:58Z.csv
Saved: AboveAverageFPL_2024-09-04T18:27:04Z.csv
Saved: fplbanger_2024-09-10T09:54:46Z.csv
Saved: fplblackbox_2024-09-12T23:01:54Z.csv
Saved: fplblackbox_2024-09-08T21:18:52Z.csv
Saved: FPLFocal_2024-09-13T12:23:02Z.csv
Saved: FPLFocal_2024-09-12T10:49:50Z.csv
Saved: alwayscheating_2024-09-09T13:32:22Z.csv
Saved: alwayscheating_2024-09-09T03:05:50Z.csv
Saved: alwayscheating_2024-09-03T12:

In [None]:
import torch
from transformers import BartForConditionalGeneration, BartTokenizer

def summarize_with_bart(text, max_length=150, min_length=50):
    # Check if MPS is available and set the device
    if torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
        device = torch.device("cpu")
    print(f"Using device: {device}")

    try:
        # Load pre-trained model and tokenizer
        model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn').to(device)
        tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

        # Tokenize the input text
        inputs = tokenizer([text], max_length=1024, return_tensors='pt', truncation=True)
        
        # Move input tensors to the correct device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Generate summary
        summary_ids = model.generate(inputs['input_ids'],
                                     num_beams=4,
                                     max_length=max_length,
                                     min_length=min_length,
                                     length_penalty=2.0,
                                     early_stopping=True)

        # Decode the generated summary
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        return summary

    except Exception as e:
        print(f"An error occurred during summarization: {str(e)}")
        return None

In [None]:
# disabled for now, summarising with Claude 3.5
#summary = summarize_with_bart(df['transcript_chunks'])

In [14]:
import os
import json
from collections import defaultdict
from anthropic import Anthropic

# Initialize the Anthropic client
anthropic = Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))

def summarize_transcript(transcript):
    """
    Summarize a single transcript using Claude API
    """
    prompt = f"""
    Please summarize the following Fantasy Premier League podcast transcript. 
    Focus on the key points, player recommendations, and strategy advice.
    Limit the summary to 3-5 bullet points.

    Transcript:
    {transcript}

    Summary:
    """

    response = anthropic.messages.create(
        model="claude-3-5-sonnet-20240620",
        max_tokens=1000,
        temperature=0.5,
        system="You are an expert in Fantasy Premier League and podcast summarization.",
        messages=[
            {"role": "user", "content": prompt}
        ]
    )

    return response.content

def process_transcripts(transcripts_dir):
    """
    Process all transcripts in the given directory and group summaries by channel name
    """
    summaries = defaultdict(list)

    for filename in os.listdir(transcripts_dir):
        if filename.endswith(".csv"):
            channel_name = filename.split("_")[0]
            
            with open(os.path.join(transcripts_dir, filename), "r") as file:
                transcript = file.read()
            
            summary = summarize_transcript(transcript)
            summaries[channel_name].append(summary)

    return summaries

def save_summaries(summaries, output_file):
    """
    Save the grouped summaries to a JSON file
    """
    with open(output_file, "w") as f:
        json.dump(summaries, f, indent=2)

In [15]:
transcripts_dir = "../output/transcripts/"
output_file = "fpl_podcast_summaries.json"

summaries = process_transcripts(transcripts_dir)

In [16]:
import json

def convert_to_serializable(obj):
    """
    Convert non-serializable objects to serializable format.
    """
    if isinstance(obj, (str, int, float, bool, type(None))):
        return obj
    elif isinstance(obj, (list, tuple)):
        return [convert_to_serializable(item) for item in obj]
    elif isinstance(obj, dict):
        return {key: convert_to_serializable(value) for key, value in obj.items()}
    else:
        # For any other type, convert to string
        return str(obj)

def save_summaries(summaries, output_file):
    """
    Save the grouped summaries to a JSON file
    """
    serializable_summaries = convert_to_serializable(summaries)
    with open(output_file, "w", encoding='utf-8') as f:
        json.dump(serializable_summaries, f, indent=2, ensure_ascii=False)
    print(f"Summaries saved to {output_file}")

In [17]:
save_summaries(summaries, "../output/fpl_podcast_summaries.json")

Summaries saved to ../output/fpl_podcast_summaries.json


In [19]:
def clean_text(text):
    # Remove TextBlock wrapper and unescape newlines
    text = re.sub(r'TextBlock\(text=|,\s*type=\'text\'\)', '', text)
    text = text.strip("'")
    # Replace escaped newlines with actual newlines
    text = text.replace('\\n', '\n')
    # Remove the introductory sentence
    pattern = r"Here's a summary of the key points from the Fantasy Premier League podcast transcript:[\s\n]*"
    text = re.sub(pattern, '', text, flags=re.IGNORECASE)
    return text.strip()

def is_summarizable(text):
    unsummarizable_patterns = [
        "I apologize, but I cannot provide a meaningful summary",
        "I'm sorry, but I can't provide a summary",
        "Unable to provide a summary"
    ]
    cleaned_text = clean_text(text).lower()
    return not any(pattern.lower() in cleaned_text for pattern in unsummarizable_patterns)

def filter_json(json_data):
    filtered_data = {}
    for channel, summaries in json_data.items():
        filtered_summaries = [summary for summary in summaries if is_summarizable(summary[0])]
        if filtered_summaries:
            filtered_data[channel] = filtered_summaries
    return filtered_data

def json_to_markdown(json_data):
    markdown = ""
    
    for channel, summaries in json_data.items():
        markdown += f"# {channel}\n\n"
        
        for summary in summaries:
            cleaned_summary = clean_text(summary[0])
            # Add two spaces at the end of each line for Markdown line breaks
            cleaned_summary = '\n'.join(line.rstrip() + '  ' for line in cleaned_summary.split('\n'))
            markdown += cleaned_summary + "\n\n"
        
        markdown += "---\n\n"
    
    return markdown.strip()

In [20]:
# Read the JSON file
with open('../output/fpl_podcast_summaries.json', 'r') as file:
    data = json.load(file)

# Filter out unsummarizable transcripts
filtered_data = filter_json(data)

# Convert filtered JSON to Markdown
markdown_content = json_to_markdown(filtered_data)

# Write the Markdown content to a file
with open('../output/fpl_podcast_summaries.md', 'w', encoding='utf-8') as file:
    file.write(markdown_content)

print("Conversion complete. Markdown file 'fpl_podcast_summaries.md' has been created.")

# Optionally, save the filtered JSON
with open('../output/filtered_fpl_podcast_summaries.json', 'w', encoding='utf-8') as file:
    json.dump(filtered_data, file, indent=2, ensure_ascii=False)

print("Filtered JSON file 'filtered_fpl_podcast_summaries.json' has been created.")

Conversion complete. Markdown file 'fpl_podcast_summaries.md' has been created.
Filtered JSON file 'filtered_fpl_podcast_summaries.json' has been created.


In [None]:
import pandas as pd
import re
from collections import Counter

def get_word_frequencies(text, min_length=1):
    # Convert to lowercase and split into words
    words = re.findall(r'\b[\w\']+\b', text.lower())
    
    # Filter words by minimum length if specified
    if min_length > 1:
        words = [word for word in words if len(word) >= min_length]
    
    # Count word frequencies
    word_freq = Counter(words)
    
    return word_freq

# Combine all transcripts into a single string
all_transcripts = ' '.join(df_videos['transcript'])

# Get word frequencies
word_frequencies = get_word_frequencies(all_transcripts)

# Get the 100 most common words
most_common_words = word_frequencies.most_common(100)

# Print the results
print("100 Most Common Words:")
for word, count in most_common_words:
    print(f"{word}: {count}")

