In [1]:
!pip install google-api-python-client youtube-transcript-api pandas tqdm





[notice] A new release of pip is available: 25.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:

from youtube_transcript_api import YouTubeTranscriptApi
import pandas as pd
import time

def fetch_transcripts_simple(csv_file_path, api=None, delay_seconds=4.0, languages=('en',)):
    """Fetch a transcript for every video listed in a CSV and save the results.

    Reads ``csv_file_path``, requests a transcript for each value of its
    ``videoId`` column and writes two files:

    * ``<name>_transcripts_only.csv`` next to the input, containing only the
      ``videoId`` and ``transcript`` columns;
    * the input CSV itself, overwritten in place with two extra columns,
      ``transcript`` and ``is_transcript_available``.

    A video whose transcript cannot be fetched does not abort the run: the
    error is printed, its transcript is stored as ``None`` and its
    availability flag as ``False``.

    Args:
        csv_file_path: Path (string) of a CSV file that has a ``videoId`` column.
        api: Client object exposing ``fetch(video_id, languages=...)`` whose
            result has a ``snippets`` iterable of items with a ``text``
            attribute. When ``None`` a single ``YouTubeTranscriptApi()`` is
            created and reused for all videos; pass your own instance to
            reuse a pre-configured client.
        delay_seconds: Pause between consecutive requests, applied after
            successes and failures alike. ``0`` disables the pause.
        languages: Language codes forwarded to ``api.fetch``.

    Returns:
        pandas.DataFrame with one row per input video and the columns
        ``videoId`` and ``transcript``.

    Raises:
        KeyError: If the CSV has no ``videoId`` column.
    """
    csv_file_path = str(csv_file_path)

    # Read the original CSV
    df = pd.read_csv(csv_file_path)
    if 'videoId' not in df.columns:
        raise KeyError(f"'videoId' column not found in {csv_file_path}")
    video_ids = df['videoId'].tolist()
    total = len(video_ids)

    print(f"üìÅ Processing {total} videos from {csv_file_path}")

    # One client for the whole run instead of a new one per video.
    if api is None:
        api = YouTubeTranscriptApi()
    wanted_languages = list(languages)

    transcripts = []
    available = []  # parallel to `transcripts`: True when the fetch succeeded

    for i, video_id in enumerate(video_ids, 1):
        print(f"üîÑ Processing {i}/{total}: {video_id}")

        try:
            result = api.fetch(video_id, languages=wanted_languages)
            text = ' '.join(snippet.text for snippet in result.snippets)
            text = ' '.join(text.split())  # collapse runs of whitespace/newlines
            print("‚úÖ Success")
        except Exception as e:
            # Deliberately best-effort: one failing video must not stop the batch.
            print(f"‚ùå Failed: {e}")
            text = None

        transcripts.append(text)
        available.append(text is not None)

        # Throttle after every request, including failed ones, so a
        # rate-limit error is not followed by a faster retry. No pause is
        # needed once the last video has been processed.
        if delay_seconds > 0 and i < total:
            time.sleep(delay_seconds)

    # Create new DataFrame with only videoId and transcript
    result_df = pd.DataFrame({
        'videoId': video_ids,
        'transcript': transcripts
    })

    # Strip only a trailing ".csv" so that directory names containing ".csv"
    # are left alone and the output path can never equal the input path.
    if csv_file_path.lower().endswith('.csv'):
        stem = csv_file_path[:-len('.csv')]
    else:
        stem = csv_file_path
    output_file = stem + '_transcripts_only.csv'

    # Save only these two columns
    result_df.to_csv(output_file, index=False)
    print(f"üíæ Saved to: {output_file}")
    print(f"üìä File contains: {len(result_df)} rows, 2 columns (videoId, transcript)")

    # Add transcript + availability columns to the input CSV (overwrites it)
    df['transcript'] = transcripts
    df['is_transcript_available'] = available
    df.to_csv(csv_file_path, index=False)
    print("‚úÖ Updated original CSV with new columns: transcript, is_transcript_available")

    return result_df

# Usage
csv_file_path = "ytdetails_transcript.csv"  # Your CSV file path
# Keep the returned frame in a variable: the fetch pauses between videos and
# takes a while, so the result should stay reachable without re-running it.
transcripts_df = fetch_transcripts_simple(csv_file_path)
transcripts_df.head(10)  # bounded preview instead of rendering every row


üìÅ Processing 51 videos from ytdetails_transcript.csv
üîÑ Processing 1/51: bshe96X5KkA
‚úÖ Success
üîÑ Processing 2/51: PtETUYa3i2Q
‚úÖ Success
üîÑ Processing 3/51: V7TMkZH1AkM
‚úÖ Success
üîÑ Processing 4/51: h4FRpDGuJyI
‚úÖ Success
üîÑ Processing 5/51: EWd3_I4X32g
‚úÖ Success
üîÑ Processing 6/51: M5B-xBmznok
‚úÖ Success
üîÑ Processing 7/51: Iqz7BWePOys
‚úÖ Success
üîÑ Processing 8/51: v73-ps01c5w
‚úÖ Success
üîÑ Processing 9/51: gJrjgg1KVL4
‚úÖ Success
üîÑ Processing 10/51: lvg8pBJ2kFg
‚úÖ Success
üîÑ Processing 11/51: K5KVEU3aaeQ
‚úÖ Success
üîÑ Processing 12/51: NClmyC6olC0
‚úÖ Success
üîÑ Processing 13/51: v7BNtpw53AA
‚úÖ Success
üîÑ Processing 14/51: gdiao7L9GjE
‚úÖ Success
üîÑ Processing 15/51: zVEH7Zt1j9A
‚úÖ Success
üîÑ Processing 16/51: fZmQ_aPXJAM
‚ùå Failed: 
Could not retrieve a transcript for the video https://www.youtube.com/watch?v=fZmQ_aPXJAM! This is most likely caused by:

YouTube is blocking requests from your IP. This usually is due to one of the 

Unnamed: 0,videoId,transcript
0,bshe96X5KkA,"[Music] AI is everywhere, but have you actuall..."
1,PtETUYa3i2Q,"[Music] AI is everywhere, but have you actuall..."
2,V7TMkZH1AkM,"[Music] Hey friends, I've got something exciti..."
3,h4FRpDGuJyI,"Microsoft just laid off 6,000 employees and th..."
4,EWd3_I4X32g,[Music] Welcome to part two of the ultimate Sp...
5,M5B-xBmznok,"Hey guys, many of you have been waiting for pa..."
6,Iqz7BWePOys,hey everyone msh here a few days ago I publish...
7,v73-ps01c5w,hey everyone msh here a few days ago I publish...
8,gJrjgg1KVL4,[Music] welcome to the ultimate spring boot co...
9,lvg8pBJ2kFg,hi guys many of you have been asking me for a ...
