In [None]:
!pip install google-api-python-client youtube-transcript-api pandas tqdm


In [None]:

import os
import time

import pandas as pd
from youtube_transcript_api import YouTubeTranscriptApi

def fetch_transcripts_simple(csv_file_path, delay_seconds=4.0, refetch_existing=False):
    """Fetch English transcripts for every video listed in a CSV file.

    The CSV is read with pandas and must contain a ``videoId`` column. For each
    id the transcript snippets are joined into one whitespace-normalized string.
    Failures are reported and recorded as a missing transcript; they do not
    abort the run.

    Two files are written:

    * ``<name>_transcripts_only<ext>`` next to the input, holding only the
      ``videoId`` and ``transcript`` columns;
    * the input CSV itself, rewritten in place with ``transcript`` and
      ``is_transcript_available`` columns added (or replaced).

    Args:
        csv_file_path: Path of the input CSV (``str`` or ``os.PathLike``).
        delay_seconds: Pause between consecutive requests, applied after both
            successful and failed fetches. Use ``0`` to disable.
        refetch_existing: When ``False`` (default), rows whose ``transcript``
            cell already holds non-empty text from an earlier run are reused
            instead of requested again, so a re-run resumes where it stopped
            and a later failure cannot wipe a transcript that was already
            saved. Pass ``True`` to request every video again.

    Returns:
        pandas.DataFrame with the columns ``videoId`` and ``transcript``
        (missing where no transcript could be fetched).

    Raises:
        KeyError: If the CSV has no ``videoId`` column.
    """
    # Read the original CSV
    df = pd.read_csv(csv_file_path)
    video_ids = df['videoId'].tolist()
    total = len(video_ids)

    print(f"📁 Processing {total} videos from {csv_file_path}")

    # Transcripts stored in the CSV by a previous run, aligned with video_ids.
    if 'transcript' in df.columns and not refetch_existing:
        previous = df['transcript'].tolist()
    else:
        previous = [None] * total

    # One client for the whole run; building it is independent of the video.
    api = YouTubeTranscriptApi()

    transcripts = []
    available = []

    for i, (video_id, cached) in enumerate(zip(video_ids, previous), 1):
        print(f"🔄 Processing {i}/{total}: {video_id}")

        # Empty cells come back from read_csv as NaN (a float), so only real,
        # non-blank strings count as an already-fetched transcript.
        if isinstance(cached, str) and cached.strip():
            transcripts.append(cached)
            available.append(True)
            print("⏭️ Already fetched, skipping")
            continue

        try:
            result = api.fetch(video_id, languages=['en'])
            text = ' '.join(snippet.text for snippet in result.snippets)
            text = ' '.join(text.split())  # collapse newlines / repeated spaces
            transcripts.append(text)
            available.append(True)
            print("✅ Success")
        except Exception as e:
            # Deliberately best-effort: one unavailable video must not stop the batch.
            print(f"❌ Failed: {e}")
            transcripts.append(None)
            available.append(False)

        # Be nice to YouTube: pause between requests (no need after the last one).
        if delay_seconds > 0 and i < total:
            time.sleep(delay_seconds)

    # Create new DataFrame with only videoId and transcript
    result_df = pd.DataFrame({
        'videoId': video_ids,
        'transcript': transcripts
    })

    # Derive the side-file name from the extension only. A plain
    # str.replace('.csv', ...) would also rewrite '.csv' inside directory names
    # and would leave the path unchanged (clobbering the input) for e.g. '.CSV'.
    root, ext = os.path.splitext(csv_file_path)
    output_file = f"{root}_transcripts_only{ext or '.csv'}"
    result_df.to_csv(output_file, index=False)
    print(f"💾 Saved to: {output_file}")
    print(f"📊 File contains: {len(result_df)} rows, 2 columns (videoId, transcript)")

    # Add transcript + availability columns to the uploaded CSV
    df['transcript'] = transcripts
    df['is_transcript_available'] = available
    df.to_csv(csv_file_path, index=False)
    print("✅ Updated original CSV with new columns: transcript, is_transcript_available")

    return result_df

# Usage
# NOTE: this is an absolute, machine-specific Windows path -- point it at your own CSV,
# which must contain a 'videoId' column. The call below rewrites that CSV in place
# (adding 'transcript' and 'is_transcript_available' columns) and also writes a
# '*_transcripts_only.csv' file next to it.
csv_file_path = "G:\\infosys_internship\\transcipt\\ytdetails_transcript.csv"  # Your CSV file path
fetch_transcripts_simple(csv_file_path)


📁 Processing 51 videos from G:\infosys_internship\transcipt\ytdetails_transcript.csv
🔄 Processing 1/51: bshe96X5KkA
✅ Success
🔄 Processing 2/51: PtETUYa3i2Q
✅ Success
🔄 Processing 3/51: V7TMkZH1AkM
✅ Success
🔄 Processing 4/51: h4FRpDGuJyI
✅ Success
🔄 Processing 5/51: EWd3_I4X32g
✅ Success
🔄 Processing 6/51: M5B-xBmznok
✅ Success
🔄 Processing 7/51: Iqz7BWePOys
✅ Success
🔄 Processing 8/51: v73-ps01c5w
✅ Success
🔄 Processing 9/51: gJrjgg1KVL4
✅ Success
🔄 Processing 10/51: lvg8pBJ2kFg
✅ Success
🔄 Processing 11/51: K5KVEU3aaeQ
✅ Success
🔄 Processing 12/51: NClmyC6olC0
✅ Success
🔄 Processing 13/51: v7BNtpw53AA
✅ Success
🔄 Processing 14/51: gdiao7L9GjE
✅ Success
🔄 Processing 15/51: zVEH7Zt1j9A
❌ Failed: 
Could not retrieve a transcript for the video https://www.youtube.com/watch?v=zVEH7Zt1j9A! This is most likely caused by:

YouTube is blocking requests from your IP. This usually is due to one of the following reasons:
- You have done too many requests and your IP has been blocked by YouTube
