In [1]:
!pip install youtube-comment-downloader
!pip install gTTS textblob jiwer
!pip install git+https://github.com/openai/whisper.git
!sudo apt update && sudo apt install ffmpeg


Collecting youtube-comment-downloader
  Downloading youtube_comment_downloader-0.1.76-py3-none-any.whl.metadata (2.9 kB)
Collecting dateparser (from youtube-comment-downloader)
  Downloading dateparser-1.2.1-py3-none-any.whl.metadata (29 kB)
Downloading youtube_comment_downloader-0.1.76-py3-none-any.whl (8.2 kB)
Downloading dateparser-1.2.1-py3-none-any.whl (295 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.7/295.7 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dateparser, youtube-comment-downloader
Successfully installed dateparser-1.2.1 youtube-comment-downloader-0.1.76
Collecting gTTS
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading gTTS-2.5.4-py3-none-any.whl (29 kB)
Downloadin

In [2]:
from youtube_comment_downloader import YoutubeCommentDownloader
from gtts import gTTS
import pandas as pd
import os
import csv
import whisper
from textblob import TextBlob
from jiwer import wer
from tqdm import tqdm


In [3]:
video_urls = [
    'https://youtu.be/Xt4cMYg43cA?si=Zx3pejf-L3eNAuJr',
    'https://youtu.be/7ARBJQn6QkM?si=Si3sZxWHTkZZpuRO',
    'https://youtu.be/_jl64f-821o?si=xqrQja-MW2-ntKkf'
]

downloader = YoutubeCommentDownloader()
count = 0
max_comments = 500

with open('youtube_comments.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['Comment'])

    for url in video_urls:
        print(f"🔍 Scraping: {url}")
        try:
            comments = downloader.get_comments_from_url(url, sort_by=0, sleep=1.5)
            for comment in comments:
                text = comment.get('text', '').strip()
                if len(text) > 0:
                    writer.writerow([text])
                    count += 1
                    if count >= max_comments:
                        break
            if count >= max_comments:
                break
        except Exception as e:
            print(f"⚠️ Error scraping {url} — {e}")

print(f"✅ Total comments collected: {count}")


🔍 Scraping: https://youtu.be/Xt4cMYg43cA?si=Zx3pejf-L3eNAuJr
✅ Total comments collected: 500


In [4]:
df = pd.read_csv('youtube_comments.csv')
os.makedirs('audio_clips', exist_ok=True)

generated_data = []

for i, row in df.iterrows():
    text = str(row['Comment']).strip()
    if len(text) < 5:
        continue

    filename = f"clip_{i}.mp3"
    path = os.path.join('audio_clips', filename)

    tts = gTTS(text)
    tts.save(path)

    generated_data.append({'Filename': filename, 'Original': text})

pd.DataFrame(generated_data).to_csv('generated_audio_data.csv', index=False)
print("✅ Audio files saved in 'audio_clips/'")


✅ Audio files saved in 'audio_clips/'


In [5]:
model = whisper.load_model("tiny")
print("✅ Whisper model loaded")


100%|█████████████████████████████████████| 72.1M/72.1M [00:01<00:00, 54.4MiB/s]


✅ Whisper model loaded


In [6]:
generated_data = pd.read_csv('generated_audio_data.csv').to_dict(orient='records')
transcriptions = []

for item in tqdm(generated_data[:100], desc="Transcribing"):
    file_path = os.path.join("audio_clips", item['Filename'])
    result = model.transcribe(file_path)
    transcriptions.append({
        'Filename': item['Filename'],
        'Transcription': result['text']
    })

pd.DataFrame(transcriptions).to_csv("transcribed_results_100.csv", index=False)
print("✅ First 100 transcriptions saved")


Transcribing: 100%|██████████| 100/100 [00:56<00:00,  1.78it/s]

✅ First 100 transcriptions saved





In [9]:
!python -m textblob.download_corpora


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
Finished.


In [10]:
transcribed_df = pd.read_csv("transcribed_results_100.csv")
sample_text = transcribed_df['Transcription'][0]
blob = TextBlob(sample_text)

print("📄 Sentence Tokens:")
for sentence in blob.sentences:
    print("-", sentence)

print("\n🔤 Word Tokens:")
print(blob.words)


📄 Sentence Tokens:
-  Still, the two elephants in the room that no one is actually addressing.
- Who will buy all the stuff being made by robots and I when everyone is unemployed?
- Why is no one really discussing the implementation of UBI?
- The future won't work if we do not totally reconstruct the foundations of our society.

🔤 Word Tokens:
['Still', 'the', 'two', 'elephants', 'in', 'the', 'room', 'that', 'no', 'one', 'is', 'actually', 'addressing', 'Who', 'will', 'buy', 'all', 'the', 'stuff', 'being', 'made', 'by', 'robots', 'and', 'I', 'when', 'everyone', 'is', 'unemployed', 'Why', 'is', 'no', 'one', 'really', 'discussing', 'the', 'implementation', 'of', 'UBI', 'The', 'future', 'wo', "n't", 'work', 'if', 'we', 'do', 'not', 'totally', 'reconstruct', 'the', 'foundations', 'of', 'our', 'society']


In [11]:
original_df = pd.read_csv('generated_audio_data.csv')
transcribed_df = pd.read_csv('transcribed_results_100.csv')
merged = pd.merge(original_df, transcribed_df, on='Filename').dropna()

results = []

for _, row in merged.iterrows():
    original = str(row['Original'])
    transcribed = str(row['Transcription'])

    error = min(wer(original, transcribed), 1.0)
    accuracy = max((1 - error) * 100, 0)

    blob = TextBlob(transcribed)
    polarity = blob.sentiment.polarity
    sentiment = (
        "Positive" if polarity > 0.1 else
        "Negative" if polarity < -0.1 else
        "Neutral"
    )

    results.append({
        'Filename': row['Filename'],
        'Original Text': original,
        'Transcribed Text': transcribed,
        'WER': round(error, 2),
        'Accuracy %': round(accuracy, 2),
        'Sentiment': sentiment,
        'Polarity': round(polarity, 2)
    })

final_df = pd.DataFrame(results)
final_df.to_excel("final_dataset_100.xlsx", index=False)
print("✅ Final results saved as 'final_dataset_100.xlsx'")


✅ Final results saved as 'final_dataset_100.xlsx'


In [18]:

external_audio_path = "/content/He wasn’t looking to argue at all.mp3"


In [19]:
# Transcribe external audio
ext_result = model.transcribe(external_audio_path)
ext_text = ext_result['text']
print("🗣️ Transcription:\n", ext_text)


original_text = "Paste the exact sentence spoken in the audio"

# Calculate accuracy using WER
error = min(wer(original_text, ext_text), 1.0)
accuracy = max((1 - error) * 100, 0)

# Sentiment analysis
blob = TextBlob(ext_text)
polarity = blob.sentiment.polarity
sentiment = (
    "Positive" if polarity > 0.1 else
    "Negative" if polarity < -0.1 else
    "Neutral"
)

# Display results
print(f"\n📏 WER: {error:.2f}")
print(f"✅ Accuracy: {accuracy:.2f}%")
print(f"❤️ Sentiment: {sentiment} (Polarity: {polarity:.2f})")


🗣️ Transcription:
  Oh, wow! You alright? Yeah, I'd be further along the way. And you're not using a boss walk. You're right.

📏 WER: 1.00
✅ Accuracy: 0.00%
❤️ Sentiment: Positive (Polarity: 0.14)


In [21]:
# Calculate average model accuracy over all samples
accuracies = [row['Accuracy %'] for row in results]
average_accuracy = sum(accuracies) / len(accuracies)

print(f"\n✅ Overall Model Accuracy (on 100 samples): {average_accuracy:.2f}%")



✅ Overall Model Accuracy (on 100 samples): 71.04%
