<a href="https://colab.research.google.com/github/BuTcheR0512/Data-Science/blob/main/Cambai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing the necessary Librarys

In [None]:
%pip install youtube_search
%pip install pytube
%pip install youtube_transcript_api
%pip install pydub
%pip install SpeechRecognition
%pip install librosa
!apt-get install ffmpeg

# Data Collection:
 Searching the Irish YouTube channel and Downloading the latest 2 videos from it.

 Extracting the audio and transcripts from the video.

In [None]:
from pytube import YouTube
import os
from youtube_search import YoutubeSearch
from youtube_transcript_api import YouTubeTranscriptApi

def search_and_download_irish_videos(query, output_path, max_results=10):
    # Perform YouTube search using the query
    results = YoutubeSearch(query, max_results=max_results).to_dict()

    # Create folders for audio files and transcripts
    audio_folder = os.path.join(output_path, "audio")
    transcript_folder = os.path.join(output_path, "transcripts")
    os.makedirs(audio_folder, exist_ok=True)
    os.makedirs(transcript_folder, exist_ok=True)

    # Iterate over search results
    for video in results:
        video_id = video['id']
        video_title = video['title']

        try:
            # Download video
            yt = YouTube(f'https://www.youtube.com/watch?v={video_id}')
            stream = yt.streams.filter(file_extension='mp4').first()
            video_file = stream.download(output_path=output_path, filename=f'{video_id}')
            print(f"Downloaded video: {video_title}")

            # Download audio
            stream_audio = yt.streams.filter(only_audio=True).first()
            audio_file = stream_audio.download(output_path=audio_folder, filename=f'{video_id}.wav')
            print(f"Downloaded audio: {video_title}")

            # Download transcript
            transcript = YouTubeTranscriptApi.get_transcript(video_id)
            transcript_text = ' '.join([line['text'] for line in transcript])
            transcript_file = os.path.join(transcript_folder, f'{video_id}.txt')
            with open(transcript_file, 'w', encoding='utf-8') as f:
                f.write(transcript_text)
            print(f"Downloaded transcript: {video_title}")

        except Exception as e:
            print(f"Error downloading {video_title}: {e}")

# Example usage:
query = "Gaeilge i mo chroí"
output_path = "/content/files"
search_and_download_irish_videos(query, output_path, max_results=2)


Downloaded video: A casual conversation as Gaeilge with Gaeilge le Jane 🥰 💚 MUNSTER DIALECT, Gaeltacht Mhúscraí
Downloaded audio: A casual conversation as Gaeilge with Gaeilge le Jane 🥰 💚 MUNSTER DIALECT, Gaeltacht Mhúscraí
Downloaded transcript: A casual conversation as Gaeilge with Gaeilge le Jane 🥰 💚 MUNSTER DIALECT, Gaeltacht Mhúscraí
Downloaded video: How to start speaking Irish | Gaeilge i Mo Chroí
Downloaded audio: How to start speaking Irish | Gaeilge i Mo Chroí
Downloaded transcript: How to start speaking Irish | Gaeilge i Mo Chroí


In [None]:
# import os

# # Set the path to your service account key file
# os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/content/potent-cedar-415705-417bcf87622a.json"

# Data Preprocessing:
 Encoding the audio files.

In [None]:
import os


audio_folder = '/content/files/audio'


output_folder = '/content/files/encoded audio'

# Ensure the output folder exists
os.makedirs(output_folder, exist_ok=True)

# Iterate over each file in the audio folder
for filename in os.listdir(audio_folder):
    if filename.endswith('.wav'):
        # Generate input and output file paths
        input_file = os.path.join(audio_folder, filename)
        output_file = os.path.join(output_folder, filename)

        # Execute FFmpeg command to re-encode the audio file
        !ffmpeg -i "$input_file" -c:a pcm_s16le "$output_file"


Auto align the audio and the transcript in a folder .

In [None]:
import os
import shutil

def align_audio_transcripts(audio_folder, transcript_folder, output_folder):
    # Ensure output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Get list of audio files
    audio_files = os.listdir(audio_folder)

    # Iterate over audio files
    for audio_file in audio_files:
        # Construct paths for audio and transcript files
        audio_path = os.path.join(audio_folder, audio_file)
        transcript_file = os.path.splitext(audio_file)[0] + '.txt'
        transcript_path = os.path.join(transcript_folder, transcript_file)

        # Check if transcript file exists
        if not os.path.exists(transcript_path):
            print(f"Transcript file not found for {audio_file}")
            continue

        # Construct output paths
        output_audio_path = os.path.join(output_folder, audio_file)
        output_transcript_path = os.path.join(output_folder, transcript_file)

        # Copy audio file to output folder
        shutil.copy(audio_path, output_audio_path)

        # Copy transcript file to output folder
        shutil.copy(transcript_path, output_transcript_path)

        print(f"Aligned {audio_file} and {transcript_file} saved in {output_folder}")

# Specify input and output folders
audio_folder = '/content/files/encoded audio'
transcript_folder = '/content/files/transcripts'
output_folder = '/content/files/aligned data'

# Call function to align audio and transcripts
align_audio_transcripts(audio_folder, transcript_folder, output_folder)


Transcript file not found for .ipynb_checkpoints
Aligned ViGAb66Nsdo.wav and ViGAb66Nsdo.txt saved in /content/files/aligned data
Aligned zqWQz8rOk98.wav and zqWQz8rOk98.txt saved in /content/files/aligned data


# Data Augmentation

In [None]:
from pydub import AudioSegment
from pydub.effects import speedup
import os

def augment_audio(input_folder, output_folder):
    # Create the output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Iterate over all files in the input folder
    for filename in os.listdir(input_folder):
        if filename.endswith(".wav"):
            input_file = os.path.join(input_folder, filename)
            output_file = os.path.join(output_folder, filename)

            # Load the audio file
            audio = AudioSegment.from_wav(input_file)

            # Apply speedup effect to the audio
            augmented_audio = speedup(audio, playback_speed=1.5)

            # Export the augmented audio to a new file
            augmented_audio.export(output_file, format="wav")
            print(f"Augmented audio saved as: {output_file}")

# Specify input and output folders
input_folder = "/content/files/encoded audio"
output_folder = "/content/files/augmented_audio"

# Augment audio files
augment_audio(input_folder, output_folder)


Augmented audio saved as: /content/files/augmented_audio/ViGAb66Nsdo.wav
Augmented audio saved as: /content/files/augmented_audio/zqWQz8rOk98.wav


# Dataset Creation

Run a frequency test to check if a speaker is male or female.(It may not be the prefect way but can be considered)

In [2]:
import librosa
import numpy as np

audio_file = "/content/files/audio/ViGAb66Nsdo.wav"
y, sr = librosa.load(audio_file, sr=None)

# Calculate the Fourier Transform
fft = np.fft.fft(y)
magnitude = np.abs(fft)
frequency = np.fft.fftfreq(len(magnitude), 1/sr)


dominant_frequency = frequency[np.argmax(magnitude)]


speed_of_sound = 343  # in meters per second

# Calculate wavelength
wavelength = speed_of_sound / dominant_frequency

print("Dominant Frequency:", dominant_frequency, "Hz")
print("Wavelength:", wavelength, "m")



  y, sr = librosa.load(audio_file, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Dominant Frequency: 45.097539144594506 Hz
Wavelength: 7.605736510372601 m
