In [4]:
import pandas as pd
from moviepy.editor import VideoFileClip
from tqdm import tqdm  # For the progress bar
import os


In [54]:
# Read the CSV file
csv_file = ["./Data/violin_raw_videos/parts/video_files_part_1.csv",
"./Data/violin_raw_videos/parts/video_files_part_2.csv",
"./Data/violin_raw_videos/parts/video_files_part_3.csv",
"./Data/violin_raw_videos/parts/video_files_part_4.csv",
"./Data/violin_raw_videos/parts/video_files_part_5.csv",
"./Data/violin_raw_videos/parts/video_files_part_6.csv",
"./Data/violin_raw_videos/parts/video_files_part_7.csv"]
video_paths = pd.read_csv(csv_file[0])

audio_file_path = "./Data/violin_raw_videos/audio_clips/"


In [55]:
def video2audio(v_path, a_path, codec='flac'):
    """
    Converts a video file to an audio file.
    
    Parameters:
        v_path (str): Path to the input video file.
        a_path (str): Path to save the extracted audio file.
        codec (str, optional): Audio codec to use (e.g., 'mp3', 'flac'). Defaults to None.
    
    Returns:
        bool: True if successful, False if an error occurred.
    """
    try:
        # Load the video file
        video_clip = VideoFileClip(v_path)
        
        # Extract the audio and save it
        video_clip.audio.write_audiofile(a_path, codec=codec, verbose=False, logger=None)
        return True
    except Exception as e:
        print(f"Error converting {v_path} to audio: {e}")
        return False

def get_file_name(path):
    return os.path.splitext(os.path.basename(path))[0]

def vpath2apath(v_path, header, extension = '.flac'):
    return header + get_file_name(video_path) + extension

## Save Flac
1. Read csv in parts
2. Read each line, preserve the base_name, sperate the audio and save in flac format
   1. Save audio clips under audio_clips
3. Save the audio paths in csv under audio_parts

In [61]:
# Load the CSV file
for i in range(7):
    file_path = f"./Data/violin_raw_videos/parts/video_files_part_{i+1}.csv"  # Replace with your file path
    output_file_path = f"./Data/violin_raw_videos/audio_parts/audio_files_part_{i+1}.csv"  # Output file path

    # Read the file into a DataFrame
    video_paths_df = pd.read_csv(file_path, header=None, names=["video_path"])
    video_paths = video_paths_df["video_path"].iloc[1:].tolist()  # Skip the first row if it's a header
    audio_paths = []

    with tqdm(total=len(video_paths), desc="Processing Videos") as pbar:
        for video_path in video_paths:
            video_path = f"./Data/violin_raw_videos/clips/{os.path.basename(video_path)}"
            if not os.path.exists(video_path):
                print(f"Video file not found: {video_path}")
                pbar.update(1)
                continue
            
            try:
                # audio output path
                audio_output_path = vpath2apath(video_path, audio_file_path)
                
                # Extract and save the audio
                video2audio(video_path, audio_output_path, codec='flac')
                # print(f"Audio saved: {audio_output_path}")
                # print(f"./audio_clips/{os.path.basename(audio_output_path)}")
                audio_output_path_csv = f"./audio_clips/{os.path.basename(audio_output_path)}"
                audio_paths.append(audio_output_path_csv)

            except Exception as e:
                print(f"Error processing {video_path}: {e}")
            finally:
                pbar.update(1)  # Update the progress bar
    
    # Create a new DataFrame for the modified list
    modified_df = pd.DataFrame(audio_paths, columns=["File Name"])

    # Save the modified list to a new CSV file
    modified_df.to_csv(output_file_path, index=False)

    print(f"Modified list saved to {output_file_path}")

Processing Videos: 100%|██████████| 2495/2495 [14:17<00:00,  2.91it/s]


Modified list saved to ./Data/violin_raw_videos/audio_parts/audio_files_part_1.csv


Processing Videos: 100%|██████████| 2495/2495 [14:21<00:00,  2.90it/s]


Modified list saved to ./Data/violin_raw_videos/audio_parts/audio_files_part_2.csv


Processing Videos: 100%|██████████| 2495/2495 [15:32<00:00,  2.68it/s]


Modified list saved to ./Data/violin_raw_videos/audio_parts/audio_files_part_3.csv


Processing Videos:  42%|████▏     | 1055/2495 [06:57<08:42,  2.76it/s]

Error converting ./Data/violin_raw_videos/clips/IelhpK-eDh0_clip_000_040.mp4 to audio: 'NoneType' object has no attribute 'write_audiofile'


Processing Videos:  88%|████████▊ | 2206/2495 [14:07<01:14,  3.88it/s]

Error converting ./Data/violin_raw_videos/clips/8osP7KRacWk_clip_000_040.mp4 to audio: 'NoneType' object has no attribute 'write_audiofile'


Processing Videos: 100%|██████████| 2495/2495 [15:56<00:00,  2.61it/s]


Modified list saved to ./Data/violin_raw_videos/audio_parts/audio_files_part_4.csv


Processing Videos: 100%|██████████| 2495/2495 [18:51<00:00,  2.20it/s]


Modified list saved to ./Data/violin_raw_videos/audio_parts/audio_files_part_5.csv


Processing Videos: 100%|██████████| 2495/2495 [16:22<00:00,  2.54it/s]


Modified list saved to ./Data/violin_raw_videos/audio_parts/audio_files_part_6.csv


Processing Videos: 100%|██████████| 2495/2495 [18:07<00:00,  2.29it/s]

Modified list saved to ./Data/violin_raw_videos/audio_parts/audio_files_part_7.csv





“./Data/violin_raw_videos/audio_parts/audio_files_part_4.csv”

Has 2 failed entries:

Error converting ./Data/violin_raw_videos/clips/IelhpK-eDh0_clip_000_040.mp4 to audio: 'NoneType' object has no attribute 'write_audiofile'

Error converting ./Data/violin_raw_videos/clips/8osP7KRacWk_clip_000_040.mp4 to audio: 'NoneType' object has no attribute 'write_audiofile'


## Check CSV

In [None]:
for i in range(7):
    audio_file_path = f"./Data/violin_raw_videos/audio_parts/audio_files_part_{i+1}.csv"  # Output file path

    # Read the file into a DataFrame
    audio_paths_df = pd.read_csv(audio_file_path, header=None, names=["audio_paths"])
    audio_paths = audio_paths_df["audio_paths"].iloc[1:].tolist()  # Skip the first row if it's a header

    for audio_path in audio_paths:
        audio_path = f"./Data/violin_raw_videos/audio_clips/{os.path.basename(audio_path)}"
        # print(audio_path)
        if not os.path.exists(audio_path):
            print(f"Audio file not found: {audio_path}")
            continue

./Data/violin_raw_videos/audio_clips/friends_s05e02_split_48_138.flac
./Data/violin_raw_videos/audio_clips/friends_s05e20_clip_697_724.flac
./Data/violin_raw_videos/audio_clips/-1U0LH6dPfw_clip_000_040.flac
./Data/violin_raw_videos/audio_clips/friends_s03e03_split_571_661.flac
./Data/violin_raw_videos/audio_clips/dh_s04e12_clip_1575_1601.flac
./Data/violin_raw_videos/audio_clips/dh_s01e01_split_557_647.flac
./Data/violin_raw_videos/audio_clips/friends_s01e22_split_135_225.flac
./Data/violin_raw_videos/audio_clips/dh_s07e07_clip_1122_1155.flac
./Data/violin_raw_videos/audio_clips/SuSUwDgtq1g_clip_000_040.flac
./Data/violin_raw_videos/audio_clips/f-DgdMpSo7c_clip_000_040.flac
./Data/violin_raw_videos/audio_clips/dh_s04e11_clip_1246_1277.flac
./Data/violin_raw_videos/audio_clips/dh_s08e12_clip_2332_2367.flac
./Data/violin_raw_videos/audio_clips/dh_s08e01_clip_1772_1803.flac
./Data/violin_raw_videos/audio_clips/dh_s07e13_clip_644_680.flac
./Data/violin_raw_videos/audio_clips/6xaUD1jKFWg_cl