In [None]:
#!pip install git+https://github.com/huggingface/transformers
!pip install qwen-omni-utils -U
!pip install -U bitsandbytes

import os
import soundfile as sf
import torch
import numpy as np
import librosa
import csv
import transformers

from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor
import pandas as pd

# --- CUDA DEBUGGING / ALLOCATOR SETTINGS ---
# Synchronous kernel launches (easier debugging) and the expandable-segments
# allocator; both are exported before the first CUDA call made below.
os.environ.update({
    "CUDA_LAUNCH_BLOCKING": "1",
    "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
})

# --- GPU AVAILABILITY REPORT ---
if not torch.cuda.is_available():
    print("*** ATTENTION: PyTorch DOES NOT detect GPU. The model will be loaded on CPU (may fail due to memory).")
else:
    for gpu_index in range(torch.cuda.device_count()):
        gpu_name = torch.cuda.get_device_name(gpu_index)
        gpu_total_gib = torch.cuda.get_device_properties(gpu_index).total_memory / (1024**3)
        print(f"*** GPU CONFIRMED: PyTorch detects GPU {gpu_index}! Name: {gpu_name}, Total Memory: {gpu_total_gib:.2f} GiB")

# --- 1. MODEL AND PROCESSOR LOADING ---
print("Loading Qwen2-Audio-7B-Instruct model...")
MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct"
processor = AutoProcessor.from_pretrained(MODEL_ID)
# Half precision, with the placement of the weights left to device_map="auto".
model = Qwen2AudioForConditionalGeneration.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=torch.float16,
)
print("Qwen2-Audio model and processor loaded successfully.")
torch.cuda.empty_cache()
print("CUDA cache emptied after model loading.")

# --- OUTPUT TOKEN BOOKKEEPING ---
# Running total, number of responses, and the per-response token counts that
# feed the statistics CSV written at the end of the run.
total_output_tokens, response_count = 0, 0
output_token_lengths = []
token_stats_filename = "/kaggle/working/qwen_audio_token_statistics_full_folder.csv"

# --- FUNCTION FOR INFERENCE ON A SINGLE AUDIO SEGMENT ---
def run_audio_inference_on_segment(audio_segment: np.ndarray, user_prompt: str, model, processor, segment_samplerate: int, max_new_tokens: int = 512) -> tuple[str, int]:
    """
    Run Qwen2-Audio inference on one audio segment and return the generated text.

    Args:
        audio_segment: Audio samples as a NumPy array (presumably mono/1-D, as
            produced by ``librosa.load`` with its defaults -- confirm for other callers).
        user_prompt: Text instruction placed next to the audio in the user turn.
        model: Loaded Qwen2-Audio generation model.
        processor: Matching processor (chat template, tokenizer, feature extractor).
        segment_samplerate: Sample rate of ``audio_segment`` in Hz.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        A ``(response_text, num_generated_tokens)`` tuple, where the token count
        covers only the newly generated tokens (the prompt is excluded).
    """
    print(f"\n--- Starting inference on audio segment (length: {len(audio_segment)/segment_samplerate:.2f}s) ---")

    # The audio is always handed to the processor at 16 kHz.
    target_sr = 16000
    if segment_samplerate != target_sr:
        print(f"Resampling audio segment from {segment_samplerate}Hz to {target_sr}Hz.")
        audio_for_processor = librosa.resample(y=audio_segment, orig_sr=segment_samplerate, target_sr=target_sr)
    else:
        audio_for_processor = audio_segment
    final_samplerate = target_sr

    # System prompt instructing the model to answer in English.
    system_prompt = """You are a highly specialized and descriptive music analyst.
Your task is to analyze the provided audio segment and provide a  detailed description, covering the requested aspects.
Your response MUST be in English.
You must include:
1.  **Detailed Description:** Instruments, tempo/rhythm, timbre, emotions evoked, and an imagined context. 
"""

    conversation = [
        {
            "role": "system",
            "content": system_prompt
        },
        {
            "role": "user",
            "content": [
                {"type": "audio", "audio": audio_for_processor},
                {"type": "text", "text": user_prompt},
            ],
        },
    ]

    print("Preparing inputs for Qwen2-Audio model...")
    text_input_formatted = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)

    audios_np_arrays = [audio_for_processor]

    inputs = processor(text=text_input_formatted, audios=audios_np_arrays, sampling_rate=final_samplerate, return_tensors="pt", padding=True)

    # Move EVERY tensor (ids, attention masks, audio features) to the model's
    # device. Assigning `inputs.input_ids = ...` only sets an attribute on the
    # batch object; the mapping expanded by `**inputs` below would keep the
    # CPU tensor, and the masks would never be moved at all.
    inputs = inputs.to(model.device)
    if "input_features" not in inputs and "pixel_values" not in inputs:
        print("Warning: No 'input_features' or 'pixel_values' found in inputs. Check model documentation.")

    prompt_length = inputs["input_ids"].size(1)
    print(f"Length of input_ids after tokenization: {prompt_length}")
    print("Inputs ready for generation.")

    torch.cuda.empty_cache()

    print("Generating model response... This may take time.")
    # NOTE(review): `temperature` only matters when sampling is enabled; confirm
    # that the model's generation config sets do_sample=True, otherwise pass it here.
    generate_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, temperature=0.7)

    # generate() returns prompt + continuation; keep only the continuation.
    generated_ids_only = generate_ids[:, prompt_length:]
    response_text = processor.batch_decode(generated_ids_only, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

    num_generated_tokens = generated_ids_only.size(1)

    print("\n--- GENERATED TEXT FROM MODEL ---\n")
    print(response_text)
    print(f"Tokens generated: {num_generated_tokens}")
    print("--- End of inference on audio segment ---\n")

    return response_text, num_generated_tokens

# --- MAIN SECTION: INPUT PATHS AND METADATA LOADING ---

# Folder containing the audio files to analyze.
audio_directory_path = "/kaggle/input/le-musiche/PiccoloSubSetAudio/"
# FMA metadata files (tracks.csv and features.csv).
tracks_path = '/kaggle/input/fma-free-music-archive-small-medium/fma_metadata/tracks.csv'
features_path = '/kaggle/input/fma-free-music-archive-small-medium/fma_metadata/features.csv'

# Fail fast when any required input is missing.
if not os.path.exists(audio_directory_path):
    raise FileNotFoundError(f"Audio folder not found: {audio_directory_path}. Make sure you have uploaded the folder as a Dataset and specified the correct path.")
if not os.path.exists(tracks_path):
    raise FileNotFoundError(f"tracks.csv not found: {tracks_path}. Make sure you have uploaded the FMA dataset and specified the correct path.")
if not os.path.exists(features_path):
    raise FileNotFoundError(f"features.csv not found: {features_path}. Make sure you have uploaded the FMA dataset and specified the correct path.")

# --- Loading and preparing metadata (genre and features) ---
print(f"Loading metadata from: {tracks_path} and {features_path}")

# Both tables stay None when loading fails; the lookup code checks for that.
tracks_df = None
features_df = None

try:
    # tracks.csv has a two-row column header (category, field), indexed by track id.
    tracks_df = pd.read_csv(tracks_path, header=[0, 1], index_col=0, low_memory=False)
    print(f"✅ 'track' columns in tracks.csv: {tracks_df['track'].columns.tolist()}") # Debug print for tracks columns
except Exception as e:
    print(f"❌ Error loading tracks.csv: {e}")

try:
    # The FMA features.csv has a THREE-row column header (feature, statistic,
    # number). Reading only two rows turns the third header row into data and
    # leaves duplicate (feature, statistic) pairs that pandas renames to
    # 'kurtosis.1', 'kurtosis.2', ... instead of exposing the real third level.
    features_df = pd.read_csv(features_path, header=[0, 1, 2], index_col=0, low_memory=False)
    print(f"✅ 'features' columns in features.csv: {features_df.columns.tolist()}") # Debug print for features columns
except Exception as e:
    print(f"❌ Error loading features.csv: {e}")

def get_song_metadata_and_features(track_id_int: int, tracks=None, features=None) -> tuple[str, str]:
    """
    Look up the genre and a short feature summary for a track id.

    Args:
        track_id_int: Track id used as the row label in both tables.
        tracks: Optional tracks table (two-level columns). Defaults to the
            module-level ``tracks_df``.
        features: Optional features table. Defaults to the module-level
            ``features_df``.

    Returns:
        A ``(genre, features_info)`` tuple. ``genre`` is ``"unknown"`` when the
        track or its genre is missing; ``features_info`` is a ready-to-print
        sentence (or a "no features" message) for inclusion in a prompt.
    """
    # Fall back to the module-level tables when no explicit frames are given.
    tracks_table = tracks_df if tracks is None else tracks
    features_table = features_df if features is None else features

    genre = "unknown"
    features_info_str = "No features available."

    # Get Genre
    if tracks_table is not None and track_id_int in tracks_table.index:
        try:
            genre = tracks_table.loc[track_id_int, ('track', 'genre_top')]
            if pd.isna(genre):  # Handle NaN genres
                genre = "unknown"
        except KeyError:
            print(f"WARNING: 'genre_top' column not found for track ID {track_id_int} in tracks.csv.")
            genre = "unknown"
    else:
        print(f"WARNING: Track ID {track_id_int} not found in tracks.csv.")

    # Get Features
    if features_table is not None and track_id_int in features_table.index:
        # Human-readable label -> column key in the features table.
        # NOTE(review): these keys look like Essentia-style descriptors and do
        # not appear among the columns printed when features.csv is loaded
        # (e.g. ('chroma_cens', 'kurtosis'), ...); verify them against the
        # actual header, otherwise every lookup is skipped as "missing".
        selected_features = {
            "Average BPM": ('rhythm', 'bpm'),
            "Average Loudness (LUFS)": ('lowlevel', 'average_loudness'),
            "Average Spectral Centroid": ('lowlevel', 'spectral_centroid', 'mean'),
        }

        extracted_features = []
        for desc, col_tuple in selected_features.items():
            try:
                raw_value = features_table.loc[track_id_int, col_tuple]
            except (KeyError, IndexError):
                # Deliberate best-effort: a feature column that does not exist
                # is simply left out of the summary.
                continue

            # A cell can hold a string (mis-parsed header rows) or a Series
            # (partial column key / duplicate ids); ':.2f' would raise on both.
            try:
                feature_value = float(raw_value)
            except (TypeError, ValueError):
                print(f"WARNING: Non-numeric value for '{desc}' (track ID {track_id_int}); skipping it.")
                continue

            if pd.notna(feature_value):  # Skip NaN values
                extracted_features.append(f"{desc}: {feature_value:.2f}")

        if extracted_features:
            features_info_str = "Feature information: " + ", ".join(extracted_features) + "."
        else:
            features_info_str = "No specific features found for this track."
    else:
        print(f"WARNING: Track ID {track_id_int} not found in features.csv.")
        features_info_str = "No features available."

    return genre, features_info_str

# Destination of the per-song analysis CSV (inside the Kaggle working folder).
csv_output_filename = "/kaggle/working/audio_analysis_full_folder_with_features.csv"

# Number of equal-length fragments each song is split into.
N_FIXED_SEGMENTS = 4

# Columns written for every row of the analysis CSV.
base_fieldnames = ['Song Name', 'Genre', 'Features Info', 'Description Type', 'Description']


# Collect the supported audio files, sorted so runs are reproducible.
audio_files_to_process = sorted(
    entry
    for entry in os.listdir(audio_directory_path)
    if entry.endswith(('.mp3', '.wav', '.flac'))
)

print(f"\nFound {len(audio_files_to_process)} audio files in folder: {audio_directory_path}")

# --- MAIN LOOP: ANALYZE EVERY AUDIO FILE AND STREAM RESULTS TO CSV ---
# For each file: look up metadata, split the audio into N_FIXED_SEGMENTS
# fragments, describe each fragment, then describe the whole track. Every row is
# flushed as soon as it is written so an interrupted run keeps its results.
# A failure while processing one file is reported and the loop moves on; only a
# failure to open/write the CSV itself aborts the loop.
failed_files = []
total_files = len(audio_files_to_process)

try:
    with open(csv_output_filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=base_fieldnames)
        try:
            writer.writeheader()
        except (OSError, csv.Error) as e:
            print(f"ERROR: Unable to write CSV header for '{csv_output_filename}': {e}")
            # Without a header the output is unusable: stop with a failure code.
            raise SystemExit(1)

        print(f"\nAnalysis results will be saved in: {csv_output_filename}")

        for file_number, filename in enumerate(audio_files_to_process, start=1):
            audio_file_path = os.path.join(audio_directory_path, filename)
            print(f"\n\n--- START PROCESSING FILE: {filename} ({file_number}/{total_files}) ---")

            try:
                # File names are expected to be the numeric track id (e.g. 000002.mp3).
                track_id_from_filename_str = os.path.splitext(filename)[0]
                current_genre = "unknown"
                current_features_info = "No features available."
                try:
                    track_id_int_for_lookup = int(track_id_from_filename_str)
                except ValueError:
                    print(f"ATTENTION: Could not convert filename '{track_id_from_filename_str}' to integer for metadata lookup. Genre and features unknown.")
                else:
                    # Kept outside the `try` so that only a bad file name is
                    # reported as a conversion problem.
                    current_genre, current_features_info = get_song_metadata_and_features(track_id_int_for_lookup)

                print(f"Genre detected for '{filename}': {current_genre}")
                print(f"Features for '{filename}': {current_features_info}")

                try:
                    full_audio_data, full_samplerate = librosa.load(audio_file_path, sr=None)
                    audio_duration = len(full_audio_data) / full_samplerate
                    print(f"Full audio length: {audio_duration:.2f} seconds.")
                except Exception as e:
                    print(f"ERROR: Unable to load audio file '{filename}': {e}. Skipping this file.")
                    failed_files.append(filename)
                    continue

                total_samples = len(full_audio_data)
                if total_samples < N_FIXED_SEGMENTS:
                    # Empty/too-short audio would produce empty fragments.
                    print(f"ERROR: Audio file '{filename}' has only {total_samples} samples; cannot split it into {N_FIXED_SEGMENTS} fragments. Skipping this file.")
                    failed_files.append(filename)
                    continue

                # --- AUDIO SEGMENT ANALYSIS (EXECUTION AND IMMEDIATE SAVING) ---
                print(f"\n--- SPLITTING AND ANALYZING AUDIO INTO {N_FIXED_SEGMENTS} FRAGMENTS ---")

                # User instruction shared by all fragments of this song.
                base_user_instruction = f"""The genre of the song is '{current_genre}'. {current_features_info} Describe the provided song segment, including the following aspects:
1.  **Instruments present**
2.  **Tempo/rhythm**
3.  **Timbre**
4.  **Emotions evoked**
5.  **Imagined context**
Be as descriptive as possible while keeping the length within the limit.
Your response MUST be in English.
"""

                for i in range(N_FIXED_SEGMENTS):
                    # Integer boundaries: fragments differ by at most one sample
                    # and together cover the file exactly (no dropped tail).
                    start_sample = total_samples * i // N_FIXED_SEGMENTS
                    end_sample = total_samples * (i + 1) // N_FIXED_SEGMENTS

                    current_segment = full_audio_data[start_sample:end_sample]

                    segment_prompt = f"Analyze fragment {i+1}/{N_FIXED_SEGMENTS} of the audio. {base_user_instruction}"

                    response_text, num_tokens_segment = run_audio_inference_on_segment(
                        audio_segment=current_segment,
                        user_prompt=segment_prompt,
                        model=model,
                        processor=processor,
                        segment_samplerate=full_samplerate,
                        max_new_tokens=512
                    )

                    total_output_tokens += num_tokens_segment
                    response_count += 1
                    output_token_lengths.append(num_tokens_segment)

                    torch.cuda.empty_cache()
                    print(f"CUDA cache emptied after analyzing fragment {i+1} for {filename}.")

                    # --- Write row for each fragment (song-level columns only on the first row) ---
                    try:
                        writer.writerow({
                            'Song Name': filename if i == 0 else '',
                            'Genre': current_genre if i == 0 else '',
                            'Features Info': current_features_info if i == 0 else '',
                            'Description Type': f'Fragment {i+1}',
                            'Description': response_text
                        })
                        csvfile.flush()  # persist the row immediately
                    except (OSError, csv.Error, ValueError) as e:
                        print(f"ERROR: Unable to write row for fragment {i+1} for '{filename}': {e}")

                # --- FULL AUDIO ANALYSIS (POSTPONED EXECUTION AND SAVING) ---
                # NOTE(review): the audio feature extractor may cap the input
                # length (presumably ~30 s); confirm that "entire song" holds
                # for tracks longer than that.
                print("\n--- EXECUTING ANALYSIS ON THE ENTIRE AUDIO FILE ---")
                full_song_prompt = f"""Analyze the entire song. Its genre is '{current_genre}'. {current_features_info} Describe  the entire provided song, including the following aspects:
1.  **Instruments present**
2.  **Overall tempo/rhythm**
3.  **General timbre**
4.  **Emotions evoked by the entire track**
5.  **Imagined context for the entire song**
Be as descriptive as possible while keeping the length within the limit.
Your response MUST be in English.
"""
                overall_description, num_tokens_overall = run_audio_inference_on_segment(
                    audio_segment=full_audio_data,
                    user_prompt=full_song_prompt,
                    model=model,
                    processor=processor,
                    segment_samplerate=full_samplerate,
                    max_new_tokens=512
                )
                total_output_tokens += num_tokens_overall
                response_count += 1
                output_token_lengths.append(num_tokens_overall)

                torch.cuda.empty_cache()
                print(f"CUDA cache emptied after full analysis of {filename}.")

                # --- Write row for total description (after fragments) ---
                try:
                    writer.writerow({
                        'Song Name': '',  # Leave blank for visual alignment
                        'Genre': '',
                        'Features Info': '',
                        'Description Type': 'Total',
                        'Description': overall_description
                    })
                    csvfile.flush()  # persist the row immediately
                except (OSError, csv.Error, ValueError) as e:
                    print(f"ERROR: Unable to write total description row for '{filename}': {e}")

                print(f"--- END FILE PROCESSING: {filename} ---\n")

                torch.cuda.empty_cache()
                print(f"CUDA cache emptied after complete processing of {filename}.")

            except Exception as e:
                # Per-file boundary: one bad file (e.g. out-of-memory during
                # generation) must not abort the rest of the folder. Rows
                # already written for this file are kept.
                failed_files.append(filename)
                print(f"ERROR: Processing of '{filename}' failed: {e}. Continuing with the next file.")
                torch.cuda.empty_cache()

except OSError as e:
    print(f"SEVERE ERROR: Unable to open or write to file '{csv_output_filename}': {e}")
else:
    print(f"\nAudio analysis process for the entire folder completed. Results saved in '{csv_output_filename}'.")
    if failed_files:
        print(f"WARNING: {len(failed_files)} file(s) could not be fully processed: {', '.join(failed_files)}")

# --- OUTPUT TOKEN STATISTICS: COMPUTE, SAVE TO CSV, AND REPORT ---
print(f"\nSaving output token statistics to '{token_stats_filename}'...")

try:
    with open(token_stats_filename, 'w', newline='', encoding='utf-8') as stats_csvfile:
        # A single summary row is written under this fixed header.
        stats_fieldnames = ['Metric', 'Value']
        stats_fieldnames += ['Total_Responses_Count', 'Average_Tokens', 'Min_Tokens', 'Max_Tokens']
        stats_fieldnames += ['Above_Average', 'Below_Average', 'Exactly_Average']
        stats_writer = csv.DictWriter(stats_csvfile, fieldnames=stats_fieldnames)
        stats_writer.writeheader()

        if response_count <= 0:
            # Nothing was generated: emit an all-zero placeholder row.
            stats_writer.writerow({
                'Metric': 'No responses generated',
                'Value': '0',
                'Total_Responses_Count': 0, 'Average_Tokens': 0, 'Min_Tokens': 0, 'Max_Tokens': 0,
                'Above_Average': 0, 'Below_Average': 0, 'Exactly_Average': 0
            })
            print("\nNo responses generated to calculate token statistics.")
        else:
            average_tokens = total_output_tokens / response_count
            min_tokens = min(output_token_lengths)
            max_tokens = max(output_token_lengths)

            # Split the responses into above / below / exactly at the mean.
            above_average_count = sum(count > average_tokens for count in output_token_lengths)
            below_average_count = sum(count < average_tokens for count in output_token_lengths)
            at_average_count = response_count - above_average_count - below_average_count

            summary_row = {
                'Metric': 'Overall Statistics',
                'Value': 'N/A',
                'Total_Responses_Count': response_count,
                'Average_Tokens': f"{average_tokens:.2f}",
                'Min_Tokens': min_tokens,
                'Max_Tokens': max_tokens,
                'Above_Average': above_average_count,
                'Below_Average': below_average_count,
                'Exactly_Average': at_average_count
            }
            stats_writer.writerow(summary_row)

            # Mirror the saved numbers on the console.
            report_lines = (
                "\n--- OUTPUT TOKEN STATISTICS ---",
                f"Total number of responses generated: {response_count}",
                f"Average output tokens per response: {average_tokens:.2f}",
                f"Maximum tokens in a response: {max_tokens}",
                f"Minimum tokens in a response: {min_tokens}",
                f"Responses with tokens above average: {above_average_count}",
                f"Responses with tokens below average: {below_average_count}",
                f"Responses with tokens exactly at average: {at_average_count}",
            )
            for report_line in report_lines:
                print(report_line)

    print(f"Token statistics saved in '{token_stats_filename}'.")

except Exception as e:
    print(f"SEVERE ERROR: Unable to save token statistics to '{token_stats_filename}': {e}")

# --- FINAL VERIFICATION: LIST THE CONTENTS OF THE WORKING FOLDER ---
separator = "=" * 50
print("\n" + separator)
print("--- FINAL VERIFICATION: FILES IN /kaggle/working/ FOLDER ---")
print(separator)
try:
    working_dir = "/kaggle/working/"
    for entry_name in os.listdir(working_dir):
        entry_path = os.path.join(working_dir, entry_name)
        if not os.path.isfile(entry_path):
            print(f"- {entry_name} (Directory)")
            continue
        # Regular file: report its size in MB.
        size_mb = os.path.getsize(entry_path) / (1024 * 1024)
        print(f"- {entry_name} (Size: {size_mb:.2f} MB)")
except Exception as e:
    print(f"ERROR listing files in /kaggle/working/: {e}")
print(separator)
print("--- END VERIFICATION ---")



2025-06-16 20:34:23.987480: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750106064.011200     406 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750106064.018118     406 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


*** GPU CONFIRMED: PyTorch detects GPU 0! Name: Tesla T4, Total Memory: 14.74 GiB
*** GPU CONFIRMED: PyTorch detects GPU 1! Name: Tesla T4, Total Memory: 14.74 GiB
Loading Qwen2-Audio-7B-Instruct model...


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Qwen2-Audio model and processor loaded successfully.
CUDA cache emptied after model loading.
Loading metadata from: /kaggle/input/fma-free-music-archive-small-medium/fma_metadata/tracks.csv and /kaggle/input/fma-free-music-archive-small-medium/fma_metadata/features.csv
✅ 'track' columns in tracks.csv: ['bit_rate', 'comments', 'composer', 'date_created', 'date_recorded', 'duration', 'favorites', 'genre_top', 'genres', 'genres_all', 'information', 'interest', 'language_code', 'license', 'listens', 'lyricist', 'number', 'publisher', 'tags', 'title']
✅ 'features' columns in features.csv: [('chroma_cens', 'kurtosis'), ('chroma_cens', 'kurtosis.1'), ('chroma_cens', 'kurtosis.2'), ('chroma_cens', 'kurtosis.3'), ('chroma_cens', 'kurtosis.4'), ('chroma_cens', 'kurtosis.5'), ('chroma_cens', 'kurtosis.6'), ('chroma_cens', 'kurtosis.7'), ('chroma_cens', 'kurtosis.8'), ('chroma_cens', 'kurtosis.9'), ('chroma_cens', 'kurtosis.10'), ('chroma_cens', 'kurtosis.11'), ('chroma_cens', 'max'), ('chroma_cen

  inputs = processor(text=text_input_formatted, audios=audios_np_arrays, sampling_rate=final_samplerate, return_tensors="pt", padding=True)



--- GENERATED TEXT FROM MODEL ---

A male rapper sings this hip-hop piece. There are no other instruments accompanying him. The tempo is medium with a steady drumming rhythm and electronic beats. The beat has a deep and strong bass which gives it a groovy feel. The overall mood of the song is chill and easygoing. It could fit perfectly on a summer day or during a relaxed hangout session with friends.
Tokens generated: 78
--- End of inference on audio segment ---

CUDA cache emptied after analyzing fragment 1 for 000002.mp3.

--- Starting inference on audio segment (length: 7.49s) ---
Resampling audio segment from 44100Hz to 16000Hz.
Preparing inputs for Qwen2-Audio model...
Length of input_ids after tokenization: 375
Inputs ready for generation.
Generating model response... This may take time.

--- GENERATED TEXT FROM MODEL ---

The provided song segment is from a hip-hop track. The instruments present include drums, bass, and possibly keyboards or synthesizers given the electronic na