In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Read the CSV file
df = pd.read_csv('dataset.csv')

# Shuffle the rows with a fixed seed so that restarting the kernel and running
# every cell again reproduces the same row order (same seed as the
# train/test split further down).
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()

In [None]:
# Check the class balance of the "Type" column (number of rows per label)
df['Type'].value_counts()  

In [None]:
# Check the class balance of the "Factual/Subjective" column (number of rows per label)
df['Factual/Subjective'].value_counts()  

In [None]:
# Check the class balance of the "Sentiment" column (number of rows per label)
df['Sentiment'].value_counts()  

In [None]:
# Count the missing values in each column
df.isnull().sum()

In [None]:
# Print the class balance of every label column, each followed by a separator
for label_column in ('Type', 'Factual/Subjective', 'Sentiment'):
    print(df[label_column].value_counts())
    print("-----------------------------")

In [None]:
# Print only the sentences that occur more than once, with their counts
# (a plain value_counts() lists every sentence, unique ones included)
sentence_counts = df['Sentence'].value_counts()
print(sentence_counts[sentence_counts > 1])

In [None]:
# Drop rows whose "Sentence" text duplicates another row.
# The index is not reset here, so it keeps gaps where rows were removed.
df = df.drop_duplicates(subset='Sentence')

In [None]:
# Display the DataFrame after the duplicate sentences were dropped
df

In [None]:
# Print the class balance of every label column, each followed by a separator
for label_column in ('Type', 'Factual/Subjective', 'Sentiment'):
    print(df[label_column].value_counts())
    print("-----------------------------")

In [None]:
# Use the TextBlob library to score the polarity of each sentence
from textblob import TextBlob


def _textblob_polarity(text):
    """Return TextBlob's polarity score for a single sentence."""
    return TextBlob(text).sentiment.polarity


df['polarity'] = df['Sentence'].apply(_textblob_polarity)
df.head()


In [None]:
# Use the TextBlob library to score the subjectivity of each sentence
def _textblob_subjectivity(text):
    """Return TextBlob's subjectivity score for a single sentence."""
    return TextBlob(text).sentiment.subjectivity


df['subjectivity'] = df['Sentence'].apply(_textblob_subjectivity)
df.head()


In [None]:
# Display the DataFrame, which now carries the polarity and subjectivity columns
df

In [None]:
# implement textblob to make the polarity and subjectivity columns more readable
def sentiment(x):
    """Turn a numeric score into a coarse label based on its sign.

    Args:
        x: Number to classify.

    Returns:
        'Negative' when x is below zero, 'Neutral' when x equals zero,
        and 'Positive' in every other case.
    """
    if x < 0:
        return 'Negative'
    return 'Neutral' if x == 0 else 'Positive'
    
def subjectivity_label(score):
    """Label a TextBlob subjectivity score.

    TextBlob reports subjectivity in [0.0, 1.0] (0 = objective,
    1 = subjective), so the sign-based `sentiment` labels do not fit it:
    they would call every non-zero score 'Positive'.

    Args:
        score: Subjectivity score of one sentence.

    Returns:
        'Subjective' for scores of 0.5 or more, otherwise 'Objective'.
        The 0.5 cut-off is a simple midpoint convention; adjust if needed.
    """
    return 'Subjective' if score >= 0.5 else 'Objective'


# Write the labels to new columns. This keeps the numeric scores available and
# lets the cell be re-run: overwriting the scores with strings would make the
# second run compare strings with 0 and fail.
df['polarity_label'] = df['polarity'].apply(sentiment)
df['subjectivity_label'] = df['subjectivity'].apply(subjectivity_label)
df.head()

In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Initialize sentence embedding model
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Encode sentences into embeddings (one per row, in the row order of df)
embeddings = model.encode(df["Sentence"].tolist())

# Label encoding: text label -> integer class id
type_mapping = {"Affirmation": 0, "Negation": 1}
fact_subj_mapping = {"Factual": 0, "Subjective": 1}
sentiment_mapping = {"Sadness": 0, "Anger": 1, "Neutral": 2, "Happiness": 3, "Euphoria": 4}


def encode_labels(labels, mapping):
    """Encode a Series of text labels as a NumPy array of integer class ids.

    Args:
        labels: pandas Series holding the text labels.
        mapping: dict from text label to integer class id.

    Returns:
        Integer NumPy array, positionally aligned with `labels`.

    Raises:
        ValueError: if any value (including a missing one) has no entry in
            `mapping`. Failing here avoids silently training on an extra
            "-1" class made of typos or missing labels.
    """
    encoded = labels.map(mapping)
    unmapped = labels[encoded.isna()].unique().tolist()
    if unmapped:
        raise ValueError(
            f"Column {labels.name!r} contains values without an integer code: {unmapped}"
        )
    return encoded.astype(int).to_numpy()


# The encoded labels are kept in separate arrays and df is left untouched, so
# this cell can be re-run. Overwriting the df columns with integers would make
# a second run map those integers through the dicts and lose every label.
type_labels = encode_labels(df["Type"], type_mapping)
fact_subj_labels = encode_labels(df["Factual/Subjective"], fact_subj_mapping)
sentiment_labels = encode_labels(df["Sentiment"], sentiment_mapping)

In [None]:
# Stratified train/test split: 20% held out, stratified on the sentiment
# labels so both parts keep the same sentiment class proportions
split_arrays = train_test_split(
    embeddings,
    type_labels,
    fact_subj_labels,
    sentiment_labels,
    test_size=0.2,
    random_state=42,
    stratify=sentiment_labels,
)
(
    X_train, X_test,
    y_type_train, y_type_test,
    y_fact_train, y_fact_test,
    y_sent_train, y_sent_test,
) = split_arrays

In [None]:
# Hyperparameter grid for XGBoost; train_xgboost passes it to GridSearchCV.
# The grid spans 4 * 4 * 3 * 2 = 96 parameter combinations.
param_grid = {
    "n_estimators": [50, 100, 200, 500],
    "max_depth": [3, 5, 7, 10],
    "learning_rate": [0.01, 0.1, 0.3],
    "subsample": [0.7, 1.0]
}

In [None]:
# Function to train and evaluate XGBoost with hyperparameter search
def train_xgboost(X_train, y_train, X_test, y_test, name):
    """Grid-search an XGBoost classifier and report its accuracy on the test set.

    The search uses the module-level `param_grid`, 3-fold cross-validation
    and accuracy as the selection score.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.
        name: Task name used in the printed report.

    Returns:
        The best estimator found by the grid search.
    """
    search = GridSearchCV(
        XGBClassifier(eval_metric="mlogloss"),
        param_grid,
        cv=3,
        scoring="accuracy",
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    winner = search.best_estimator_
    test_accuracy = accuracy_score(y_test, winner.predict(X_test))

    print(f"Best XGBoost Model for {name}: {search.best_params_}")
    print(f"Accuracy for {name}: {test_accuracy:.4f}\n")

    return winner

In [None]:
# Train and evaluate the XGBoost model for the "Type" label
type_model = train_xgboost(
    X_train=X_train,
    y_train=y_type_train,
    X_test=X_test,
    y_test=y_type_test,
    name="Type Classification",
)

In [None]:
# Train and evaluate the XGBoost model for the "Factual/Subjective" label
fact_model = train_xgboost(
    X_train=X_train,
    y_train=y_fact_train,
    X_test=X_test,
    y_test=y_fact_test,
    name="Factual/Subjective Classification",
)

In [None]:
# Train and evaluate the XGBoost model for the "Sentiment" label
sentiment_model = train_xgboost(
    X_train=X_train,
    y_train=y_sent_train,
    X_test=X_test,
    y_test=y_sent_test,
    name="Sentiment Classification",
)

### Whisper

In [None]:
import whisper

# Load the Whisper model (you can use "tiny", "base", "small", "medium", "large").
# NOTE: this rebinds `model`, which earlier in the notebook held the
# SentenceTransformer embedding model.
model = whisper.load_model("small")

# Transcribe the audio
audio_path = "audio.mp3"  # Replace with the name of your file
result = model.transcribe(audio_path)

# Show the recognized text
print("Texto reconhecido:")
print(result["text"])


In [None]:
# Transcribe the same file again with the language set to "pt"; the new result
# replaces `result` and is not printed in this cell.
result = model.transcribe(audio_path, language="pt", fp16=False)  # Forces the use of float32, for CPUs


In [None]:
import sounddevice as sd
import numpy as np
import wave
import time
from pydub import AudioSegment
import whisper

# Recording configuration
SAMPLE_RATE = 44100  # Audio quality (sample rate passed to the input stream and the WAV header)
CHANNELS = 1         # Mono

# Global flag that controls the recording: the stream callback only stores
# audio while it is True, and stop_recording sets it back to False
recording = False

def start_recording(filename="output.wav", duration=None):
    """Record audio from the microphone and save it through `stop_recording`.

    Args:
        filename: Path of the WAV file to write.
        duration: Recording length in seconds. When falsy (None or 0) the
            recording runs until it is interrupted with Ctrl+C.

    An interrupt (KeyboardInterrupt) ends the recording in both modes and the
    audio captured so far is still saved; previously an interrupt during a
    timed recording discarded it.
    """
    global recording
    recording = True
    print("🎤 Gravando... Pressiona Ctrl+C para parar.")

    frames = []

    def callback(indata, frames_count, time_info, status):
        # Runs for every captured chunk; copy it because the buffer is only
        # handed to us for the duration of the call.
        if recording:
            frames.append(indata.copy())

    try:
        with sd.InputStream(samplerate=SAMPLE_RATE, channels=CHANNELS, callback=callback):
            if duration:
                time.sleep(duration)
            else:
                while recording:
                    time.sleep(0.1)
    except KeyboardInterrupt:
        # Deliberate: the interrupt is the "stop" signal, not an error.
        pass

    # Save after the stream has been closed, so the callback can no longer
    # append to `frames` while they are being written.
    stop_recording(filename, frames)

def stop_recording(filename, frames):
    """Stop the recording, write the captured audio as WAV and convert it to MP3.

    Args:
        filename: Path of the WAV file to write.
        frames: List of NumPy arrays with the captured chunks. They are
            assumed to hold float samples nominally in [-1, 1], which is what
            the 32767 scaling below implies.

    Raises:
        ValueError: if `frames` is empty, i.e. nothing was captured.
    """
    global recording
    recording = False
    print("⏹️ Gravação terminada. Salvando arquivo...")

    if len(frames) == 0:
        # np.concatenate would also raise ValueError here, but with a message
        # that says nothing about the recording.
        raise ValueError("Nenhum áudio foi capturado; nada para salvar.")

    # Join the chunks into one NumPy array
    audio_data = np.concatenate(frames, axis=0)

    # Clip before scaling: a sample slightly outside [-1, 1] would otherwise
    # wrap around when cast to int16 and produce a loud click.
    pcm = (np.clip(audio_data, -1.0, 1.0) * 32767).astype(np.int16)

    # Save as 16-bit PCM WAV
    with wave.open(filename, "wb") as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(2)
        wf.setframerate(SAMPLE_RATE)
        wf.writeframes(pcm.tobytes())

    # Convert to MP3
    convert_to_mp3(filename)
    
def convert_to_mp3(wav_filename):
    """Convert a WAV file to an MP3 file with the same base name.

    Args:
        wav_filename: Path of the WAV file to convert.

    Returns:
        Path of the MP3 file that was written.

    Raises:
        FileNotFoundError: if the export cannot start. With pydub this usually
            means the ffmpeg (or avconv) executable is not on PATH.
    """
    import os

    # Replace only the final extension. str.replace(".wav", ".mp3") would also
    # rewrite ".wav" inside directory names and leave names without ".wav"
    # unchanged, so the export would overwrite its own input.
    mp3_filename = os.path.splitext(wav_filename)[0] + ".mp3"
    audio = AudioSegment.from_wav(wav_filename)
    try:
        audio.export(mp3_filename, format="mp3")
    except FileNotFoundError as exc:
        raise FileNotFoundError(
            f"Não foi possível exportar '{mp3_filename}'. O pydub precisa do "
            "executável ffmpeg (ou avconv) no PATH para codificar MP3, e a "
            "pasta de destino tem de existir."
        ) from exc
    print(f"🎵 Arquivo salvo como {mp3_filename}")
    return mp3_filename


def transcribe_audio(audio_path):
    """Transcribe an audio file with Whisper, print the text and return it.

    The "small" Whisper model is loaded on every call and the language is
    fixed to English.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The recognized text. It is returned in addition to being printed so
        callers can pass it on to later steps.
    """
    print("📝 Transcrevendo o áudio...")
    model = whisper.load_model("small")
    result = model.transcribe(audio_path, language="en")

    print("📜 Transcrição:")
    print(result["text"])
    return result["text"]

# Run recording and transcription
if __name__ == "__main__":
    audio_file = "output.wav"
    start_recording(audio_file)  # Press Ctrl+C to stop the recording
    mp3_file = audio_file.replace(".wav", ".mp3")
    transcribe_audio(mp3_file)


In [9]:

def transcribe_audio(audio_path):
    """Transcribe an audio file with Whisper, print the text and return it.

    The "small" Whisper model is loaded on every call and the language is
    fixed to English.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The recognized text. It is returned in addition to being printed so
        callers can pass it on to later steps.
    """
    print("📝 Transcrevendo o áudio...")
    model = whisper.load_model("small")
    result = model.transcribe(audio_path, language="en")

    print("📜 Transcrição:")
    print(result["text"])
    return result["text"]

# Run transcription only; recording is disabled, so output.mp3 must already exist
if __name__ == "__main__":
    audio_file = "output.wav"
    #start_recording(audio_file)  # Press Ctrl+C to stop the recording
    mp3_file = audio_file.replace(".wav", ".mp3")
    transcribe_audio(mp3_file)


📝 Transcrevendo o áudio...


FileNotFoundError: [WinError 2] El sistema no puede encontrar el archivo especificado

In [11]:
import sounddevice as sd
import numpy as np
import wave
import time
from pydub import AudioSegment
import whisper

# Recording configuration
SAMPLE_RATE = 44100  # Audio quality (sample rate used for recording and the WAV header)
CHANNELS = 1         # Mono
DURATION = 10        # Maximum recording time (seconds)

def record_audio(filename="output.wav", duration=DURATION):
    """Record from the microphone for a fixed time and write a 16-bit WAV file.

    Args:
        filename: Path of the WAV file to write.
        duration: Recording length in seconds.

    Returns:
        The path that was written (same value as `filename`).
    """
    print(f"🎤 Gravando... ({duration} segundos)")

    frame_total = int(duration * SAMPLE_RATE)
    samples = sd.rec(frame_total, samplerate=SAMPLE_RATE, channels=CHANNELS, dtype=np.int16)
    sd.wait()  # Block until the recording has finished

    # Write the int16 samples as WAV (2 bytes per sample)
    with wave.open(filename, "wb") as wav_out:
        wav_out.setnchannels(CHANNELS)
        wav_out.setsampwidth(2)
        wav_out.setframerate(SAMPLE_RATE)
        wav_out.writeframes(samples.tobytes())

    print("⏹️ Gravação concluída.")
    return filename

def convert_to_mp3(wav_filename):
    """Convert a WAV file to an MP3 file with the same base name.

    Args:
        wav_filename: Path of the WAV file to convert.

    Returns:
        Path of the MP3 file that was written.

    Raises:
        FileNotFoundError: if the export cannot start. With pydub this usually
            means the ffmpeg (or avconv) executable is not on PATH.
    """
    import os

    # Replace only the final extension. str.replace(".wav", ".mp3") would also
    # rewrite ".wav" inside directory names and leave names without ".wav"
    # unchanged, so the export would overwrite its own input.
    mp3_filename = os.path.splitext(wav_filename)[0] + ".mp3"
    audio = AudioSegment.from_wav(wav_filename)
    try:
        audio.export(mp3_filename, format="mp3")
    except FileNotFoundError as exc:
        raise FileNotFoundError(
            f"Não foi possível exportar '{mp3_filename}'. O pydub precisa do "
            "executável ffmpeg (ou avconv) no PATH para codificar MP3, e a "
            "pasta de destino tem de existir."
        ) from exc
    print(f"🎵 Arquivo salvo como {mp3_filename}")
    return mp3_filename

def transcribe_audio(audio_path):
    """Transcribe an audio file with Whisper, print the text and return it.

    The "small" Whisper model is loaded on every call and the language is
    fixed to English.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The recognized text. It is returned in addition to being printed so
        callers can pass it on to later steps.
    """
    print("📝 Transcrevendo o áudio...")
    model = whisper.load_model("small")
    result = model.transcribe(audio_path, language="en")

    print("📜 Transcrição:")
    print(result["text"])
    return result["text"]

# Run recording, conversion and transcription
if __name__ == "__main__":
    audio_file = record_audio("output.wav")  # Records automatically for 10 seconds
    mp3_file = convert_to_mp3(audio_file)    # Converts to MP3
    transcribe_audio(mp3_file)               # Transcribes the audio


🎤 Gravando... (10 segundos)
⏹️ Gravação concluída.


FileNotFoundError: [WinError 2] El sistema no puede encontrar el archivo especificado

In [11]:
import whisper
import os

# Load the model (you can use "tiny", "base", "small", "medium", "large")
model = whisper.load_model("small")

from pydub import AudioSegment

# Check that the WAV file was recorded before converting and transcribing it
wav_file = "output.wav"
mp3_file = "output.mp3"
if os.path.exists(wav_file):
    print("✅ O arquivo WAV existe. Convertendo para MP3...")

    audio = AudioSegment.from_wav(wav_file)
    audio.export(mp3_file, format="mp3")
    print("✅ Conversão concluída!")

    # Transcribe here instead of printing whatever `result` an earlier cell
    # left behind: on a fresh kernel that name does not exist at all.
    result = model.transcribe(mp3_file)

    # Show the recognized text
    print("Texto reconhecido:")
    print(result["text"])
else:
    print("❌ O arquivo WAV NÃO foi encontrado.")


✅ O arquivo WAV existe. Convertendo para MP3...


FileNotFoundError: [WinError 2] El sistema no puede encontrar el archivo especificado

In [None]:
import os
import whisper
import sounddevice as sd
import numpy as np
import wave
from pydub import AudioSegment

# 🔹 Gravar áudio do microfone e salvar como WAV
def record_audio(filename="output.wav", duration=5, samplerate=44100):
    """Record mono audio from the microphone and write it as a 16-bit WAV file.

    Args:
        filename: Path of the WAV file to write.
        duration: Recording length in seconds.
        samplerate: Sample rate in Hz.

    Returns:
        The path that was written (same value as `filename`).
    """
    print("🎙️ Gravando... Fale algo!")
    frame_total = int(duration * samplerate)
    samples = sd.rec(frame_total, samplerate=samplerate, channels=1, dtype=np.int16)
    sd.wait()  # Block until the recording has finished

    # Write the int16 samples as a mono WAV (2 bytes per sample)
    with wave.open(filename, "wb") as wav_out:
        wav_out.setnchannels(1)
        wav_out.setsampwidth(2)
        wav_out.setframerate(samplerate)
        wav_out.writeframes(samples.tobytes())

    print(f"✅ Áudio salvo como {filename}")
    return filename

# 🔹 Converter WAV para MP3
def convert_to_mp3(wav_filename):
    """Convert a WAV file to an MP3 file with the same base name.

    Args:
        wav_filename: Path of the WAV file to convert.

    Returns:
        Path of the MP3 file that was written.

    Raises:
        FileNotFoundError: if the export cannot start. With pydub this usually
            means the ffmpeg (or avconv) executable is not on PATH.
    """
    import os

    # Replace only the final extension. str.replace(".wav", ".mp3") would also
    # rewrite ".wav" inside directory names and leave names without ".wav"
    # unchanged, so the export would overwrite its own input.
    mp3_filename = os.path.splitext(wav_filename)[0] + ".mp3"
    audio = AudioSegment.from_wav(wav_filename)
    try:
        audio.export(mp3_filename, format="mp3")
    except FileNotFoundError as exc:
        raise FileNotFoundError(
            f"Não foi possível exportar '{mp3_filename}'. O pydub precisa do "
            "executável ffmpeg (ou avconv) no PATH para codificar MP3, e a "
            "pasta de destino tem de existir."
        ) from exc
    print(f"🎵 Arquivo convertido para {mp3_filename}")
    return mp3_filename

# 🔹 Transcrever áudio com Whisper
def transcribe_audio(mp3_filename):
    """Transcribe an audio file with Whisper on the CPU, print the text and return it.

    The "small" Whisper model is loaded on every call; no language is forced.

    Args:
        mp3_filename: Path of the audio file to transcribe.

    Returns:
        The recognized text. It is returned in addition to being printed so
        callers can pass it on to later steps.
    """
    print("📝 Transcrevendo áudio...")
    model = whisper.load_model("small", device="cpu")  # Models: "tiny", "base", "small", "medium", "large"
    result = model.transcribe(mp3_filename)
    print("\n📝 Texto reconhecido:")
    print(result["text"])
    return result["text"]

# Run the whole process
if __name__ == "__main__":
    wav_file = record_audio(duration=5)  # Records for 5 seconds
    mp3_file = convert_to_mp3(wav_file)  # Converts to MP3
    # NOTE(review): the WAV file is transcribed here, so `mp3_file` is unused.
    # Confirm whether the MP3 was meant to be passed instead.
    transcribe_audio(wav_file)  # Transcribes with Whisper



🎙️ Gravando... Fale algo!
✅ Áudio salvo como output.wav


FileNotFoundError: [WinError 2] El sistema no puede encontrar el archivo especificado

In [1]:
pip install openai-whisper 

Note: you may need to restart the kernel to use updated packages.


In [None]:
import sounddevice as sd
import numpy as np
import scipy.io.wavfile as wavfile
import datetime
import os

def record_audio(duration=5, sample_rate=44100, channels=1):
    """
    Record audio from the microphone for a fixed amount of time.

    Args:
        duration (int): Recording length in seconds (default 5).
        sample_rate (int): Sample rate in Hz (default 44100).
        channels (int): Number of channels (1 for mono, 2 for stereo; default 1).

    Returns:
        tuple: (audio_data, sample_rate), where audio_data is the recorded
        numpy.ndarray of float32 samples and sample_rate is the value passed in.
    """
    print(f"Grabando audio durante {duration} segundos... Habla al micrófono.")

    # Record the requested number of frames and block until it is done
    frame_total = int(duration * sample_rate)
    recording_buffer = sd.rec(
        frame_total,
        samplerate=sample_rate,
        channels=channels,
        dtype='float32',
    )
    sd.wait()

    print("Grabación completada.")
    return recording_buffer, sample_rate

def save_as_wav(audio_data, sample_rate, output_path):
    """
    Save float audio samples as a 16-bit PCM WAV file.

    Args:
        audio_data (numpy.ndarray): Recorded samples as floats, nominally in
            [-1.0, 1.0].
        sample_rate (int): Sample rate in Hz.
        output_path (str): Path of the WAV file to write.
    """
    # Scale to the int16 range and clip before casting. Without the clip a
    # sample of 1.0 (or anything above it) becomes 32768, which does not fit
    # in int16 and wraps around to a large negative value.
    scaled = np.clip(np.asarray(audio_data) * 32768, -32768, 32767)
    pcm = scaled.astype(np.int16)

    # Save as WAV
    wavfile.write(output_path, sample_rate, pcm)
    print(f"Archivo guardado como: {output_path}")

def main():
    """Record a short clip from the microphone and save it as a timestamped WAV file."""
    # Settings: length in seconds, standard sample rate, mono
    duration, sample_rate, channels = 5, 44100, 1

    # Build a unique file name from the current date and time
    stamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = "recorded_audio"
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "recording_" + stamp + ".wav")  # Saved as WAV

    # Record, then write the samples to disk
    audio_data, sample_rate = record_audio(duration, sample_rate, channels)
    save_as_wav(audio_data, sample_rate, output_path)

    # Note: without ffmpeg the clip cannot be converted to MP3 directly here.
    # For MP3 you would need to install lameenc or use ffmpeg (recommended).
    print("Para convertir a MP3 sin ffmpeg, instala 'lameenc' y descomenta la función save_as_mp3 abajo.")
    # Uncomment and adapt if you install lameenc:
    # from lameenc import Encoder
    # save_as_mp3(audio_data, sample_rate, output_path.replace(".wav", ".mp3"))


if __name__ == "__main__":
    main()

Grabando audio durante 5 segundos... Habla al micrófono.
Grabación completada.
Archivo guardado como: recorded_audio\recording_20250309_204635.wav
Para convertir a MP3 sin ffmpeg, instala 'lameenc' y descomenta la función save_as_mp3 abajo.


: 