In [1]:
import io
import os

In [38]:
from google.cloud import speech
from google.cloud import language_v1

In [25]:
client = speech.SpeechClient.from_service_account_file('key.json')

# The name of the audio file to transcribe
file_name = "files/audio.wav"

# Loads the audio into memory
with io.open(file_name, 'rb') as audio_file:
    content = audio_file.read()
    audio = speech.RecognitionAudio(content=content)

config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED,
    sample_rate_hertz=48000,
    language_code='en-US')

# Detects speech in the audio file
response = client.recognize(config=config, audio=audio)
print(response)
transcription = ''
for result in response.results:
    transcription += result.alternatives[0].transcript

# Saves the transcribed text to a .txt file in the same directory
file = open("transcription.txt", "w")
file.write(transcription)
file.close()

results {
  alternatives {
    transcript: "English Learners that features natural unedited conversations between native speakers about interesting topics or hot issues or current events today I\'m joined by my co-host Anna Diana hi Andrew and hello listeners now I got a question for you Andrew yes right off the bat I love it what\'s up there\'s a famous seasonal song that often gets played around the holidays I\'m one of the lyrics is it"
    confidence: 0.9480116367340088
  }
  result_end_time {
    seconds: 29
    nanos: 40000000
  }
  language_code: "en-us"
}
total_billed_time {
  seconds: 30
}
request_id: 3698269767929242667



In [40]:
import io
import os

# Imports the Google Cloud client library
from google.cloud import speech

# Instantiates a client
client = speech.SpeechClient.from_service_account_file('key.json')

# The name of the audio file to transcribe
file_name = "files/audio.wav"

# Loads the audio into memory
with io.open(file_name, 'rb') as audio_file:
    content = audio_file.read()
    audio = speech.RecognitionAudio(content=content)
    
diarization_config = speech.SpeakerDiarizationConfig(
    enable_speaker_diarization=True,
    min_speaker_count=2,
    max_speaker_count=10,
)

config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED,
    sample_rate_hertz=48000,
    language_code='en-US',
    diarization_config=diarization_config,
)

# Detects speech in the audio file and performs speaker diarization
print("Waiting for operation to complete...")
response = client.recognize(config=config, audio=audio)

# Initialize speaker variables

speaker_id = 0
speaker_start = 0.0
dialogue = ""

for result in response.results:
    for word in result.alternatives[0].words:
        if word.speaker_tag != speaker_id:
            speaker_id = word.speaker_tag
            speaker_start = word.start_time.seconds
            speaker_start += word.start_time.seconds / 1000000000.0
            dialogue += "\n Speaker {}: \n".format(speaker_id)
        dialogue += word.word + " "
    speaker_end = result.alternatives[0].words[-1].end_time.seconds
    speaker_end += result.alternatives[0].words[-1].end_time.seconds / 1000000000.0
    dialogue += "\n ({:.2f}s - {:.2f}s)\n".format(speaker_start, speaker_end)

# Saves the transcribed and speaker diarized text to a .txt file in the same directory
file = open("diarization_result.txt", "w")
file.write(dialogue)
file.close()

print(dialogue)

Waiting for operation to complete...
English Learners that features natural unedited conversations between native speakers about interesting topics or hot issues or current events today I'm joined by my co-host Anna Diana hi Andrew and hello listeners now I got a question for you Andrew yes right off the bat I love it what's up there's a famous seasonal song that often gets played around the holidays I'm one of the lyrics is it 
 (0.00s - 28.00s)

 Speaker 1: 
English Learners that features natural unedited conversations between native speakers about interesting topics or hot issues or current events today I'm joined by my co-host Anna 
 Speaker 4: 
Diana hi 
 Speaker 3: 
Andrew and hello listeners now I got a question for you Andrew 
 Speaker 1: 
yes right off the bat I love it what's up 
 Speaker 3: 
there's a famous seasonal song that often gets played around the holidays I'm one of the lyrics 
 Speaker 5: 
is it 
 (28.00s - 28.00s)



In [41]:
print(result)

alternatives {
  words {
    start_time {
    }
    end_time {
      nanos: 500000000
    }
    word: "English"
    speaker_tag: 1
  }
  words {
    start_time {
      nanos: 500000000
    }
    end_time {
      seconds: 1
    }
    word: "Learners"
    speaker_tag: 1
  }
  words {
    start_time {
      seconds: 1
    }
    end_time {
      seconds: 1
      nanos: 600000000
    }
    word: "that"
    speaker_tag: 1
  }
  words {
    start_time {
      seconds: 1
      nanos: 600000000
    }
    end_time {
      seconds: 2
    }
    word: "features"
    speaker_tag: 1
  }
  words {
    start_time {
      seconds: 2
    }
    end_time {
      seconds: 2
      nanos: 300000000
    }
    word: "natural"
    speaker_tag: 1
  }
  words {
    start_time {
      seconds: 2
      nanos: 300000000
    }
    end_time {
      seconds: 3
      nanos: 400000000
    }
    word: "unedited"
    speaker_tag: 1
  }
  words {
    start_time {
      seconds: 3
      nanos: 400000000
    }
    end_time {
 

In [58]:
import argparse
import io
import json
import os

from google.cloud import language_v1
import numpy
import six

def classify(text, verbose=True):
    """Classify the input text into categories."""

    language_client = language_v1.LanguageServiceClient.from_service_account_file('key.json')

    document = language_v1.Document(
        content=text, type_=language_v1.Document.Type.PLAIN_TEXT
    )
    response = language_client.classify_text(request={"document": document})
    categories = response.categories

    result = {}
    topics=""

    for category in categories:
        # Turn the categories into a dictionary of the form:
        # {category.name: category.confidence}, so that they can
        # be treated as a sparse vector.
        result[category.name] = category.confidence

    if verbose:
        for category in categories:
           
            
            topics+=category.name+" \n"
            topics+=str(category.confidence)
            print(topics)

    return result

text="we need to save money why we need to save money to buy a house is so expensive how much do we need to save to save enough for a down payment how much is that it's about $1,000 that will take forever we save every penny okay here's 7 pennies"
classify(dialogue)

/Jobs & Education/Education 
0.9100000262260437
/Jobs & Education/Education 
0.9100000262260437/Reference/Language Resources/Foreign Language Resources 
0.800000011920929


{'/Jobs & Education/Education': 0.9100000262260437,
 '/Reference/Language Resources/Foreign Language Resources': 0.800000011920929}

In [60]:
from google.cloud import speech_v1p1beta1 as speech

client = speech.SpeechClient.from_service_account_file('key.json')

speech_file = "files/audio2.wav"

with open(speech_file, "rb") as audio_file:
    content = audio_file.read()

audio = speech.RecognitionAudio(content=content)

diarization_config = speech.SpeakerDiarizationConfig(
    enable_speaker_diarization=True,
    min_speaker_count=2,
    max_speaker_count=10,
)

config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=48000,
    language_code="en-US",
    diarization_config=diarization_config,
)

print("Waiting for operation to complete...")
response = client.recognize(config=config, audio=audio)

# The transcript within each result is separate and sequential per result.
# However, the words list within an alternative includes all the words
# from all the results thus far. Thus, to get all the words with speaker
# tags, you only have to take the words list from the last result:
result = response.results[-1]

words_info = result.alternatives[0].words

# Printing out the output:
for word_info in words_info:
    print(
        "word: '{}', speaker_tag: {}".format(word_info.word, word_info.speaker_tag)
    )

Waiting for operation to complete...
word: 'English', speaker_tag: 2
word: 'Learners', speaker_tag: 2
word: 'that', speaker_tag: 2
word: 'features', speaker_tag: 2
word: 'natural', speaker_tag: 2
word: 'unedited', speaker_tag: 2
word: 'conversations', speaker_tag: 2
word: 'between', speaker_tag: 2
word: 'native', speaker_tag: 2
word: 'speakers', speaker_tag: 2
word: 'about', speaker_tag: 2
word: 'interesting', speaker_tag: 2
word: 'topics', speaker_tag: 2
word: 'or', speaker_tag: 2
word: 'hot', speaker_tag: 2
word: 'issues', speaker_tag: 2
word: 'or', speaker_tag: 2
word: 'current', speaker_tag: 2
word: 'events', speaker_tag: 2
word: 'today', speaker_tag: 2
word: 'I'm', speaker_tag: 2
word: 'joined', speaker_tag: 2
word: 'by', speaker_tag: 2
word: 'my', speaker_tag: 2
word: 'co-host', speaker_tag: 2
word: 'Anna', speaker_tag: 2
word: 'Diana', speaker_tag: 3
word: 'hi', speaker_tag: 3
word: 'Andrew', speaker_tag: 1
word: 'and', speaker_tag: 1
word: 'hello', speaker_tag: 1
word: 'listene

In [None]:
from pydub import AudioSegment
import webrtcvad
import numpy as np

# Load the audio file
sound = AudioSegment.from_file("path/to/audio.wav", format="wav")

# Convert the audio to a raw PCM format
raw_audio = sound.raw_data

# Set the aggressiveness mode for the VAD (0-3)
aggressiveness = 2

# Create a VAD object
vad = webrtcvad.Vad(aggressiveness)

# Define the frame size and stride in milliseconds
frame_size = 30
frame_stride = 10

# Calculate the number of frames
frame_bytes = int(round(frame_size * sound.frame_rate * sound.channels * sound.sample_width / 1000))
frame_stride_bytes = int(round(frame_stride * sound.frame_rate * sound.channels * sound.sample_width / 1000))

# Split the audio into overlapping frames
frames = [raw_audio[i:i + frame_bytes] for i in range(0, len(raw_audio), frame_stride_bytes)]

# Initialize an array to store the speaker labels
labels = np.zeros(len(frames))

# Initialize the current speaker label
current_label = 0

# Iterate over each frame
for i, frame in enumerate(frames):
    # Use the VAD to check if the frame contains speech
    is_speech = vad.is_speech(frame, sound.frame_rate)

    # If the frame contains speech, assign it the current speaker label
    if is_speech:
        labels[i] = current_label

        # Increment the current speaker label
        current_label += 1

# Print the speaker labels
print(labels)
