In [None]:
# Upload the trained-model archive through google.colab's upload helper;
# whatever files.upload() returns is kept in `uploaded`.
from google.colab import files
uploaded = files.upload()

Saving trained_models.zip to trained_models.zip


In [None]:
# Extract the uploaded archive into /content/model_directory.
# NOTE(review): on a re-run the extracted files already exist; unzip will presumably
# stop to ask before overwriting unless -o is passed — confirm and add it if so.
!unzip /content/trained_models.zip -d /content/model_directory

Archive:  /content/trained_models.zip
  inflating: /content/model_directory/main_classifier_model.pth  
   creating: /content/model_directory/dialog_act_model/
  inflating: /content/model_directory/dialog_act_model/model.safetensors  
  inflating: /content/model_directory/dialog_act_model/tokenizer_config.json  
  inflating: /content/model_directory/dialog_act_model/vocab.txt  
  inflating: /content/model_directory/dialog_act_model/special_tokens_map.json  
  inflating: /content/model_directory/dialog_act_model/config.json  


In [None]:
# Install dependencies
!pip install torch transformers numpy

# Import necessary libraries
import torch
import torch.nn as nn
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import numpy as np


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
# `logging` was never imported in the notebook, so this cell raised NameError on a
# fresh kernel; import it here so the cell is self-contained.
import logging

# force=True replaces any handlers the hosting environment already installed on the
# root logger. Without it basicConfig() is a silent no-op when handlers exist, and
# the INFO messages emitted by later cells never show up.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)
logger = logging.getLogger(__name__)

In [None]:
# Configuration
# Locations of the artifacts extracted from trained_models.zip.
DIALOG_ACT_MODEL_DIR = "/content/model_directory/dialog_act_model"
MAIN_CLASSIFIER_PATH = "/content/model_directory/main_classifier_model.pth"

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Emotion name -> index of its slot in the 7-wide emotion one-hot vector.
# NOTE(review): labelled "from DailyDialog" originally, but the dataset's own ids
# are believed to put "no emotion" at 0 — confirm this ordering matches the one
# the classifier was trained with.
EMOTION_MAP = {
    name: index
    for index, name in enumerate(
        ('anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'neutral')
    )
}


In [None]:


# Define MainClassifier (from Cell 7)
class MainClassifier(nn.Module):
    """Feed-forward head that turns a feature vector into two class logits.

    Layout: input_dim -> 256 -> 128 -> 2, with ReLU after each hidden layer and
    dropout (p=0.3) after the first one only. The default input width, 768+4+7,
    matches the feature built in predict_directed (CLS embedding, dialogue-act
    one-hot, emotion one-hot).

    The attribute names (fc1, fc2, fc3) are the keys of the saved state_dict and
    must not be renamed.
    """

    def __init__(self, input_dim=768+4+7):
        super().__init__()
        first_hidden, second_hidden, num_classes = 256, 128, 2
        self.fc1 = nn.Linear(input_dim, first_hidden)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.fc2 = nn.Linear(first_hidden, second_hidden)
        self.fc3 = nn.Linear(second_hidden, num_classes)

    def forward(self, x):
        """Return raw (unnormalised) logits of shape (..., 2) for input `x`."""
        hidden = self.dropout(self.relu(self.fc1(x)))
        hidden = self.relu(self.fc2(hidden))
        return self.fc3(hidden)

# Assuming a logger is set up elsewhere, if not, you might need a basic one
# import logging
# logging.basicConfig(level=logging.INFO)
# logger = logging.getLogger(__name__)

# Load models
try:
    # DistilBERT dialogue-act classifier and its tokenizer, read from the extracted model directory.
    tokenizer = DistilBertTokenizer.from_pretrained(DIALOG_ACT_MODEL_DIR)
    model = DistilBertForSequenceClassification.from_pretrained(DIALOG_ACT_MODEL_DIR).to(DEVICE)
    model.eval()  # inference only

    # MLP head fed with [CLS embedding (768) | dialogue-act one-hot (4) | emotion one-hot (7)].
    main_model = MainClassifier(input_dim=768+4+7).to(DEVICE)
    # torch.load unpickles the file. weights_only=True restricts unpickling to tensors
    # and plain containers, so a tampered upload cannot run arbitrary code on load.
    state_dict = torch.load(MAIN_CLASSIFIER_PATH, map_location=DEVICE, weights_only=True)
    main_model.load_state_dict(state_dict)
    main_model.eval()  # inference only
except Exception as e:
    # Report the failure in the cell output, then re-raise so the notebook stops here.
    print(f"Error loading models: {e}")
    raise

# Modified predict_directed with emotion input
# It is generally better practice to pass models and tokenizers explicitly
# rather than relying on default arguments set based on global variables.
def predict_directed(text, prev_text="", emotion="neutral", tokenizer=None, model=None, main_model=None):
    """Classify an utterance as "Directed" or "Not directed".

    The utterance (optionally preceded by the previous utterance) is encoded by
    `model`; its first-token embedding is concatenated with a one-hot of the
    predicted dialogue act (4 classes) and a one-hot of the given emotion
    (7 classes), and the result is fed to `main_model`.

    Args:
        text: Current utterance; must be a non-empty string.
        prev_text: Previous utterance; anything that is not a string is treated as "".
        emotion: Either an EMOTION_MAP key (case-insensitive, surrounding whitespace
            ignored) or an integer index in 0..6. Unknown names, out-of-range
            indices, booleans and other types fall back to neutral (index 6).
        tokenizer: Tokenizer matching `model`.
        model: Sequence-classification model whose output exposes `logits` and
            hidden states.
        main_model: Classifier head consuming the 768+4+7 feature vector.

    Returns:
        "Directed" if the head predicts class 1, otherwise "Not directed".

    Raises:
        ValueError: If tokenizer/model/main_model is missing or `text` is not a
            non-empty string.
        AttributeError: If the model output carries no usable hidden state.
    """
    num_acts, num_emotions, neutral_idx = 4, 7, 6

    if tokenizer is None or model is None or main_model is None:
        raise ValueError("Models and tokenizer must be provided to the predict_directed function.")
    if not isinstance(text, str) or not text.strip():
        raise ValueError("Input text must be a non-empty string")
    if not isinstance(prev_text, str):
        prev_text = ""

    # Resolve the emotion to a one-hot index.
    if isinstance(emotion, str):
        # strip() so inputs such as "sadness " are not silently turned into neutral.
        emotion_idx = EMOTION_MAP.get(emotion.strip().lower(), neutral_idx)
    elif (
        isinstance(emotion, (int, np.integer))
        and not isinstance(emotion, bool)  # bool is an int subclass; True must not mean index 1
        and 0 <= emotion < num_emotions
    ):
        # int() also normalises NumPy integer scalars (e.g. an argmax result).
        emotion_idx = int(emotion)
    else:
        emotion_idx = neutral_idx

    # The literal "[SEP]" is always inserted, even without previous text.
    # NOTE(review): presumably this mirrors the training-time text format — confirm.
    previous = prev_text.strip()
    combined_text = f"{previous} [SEP] {text}" if previous else f"[SEP] {text}"
    inputs = tokenizer(combined_text, return_tensors='pt', padding=True, truncation=True).to(DEVICE)

    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
        hidden_states = getattr(outputs, 'hidden_states', None)
        if isinstance(hidden_states, tuple) and len(hidden_states) > 0:
            # Last layer, first token position.
            cls_embedding = hidden_states[-1][:, 0, :]
        elif hasattr(outputs, 'last_hidden_state'):
            cls_embedding = outputs.last_hidden_state[:, 0, :]
        else:
            raise AttributeError("Model output does not have expected hidden state attribute.")

        pred_act = torch.argmax(outputs.logits, dim=1).item()

    one_hot_act = np.eye(num_acts)[pred_act]
    one_hot_emotion = np.eye(num_emotions)[emotion_idx]
    feature = np.concatenate(
        [cls_embedding.cpu().numpy(), one_hot_act[None, :], one_hot_emotion[None, :]],
        axis=1,
    )
    feature_tensor = torch.tensor(feature, dtype=torch.float32).to(DEVICE)

    with torch.no_grad():
        output = main_model(feature_tensor)
        _, prediction = torch.max(output, 1)
        label = prediction.item()

    return "Directed" if label == 1 else "Not directed"



In [None]:
# Smoke test: same utterance pair, once per emotion (default first, then a
# mixed-case name to exercise the case-insensitive lookup).
text = "can you make a call?"
prev_text = "I'm so tired after work."

for run_index, emotion in enumerate(("neutral", "Sadness")):
    # Pass the loaded models and tokenizer to the function
    result = predict_directed(text, prev_text, emotion=emotion, tokenizer=tokenizer, model=model, main_model=main_model)
    # Reports after the first are separated by a blank line.
    print(f"Text: '{text}'" if run_index == 0 else f"\nText: '{text}'")
    print(f"Previous Text: '{prev_text}'")
    print(f"Emotion: '{emotion}'")
    print(f"Prediction: {result}")


Text: 'can you make a call?'
Previous Text: 'I'm so tired after work.'
Emotion: 'neutral'
Prediction: Directed

Text: 'can you make a call?'
Previous Text: 'I'm so tired after work.'
Emotion: 'Sadness'
Prediction: Not directed


In [None]:
# Smoke test on a second utterance pair, once per emotion (default first, then a
# mixed-case name to exercise the case-insensitive lookup).
text = "What is the time?"
prev_text = "What is the weather today ?"

for run_index, emotion in enumerate(("neutral", "Sadness")):
    # Pass the loaded models and tokenizer to the function
    result = predict_directed(text, prev_text, emotion=emotion, tokenizer=tokenizer, model=model, main_model=main_model)
    # Reports after the first are separated by a blank line.
    print(f"Text: '{text}'" if run_index == 0 else f"\nText: '{text}'")
    print(f"Previous Text: '{prev_text}'")
    print(f"Emotion: '{emotion}'")
    print(f"Prediction: {result}")


Text: 'What is the time?'
Previous Text: 'What is the weather today ?'
Emotion: 'neutral'
Prediction: Not directed

Text: 'What is the time?'
Previous Text: 'What is the weather today ?'
Emotion: 'Sadness'
Prediction: Not directed


In [None]:
# Short labels produced by the audio emotion classifier -> emotion names used as
# EMOTION_MAP keys. Labels missing here are handled by the caller's fallback.
# NOTE(review): the superb ER checkpoint is believed to emit only
# neu/hap/ang/sad, which would leave the other keys unused — confirm.
EMOTION_CLASSIFIER_MAP = dict(
    ang='anger',
    dis='disgust',
    fea='fear',
    hap='joy',
    sad='sadness',
    neu='neutral',
    sur='neutral',  # Map surprise to neutral
)

In [None]:
from transformers import pipeline, AutoFeatureExtractor, HubertForSequenceClassification

# Checkpoint shared by the emotion feature extractor and the emotion classifier.
HUBERT_ER_CHECKPOINT = "superb/hubert-large-superb-er"

try:
    # Speech-to-text: Whisper (tiny) behind an ASR pipeline placed on DEVICE.
    transcriber = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-tiny",
        device=DEVICE,
    )
    logger.info("Whisper transcription model loaded successfully")

    # Speech emotion recognition: feature extractor plus HuBERT classifier,
    # the latter moved to DEVICE and switched to inference mode.
    emotion_processor = AutoFeatureExtractor.from_pretrained(HUBERT_ER_CHECKPOINT)
    emotion_model = HubertForSequenceClassification.from_pretrained(HUBERT_ER_CHECKPOINT)
    emotion_model = emotion_model.to(DEVICE)
    emotion_model.eval()
    logger.info("Hubert emotion model loaded successfully")

except Exception as load_error:
    # Log the failure, then re-raise so the notebook stops at this cell.
    logger.error(f"Error loading models: {load_error}")
    raise


Device set to use cuda


pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

In [None]:
# Audio transcription function
def transcribe_audio(audio_path):
    """Transcribe an audio file with the module-level `transcriber`.

    Args:
        audio_path: Value handed to `transcriber` unchanged (a file path in this notebook).

    Returns:
        The "text" entry of the transcriber's result, exactly as produced.

    Raises:
        Whatever the transcriber (or the result lookup) raises; the error is
        logged at ERROR level first and then re-raised unchanged.
    """
    try:
        asr_output = transcriber(audio_path)
        spoken_text = asr_output["text"]
        logger.info(f"Transcribed audio from {audio_path}: {spoken_text}")
    except Exception as err:
        logger.error(f"Error transcribing audio from {audio_path}: {err}")
        raise
    else:
        return spoken_text

In [None]:
def extract_emotion(audio_path, processor, emotion_model):
    """Predict the emotion of an audio file and map it to an EMOTION_MAP-style name.

    Args:
        audio_path: Path of the audio file to load with torchaudio.
        processor: Feature extractor matching `emotion_model`; called with a 1-D
            mono waveform sampled at 16 kHz.
        emotion_model: Audio classification model whose output has `logits` and
            whose `config.id2label` maps class ids to label strings.

    Returns:
        The EMOTION_CLASSIFIER_MAP entry for the predicted label, or 'neutral'
        when the label has no entry.

    Raises:
        Any exception from loading, feature extraction or inference; it is logged
        at ERROR level and re-raised unchanged.
    """
    # torchaudio is not imported anywhere else in the notebook; import it here so
    # the function works on a fresh kernel.
    import torchaudio

    target_rate = 16000
    try:
        waveform, sample_rate = torchaudio.load(audio_path)
        if sample_rate != target_rate:
            resampler = torchaudio.transforms.Resample(sample_rate, target_rate)
            waveform = resampler(waveform)
        # Average over the leading (channel) axis: identical to squeeze(0) for mono
        # input, and a proper downmix for multi-channel files, which squeeze(0)
        # would have passed through unchanged.
        waveform = waveform.mean(dim=0)
        # The feature extractor works on CPU arrays; only its output is moved to DEVICE.
        inputs = processor(
            waveform.numpy(), sampling_rate=target_rate, return_tensors="pt", padding=True
        ).to(DEVICE)
        with torch.no_grad():
            logits = emotion_model(**inputs).logits
        predicted_id = torch.argmax(logits, dim=-1).item()
        # Decode with the model's own label table. A hand-written label list can
        # disagree with the checkpoint's class order and size, yielding wrong
        # emotions (or an IndexError).
        predicted_emotion = emotion_model.config.id2label[predicted_id]
        mapped_emotion = EMOTION_CLASSIFIER_MAP.get(predicted_emotion, 'neutral')
        logger.info(f"Extracted emotion from {audio_path}: {mapped_emotion}")
        return mapped_emotion
    except Exception as e:
        logger.error(f"Error extracting emotion from {audio_path}: {e}")
        raise

In [None]:
# End-to-end run: transcribe both clips, read the emotion from the current clip,
# then classify the current utterance.
text_audio_path = "/content/Cnyoutell.wav"  # Update with your uploaded WAV file for text
prev_text_audio_path = "/content/whatsthetime.wav"  # Update with your uploaded WAV file for prev_text
try:
    transcribed_text = transcribe_audio(text_audio_path)
    transcribed_prev_text = transcribe_audio(prev_text_audio_path)
    extracted_emotion = extract_emotion(text_audio_path, emotion_processor, emotion_model)
    result = predict_directed(
        text=transcribed_text,
        prev_text=transcribed_prev_text,
        emotion=extracted_emotion,
        tokenizer=tokenizer,
        model=model,
        main_model=main_model
    )
    print(f"\nTranscribed Text: '{transcribed_text}'")
    print(f"Transcribed Previous Text: '{transcribed_prev_text}'")
    print(f"Extracted Emotion: '{extracted_emotion}'")
    print(f"Prediction: {result}")
except Exception as e:
    # Deliberately not re-raised so the cell finishes. logger.exception records the
    # traceback along with the message, so the failing step can still be located.
    logger.exception(f"Error processing audio: {e}")




Transcribed Text: ' Can you tell me the weather?'
Transcribed Previous Text: ' What is the time?'
Extracted Emotion: 'disgust'
Prediction: Directed
