# Task 1-1. Feature Extraction
Extract two feature sets that you feel would be useful for the DAR problem. One feature
set should be text-based, and the other feature set should be speech-based. Save text-based and speech-based feature sets as text_features_{train, valid, test}.csv and speech_features_{train, valid, test}.csv, respectively.

- columns: dialog_id, speaker, da_tag, start_time, end_time

In [2]:
!pip install praat-parselmouth

Collecting praat-parselmouth
  Downloading praat_parselmouth-0.4.5-cp312-cp312-macosx_11_0_arm64.whl.metadata (2.9 kB)
Downloading praat_parselmouth-0.4.5-cp312-cp312-macosx_11_0_arm64.whl (8.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.7/8.7 MB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: praat-parselmouth
Successfully installed praat-parselmouth-0.4.5


In [3]:
import pandas as pd
import numpy as np
import os
import parselmouth
from parselmouth.praat import call

## Feature extraction

In [4]:
train_df = pd.read_csv("../train.csv")
valid_df = pd.read_csv("../valid.csv")
test_df = pd.read_csv("../test.csv")

In [7]:
print(train_df.columns)

Index(['dialog_id', 'speaker', 'transcript', 'da_tag', 'start_time',
       'end_time', 'function', 'pronoun', 'ppron', 'i', 'we', 'you', 'shehe',
       'they', 'ipron', 'article', 'prep', 'auxverb', 'adverb', 'conj',
       'negate', 'verb', 'adj', 'compare', 'interrog', 'number', 'quant',
       'affect', 'posemo', 'negemo', 'anx', 'anger', 'sad', 'social', 'family',
       'friend', 'female', 'male', 'cogproc', 'insight', 'cause', 'discrep',
       'tentat', 'certain', 'differ', 'percept', 'see', 'hear', 'feel', 'bio',
       'body', 'health', 'sexual', 'ingest', 'drives', 'affiliation',
       'achieve', 'power', 'reward', 'risk', 'focuspast', 'focuspresent',
       'focusfuture', 'relativ', 'motion', 'space', 'time', 'work', 'leisure',
       'home', 'money', 'relig', 'death', 'informal', 'swear', 'netspeak',
       'assent', 'nonflu', 'filler'],
      dtype='object')


In [8]:
print(train_df.head())

  dialog_id speaker                                         transcript da_tag  \
0    sw2005       B                                                SIL      x   
1    sw2005       A                                                SIL      x   
2    sw2005       B  well of course it's you know it's one of the l...     sv   
3    sw2005       A                                                yes      b   
4    sw2005       A                                                SIL      x   

   start_time  end_time  function   pronoun     ppron    i  ...  home  money  \
0     0.00000  10.94882  0.000000  0.000000  0.000000  0.0  ...   0.0    0.0   
1    10.93013  21.35084  0.000000  0.000000  0.000000  0.0  ...   0.0    0.0   
2    10.94882  22.28000  0.642857  0.285714  0.190476  0.0  ...   0.0    0.0   
3    21.35084  22.08088  0.000000  0.000000  0.000000  0.0  ...   0.0    0.0   
4    22.08088  44.96288  0.000000  0.000000  0.000000  0.0  ...   0.0    0.0   

   relig  death  informal  swear

### Speech-based features
- base columns: ["dialog_id", "speaker", "da_tag", "start_time", "end_time"]

In [9]:
print(len(os.listdir("../wav")))
print(os.listdir("../wav")[:5])

1016
['sw4733_A.wav', 'sw3223_A.wav', 'sw2205_B.wav', 'sw3691_B.wav', 'sw4038_B.wav']


In [11]:
def extractPitch(filename, time_step = 0.01, pitch_floor = 75, pitch_ceiling=600):
  pitch = call(filename, "To Pitch", time_step, pitch_floor, pitch_ceiling)

  min_time, max_time, unit, interpolation_method = 0.0, 0.0, "Hertz", "Parabolic"
  pitch_min = call(pitch, "Get minimum", min_time, max_time, unit, interpolation_method)
  pitch_max = call(pitch, "Get maximum", min_time, max_time, unit, interpolation_method)
  pitch_mean = call(pitch, "Get mean", min_time, max_time, unit)
  pitch_std = call(pitch, "Get standard deviation", min_time, max_time, unit)

  return pitch_min, pitch_max, pitch_mean, pitch_std

In [12]:
def extractIntensity(filename, time_step=0.01, pitch_floor=100):
  intensity = call(filename, "To Intensity", pitch_floor, time_step)

  intensity_min = call(intensity, "Get minimum", 0, 0, "Parabolic")
  intensity_max = call(intensity, "Get maximum", 0, 0, "Parabolic")
  intensity_mean = call(intensity, "Get mean", 0, 0)
  intensity_std = call(intensity, "Get standard deviation", 0, 0)

  return intensity_min, intensity_max, intensity_mean, intensity_std

In [13]:
def get_speaking_rate(filename, transcript):
  duration = call(filename, "Get total duration")

  word_count = len(transcript.split())
  speaking_rate = word_count / duration

  return speaking_rate

In [14]:
def extractJitter(filename, pitch_floor = 75, pitch_ceiling=600, period_floor=0.0001, period_ceiling=0.02, maximum_period_factor=1.3):
  min_time, max_time = 0.0, 0.0

  point_process = call(filename, "To PointProcess (periodic, cc)", pitch_floor, pitch_ceiling)
  jitter = call(point_process, "Get jitter (local)",
                min_time, max_time, period_floor, period_ceiling, maximum_period_factor)

  return jitter

In [15]:
def extractShimmer(filename, pitch_floor = 75, pitch_ceiling=600, period_floor=0.0001, period_ceiling=0.02, maximum_period_factor=1.3, maximum_amplitude=1.6):
  min_time, max_time = 0.0, 0.0

  point_process = call(filename, "To PointProcess (periodic, cc)", pitch_floor, pitch_ceiling)
  shimmer = call([filename, point_process], "Get shimmer (local)",
                  min_time, max_time, period_floor,period_ceiling, maximum_period_factor, maximum_amplitude)

  return shimmer

In [16]:
def extractHNR(filename, time_step = 0.01, minimum_pitch = 75, silence_threshold = 0.1, periods = 1.0):
  hnr = call(filename, "To Harmonicity (cc)",
            time_step, minimum_pitch, silence_threshold, periods)
  min_time, max_time = 0.0, 0.0
  hnr_value = call(hnr, "Get mean", min_time, max_time)

  return hnr_value

In [24]:
def audio_process(df, wav_folder):
  rows = []

  for _, row in df.iterrows():
    dialog_id = row['dialog_id']
    speaker = row['speaker']
    da_tag = row.get('da_tag', '')
    start_time = row['start_time']
    end_time = row['end_time']
    transcript = row.get('transcript', '')

    wav_filepath = os.path.join(wav_folder, f"{dialog_id}_{speaker}.wav")
    if not os.path.exists(wav_filepath):
      print("No file")
      continue

    try:
        sound = parselmouth.Sound(wav_filepath)

        segment = sound.extract_part(from_time=start_time, to_time=end_time, preserve_times=True)
        
        pitch_min, pitch_max, pitch_mean, pitch_std = extractPitch(segment)
        intensity_min, intensity_max, intensity_mean, intensity_std = extractIntensity(segment)
        speaking_rate = get_speaking_rate(segment, transcript)
        jitter = extractJitter(segment)
        shimmer = extractShimmer(segment)
        hnr = extractHNR(segment)
        
    except Exception as e:
        print(f"  Error on {dialog_id}_{speaker}: {e}")
        pitch_min = pitch_max = pitch_mean = pitch_std = None
        intensity_min = intensity_max = intensity_mean = intensity_std = None
        speaking_rate = jitter = shimmer = hnr = None
        
    rows.append({
          "dialog_id": dialog_id,
          "speaker": speaker,
          "da_tag": da_tag,
          "start_time": start_time,
          "end_time": end_time,
          "Min Pitch": pitch_min,
          "Max Pitch": pitch_max,
          "Mean Pitch": pitch_mean,
          "Std Pitch": pitch_std,
          "Min Intensity": intensity_min,
          "Max Intensity": intensity_max,
          "Mean Intensity": intensity_mean,
          "Std Intensity": intensity_std,
          "Speaking Rate": speaking_rate,
          "Jitter": jitter,
          "Shimmer": shimmer,
          "HNR": hnr
          })
    print(f"  Complete {dialog_id}_{speaker} [{start_time} ~ {end_time}]")

  return pd.DataFrame(rows)


In [25]:
print(len(train_df))
print(len(valid_df))
print(len(test_df))

74111
19156
23540


In [26]:
print(train_df.tail()['dialog_id'].unique())

['sw4940']


In [27]:
train_df_speech = audio_process(train_df, wav_folder="../wav")
print("Complete Train dataset:", len(train_df_speech))

  Complete sw2005_B [0.0 ~ 10.94882]
  Complete sw2005_A [10.93013 ~ 21.35084]
  Complete sw2005_B [10.94882 ~ 22.28]
  Complete sw2005_A [21.35084 ~ 22.08088]
  Complete sw2005_A [22.08088 ~ 44.96288]
  Complete sw2005_B [22.28 ~ 26.72]
  Complete sw2005_B [26.72 ~ 27.41913]
  Complete sw2005_B [27.41913 ~ 29.76]
  Complete sw2005_B [29.76 ~ 32.90752]
  Complete sw2005_B [35.87511 ~ 38.82891]
  Complete sw2005_B [38.82891 ~ 45.66602]
  Complete sw2005_A [44.96288 ~ 45.64818]
  Complete sw2005_A [45.64818 ~ 78.02339]
  Complete sw2005_B [45.66602 ~ 50.77305]
  Complete sw2005_B [50.77305 ~ 51.82206]
  Complete sw2005_B [51.82206 ~ 63.73905]
  Complete sw2005_B [63.73905 ~ 65.36778]
  Complete sw2005_B [65.36778 ~ 67.44]
  Complete sw2005_B [67.44 ~ 72.08]
  Complete sw2005_B [72.08 ~ 75.6634]
  Complete sw2005_B [75.6634 ~ 78.08]
  Complete sw2005_A [78.02339 ~ 78.44187]
  Complete sw2005_A [78.44187 ~ 86.45807]
  Complete sw2005_B [79.44 ~ 81.12]
  Complete sw2005_B [81.12 ~ 84.60204]

In [28]:
print(train_df_speech.head())

  dialog_id speaker da_tag  start_time  end_time  Min Pitch   Max Pitch  \
0    sw2005       B      x     0.00000  10.94882  87.138127  172.838418   
1    sw2005       A      x    10.93013  21.35084  76.595613  579.072883   
2    sw2005       B     sv    10.94882  22.28000  76.264618  581.796010   
3    sw2005       A      b    21.35084  22.08088  98.272004  132.748855   
4    sw2005       A      x    22.08088  44.96288  74.794815  575.231786   

   Mean Pitch  Std Pitch  Min Intensity  Max Intensity  Mean Intensity  \
0  116.408650  17.341471      13.480567      52.796285       27.565118   
1  112.604750  80.076506      27.405445      45.109786       32.620230   
2  119.122382  92.663834      14.425464      62.933620       36.445984   
3  107.849507   7.938820      29.174030      61.115579       40.632769   
4  103.772805  46.224186      26.817726      47.921699       33.105585   

   Std Intensity  Speaking Rate    Jitter   Shimmer        HNR  
0       9.260265       0.091334  0.0132

In [29]:
train_df_speech.to_csv("speech_features_train.csv", index=False)

In [None]:
# valid_df_speech = audio_process(valid_df, wav_folder="../wav")
# print("Complete Validation dataset:", len(valid_df_speech))

In [None]:
# valid_df_speech.to_csv("speech_features_valid.csv", index=False)

In [None]:
# test_df_speech = audio_process(test_df, wav_folder="../wav")
# print("Complete Test dataset:", len(test_df_speech))

In [None]:
# test_df_speech.to_csv("speech_features_test.csv", index=False)

# Task 1-2. Feature Analysis
For each custom feature set (text-feature set and speech feature set), formulate and test a hypothesis about the features (visually or statistically). Observe if the results are in accordance with your hypothesis or not. Give an explanation about your thinking behind the observed behavior.

For example, testing whether the LIWC feature “Insight”, which is associated with words such as “think” and “know”, or the bigram “I think” are useful in predicting the dialogue act “Statement-opinion”. This hypothesis could be tested by plotting average values of the LIWC “insight features or “I think” bigram for the top 10 dialogue acts.