### 1. Upload the mp4 file into S3.

### 2. Run a Transcribe job on the mp4 file, specifying the number of speakers ideally.

### 3. Copy the Transcribe job results to this notebook to parse.

In [11]:
!aws s3 cp s3://mansplaining/transcribe-wav.json .

In [12]:
import json
import boto3
import sagemaker
import os
import pandas as pd

In [None]:
def get_data(file_name):
    with open(file_name) as f:
        data = json.load(f)
    return data
    
data = get_data('transcribe-wav.json')['results']

In [None]:
def get_times(data):
    # must return a list of tuples like (start, duration)
    rt = []
    for segment in data['speaker_labels']['segments']:
        start_time = float(segment['start_time'])
        end_time = float(segment['end_time'])
        
        # get a 1/2 second snippet after each speaker starts
        delta = .5
                
        rt.append((start_time, delta))
                  
    return rt

times = get_times(data)

### 4. Configure an Elastic Transcoder job 

In [None]:
client = boto3.client('elastictranscoder')

In [None]:
# pipeline = client.create_pipeline(
#     Name='pipeline',
#     InputBucket='mansplaining',
#     OutputBucket='mansplaining',


In [None]:
# run this to get the pipeline ID
pipeline

In [None]:
def get_inputs(times):
    inputs = []
    
    for start_time, duration in times:
        
        json_obj = { 'Key': 'audio/meeting_audio.wav',
                    'TimeSpan': {
                        'StartTime': str(start_time),
                        'Duration': str(duration)}
                    }
        inputs.append(json_obj)
                    
    return inputs 
        
inputs = get_inputs(times)

In [None]:
def get_outputs(times):
    rt = []
    for idx in range(len(times)):
        obj = {'Key' : 'micro_clip_{}.wav'.format(idx),
               
        # configure this by looking at the console based on the type of audio file you want 
         'PresetId':'1351620000001-300300'}

        rt.append(obj)
        
    return rt

outputs = get_outputs(times)

In [None]:
# need to loop through batches of 30 
iterator = round(len(inputs)/30)

for i in range(iterator):
    lb = i * 30
    up = lb + 30
    if up > len(inputs):
        up = len(inputs)
        
#     try:
    response = client.create_job(
        # get this from where you printed out pipeline above 
        PipelineId='1568399213667-jzukmi',
        Inputs=inputs[lb:up],
        Outputs = outputs[lb:up] )
    
#     break
#     except:
#         print ('failed on range', lb, up)
        

### 5. Copy results from Transcoding job to Notebook instance

In [None]:
!aws s3 sync s3://mansplaining Data

### 6. Loop through data and hit SageMaker endpoint

In [22]:
%%bash

for i in {1..291};do 
    echo $i;
    encoded_string=$(base64 ./Data/micro_clip_$i.wav);payload="{\"instances\": [{\"audio\": {\"b64\": \"$encoded_string\"}}]}";
    echo $payload >Input/input_$i.json;
    aws sagemaker-runtime invoke-endpoint  --endpoint-name 'gender-clasifier'  --body fileb://Input/input_$i.json --content-type "application/json" --region us-east-1 Output/output_$i.json;
    
done

1
{
    "ContentType": "application/json",
    "InvokedProductionVariant": "variant-name-1"
}
2
{
    "ContentType": "application/json",
    "InvokedProductionVariant": "variant-name-1"
}
3
{
    "ContentType": "application/json",
    "InvokedProductionVariant": "variant-name-1"
}
4
{
    "ContentType": "application/json",
    "InvokedProductionVariant": "variant-name-1"
}
5
{
    "ContentType": "application/json",
    "InvokedProductionVariant": "variant-name-1"
}
6
{
    "ContentType": "application/json",
    "InvokedProductionVariant": "variant-name-1"
}
7
{
    "ContentType": "application/json",
    "InvokedProductionVariant": "variant-name-1"
}
8
{
    "ContentType": "application/json",
    "InvokedProductionVariant": "variant-name-1"
}
9
{
    "ContentType": "application/json",
    "InvokedProductionVariant": "variant-name-1"
}
10
{
    "ContentType": "application/json",
    "InvokedProductionVariant": "variant-name-1"
}
11
{
    "ContentType": "application/json",
    "InvokedPro

### 7. Consolidate Results and Generate Statistics

In [22]:
json_objs = []
male_count = 0
female_count = 0

for f in os.listdir('Output'):
    file = 'Output/{}'.format(f)

    with open(file) as fp:
        data = json.load(fp)
        pred = data['predictions'][0]['label']
        if 'female' in pred:
            female_count += 1
        else:
            male_count += 1

In [23]:
percent_male = 100 * male_count / (male_count + female_count)
percent_female = 100 * female_count / (male_count + female_count)


print ('Percentage of time men were speaking was {}%, while for women it was {}%.'.format(percent_male, percent_female))

Percentage of time men were speaking was 20.89041095890411%, while for women it was 79.10958904109589%.
