## Exploratory Analysis of Quiz Bowl Input and the Quality of Speech Recognition 

Works in Python 2.  The dependencies below are needed; all are pip-installable.  PocketSphinx is needed for deciphering sound data locally, and its installation only worked on Python 2 on macOS.

TODOs:  Explore "Slow" vs "Fast" TTS speech.  Explore human speech rather than TTS.  Explore Google or Microsoft Speech Recognition.  Account for Unicode in data.  

In [2]:
#math

import pandas
from scipy import stats
import numpy as np
from collections import Counter
from scipy import stats

#wrangling + plotting
import csv
import glob 
import matplotlib.pyplot as plt
import pprint
from collections import defaultdict

#speech
from gtts import gTTS
import speech_recognition as sr
import subprocess
import spacy
nlp = spacy.load('en')
import nltk.translate.bleu_score as bleu

In [94]:
r = sr.Recognizer()

#read the whole WAV file into a single audio object
with sr.AudioFile('short.wav') as source:
    audio = r.record(source)

#the Google Web Speech API (a network call, not local PocketSphinx) deciphers the audio
#NOTE: `list` shadows the builtin; the name is kept because later cells read it
try:
    #show_all=True returns the raw API response with every alternative transcription
    list = r.recognize_google(audio, show_all=True)
    print("Possible transcriptions:")
except sr.UnknownValueError:                        # speech is unintelligible
    print("Could not understand audio")
except sr.RequestError as err:                      # service unreachable or request rejected
    print("Could not reach the recognition service: " + str(err))



Possible transcriptions:


In [95]:
# Display the raw recognition result stored in `list` by the previous cell.
list

{u'alternative': [{u'confidence': 0.84984362,
   u'transcript': u'in May 2015 Joe Bertram challenge London mayor Boris Johnson'},
  {u'transcript': u'in a 2015 Joe Bertram challenge London mayor Boris Johnson'},
  {u'transcript': u'in May 2015 Joubert room challenge London mayor Boris Johnson'},
  {u'transcript': u'inmate 2015 Joe Bertram challenge London mayor Boris Johnson'},
  {u'transcript': u'in May 2015 Joe Bertram challenged London mayor Boris Johnson'}],
 u'final': True}

In [93]:
# Write the recognition lattice to the file "detailed_lattice" in HTK format.
# NOTE(review): this needs `list` to be an object exposing get_lattice()
# (presumably a PocketSphinx decoder); the dict returned by recognize_google
# does not have that method -- confirm which cell produced `list` here.
list.get_lattice().write_htk("detailed_lattice")

In [87]:
 # Show the documentation of the PocketSphinx Lattice class.
 # NOTE(review): `pocketsphinx` is not imported in the visible cells -- confirm.
 print(help(pocketsphinx.pocketsphinx.Lattice))

Help on class Lattice in module pocketsphinx.pocketsphinx:

class Lattice(__builtin__.object)
 |  Proxy of C Lattice struct.
 |  
 |  Methods defined here:
 |  
 |  __del__ lambda self
 |  
 |  __init__(self, *args)
 |      __init__(Lattice self, char const * path) -> Lattice
 |      __init__(Lattice self, Decoder decoder, char * path) -> Lattice
 |  
 |  __repr__ = _swig_repr(self)
 |  
 |  write(self, path)
 |      write(Lattice self, char const * path)
 |  
 |  write_htk(self, path)
 |      write_htk(Lattice self, char const * path)
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)
 |  
 |  thisown
 |      The membership flag
 |  
 |  ----------------------------------------------------------------------
 |  Data and other attributes defined here:
 |  
 |  __swig_destr

In [67]:
# Second alternative transcription (index 1) from the raw recognition result.
list['alternative'][1]

{u'transcript': u'in a 2015 Joe Bertram challenge London mayor Boris Johnson'}

In [54]:
# Third alternative transcription (index 2) from the raw recognition result.
list['alternative'][2]

{u'transcript': u'to visit one of these places that count disguises himself as it goes to the nun who was buried alive there and one of these places a man speech about lust of the flesh is interrupted by the arrival of his three daughters who wear velvet silk and first a Belgian man who has one of these places consentino partly inspired the title character of a posthumous Lee published a tablet reading marks the grave of a character who died'}

## Speech Recognition

Data used:

qb/data/questions/expo/2015_hsnct.csv
qb/data/questions/expo/2015_jennings.csv
qb/data/questions/expo/2015_jennings.power.csv
qb/data/questions/expo/2016_hsnct.csv
qb/data/questions/expo/2016_naacl.csv
qb/data/questions/expo/2017_hsnct.csv
qb/data/questions/expo/2017_hsnct.power.csv

### Store Text in Dict by Document and Convert Text to Speech 

In [None]:
%%time
#~7 minutes

#stores questions by document
storage = defaultdict(lambda : defaultdict(dict))

#loop through each document (only CSV files in the QB data folder)
for each_file in glob.glob('../../qb/data/questions/expo/*.csv'):
    
    print (each_file)
    
    with open(each_file) as f:
        file_storage = []
        data = csv.reader(f)
        
        #dump header
        header = next(data)
        if "text" in header:
            #find proper index of question text
            correct_col = header.index("text")
            
            #keep track of question number
            counter = 0
            for line in data:
                text = (line[correct_col])
                file_storage.append(text)
                sentences = text.split('.')

                try:
                    counter_sent = 0
                    for sentence in sentences:
                        if sentence:
                            #convert into audio with gTTS, save it to mp3, convert it to WAV
                            sentTTS = gTTS(sentence, lang='en', slow=False)
                            file_name = each_file + "_" +str(counter) + "_" + str (counter_sent)
                            sentTTS.save(file_name+".mp3")
                            subprocess.call(['ffmpeg', '-i', file_name+".mp3",
                            file_name + '.wav'])
                            storage[each_file][counter][counter_sent] = sentence 
                            counter_sent +=1
                    counter += 1
                    
                except:
                    print "Processing issue.  Did not load: " + str(line)
                
            

In [None]:
#sixth sentence (index 5) of the first question in the 2016 NAACL document
print(storage['../../qb/data/questions/expo/2016_naacl.csv'][0][5])

### Decipher Each Audio File with PocketSphinx and Calculate Bleu Score

In [None]:
%%time
#~51 minutes

#keep track of bleu score
processed_speech = defaultdict(lambda : defaultdict(list))
processed_speech_scores = []

record_data = dict()
for each_file in glob.glob('../../qb/data/questions/expo/*.wav'):
    r = sr.Recognizer()
    with sr.AudioFile(each_file) as source:              
        audio = r.record(source)
       
        #PocketSphinx is used locally to decipher the audio
        audio_data = r.recognize_sphinx(audio) 
        #find the appropriate file and question number.  WAV files contain this information
        #lower for bleu score calculation
        
        try:
            file_name = each_file[0:each_file.rfind('_', 0,  each_file.rfind('_'))]
            question_number = int(each_file [each_file.rfind('_',0, each_file.rfind('_'))+1:each_file.rfind('_')])
            sentence_number = int(each_file[each_file.rfind('_')+1:each_file.rfind('.')])
            
            text_data = storage[file_name][question_number][sentence_number].lower()
            processed_speech[file_name][question_number].append(audio_data)
        except:
            print "Issue opening: " + each_file
        
        
        #calculate bleu score with NLTK
        try:
            processed_speech_scores.append(bleu.sentence_bleu(audio_data, text_data))
    
        except: 
            print "Issue with: " + text_data

In [None]:
#a bare expression inside a loop is not echoed by the notebook, so print explicitly
naacl_speech = processed_speech['../../qb/data/questions/expo/2016_naacl.csv']
for i in naacl_speech:
    print(naacl_speech[i])

## Results

In [None]:
#summary statistics of the per-sentence bleu scores
print(stats.describe(processed_speech_scores))

#select the style before drawing: it only affects what is created afterwards
plt.style.use('seaborn-pastel')
plt.title("Slow QB Questions Bleu Scores")
plt.hist(processed_speech_scores)
plt.xlabel("Normalized Bleu Score")
plt.xticks([0.0, 0.25, 0.5, 0.75, 1.0])
plt.ylabel("Frequency")
plt.show()

In [None]:
#number of questions stored for each CSV document
for k in storage:
    print(len(storage[k]))

In [None]:
import requests

def answer_question(text, timeout=60):
    """Post *text* to the answer_question HTTP endpoint and return its guess.

    Args:
        text: question text, sent as the ``text`` form field.
        timeout: seconds to wait for the server before giving up, so a dead
            host cannot hang the notebook forever.

    Returns:
        Tuple ``(guess, score)`` taken from the JSON response.

    Raises:
        requests.exceptions.RequestException: on connection failure, timeout
            or an HTTP error status.
        KeyError: if the response lacks ``guess`` or ``score``.
    """
    response = requests.post(
        'http://trantor.entilzha.io:5000/api/answer_question',
        data={'text': text},
        timeout=timeout,
    )
    #fail with a clear HTTP error instead of a confusing JSON/KeyError later
    response.raise_for_status()
    payload = response.json()
    return payload['guess'], payload['score']

In [None]:
#recognized sentences of the first 2016 NAACL question, joined into one query
naacl_key = '../../qb/data/questions/expo/2016_naacl.csv'
text = ' '.join(processed_speech[naacl_key][0])
answer_question(text)

In [None]:
full = ""
naacl_questions = storage['../../qb/data/questions/expo/2016_naacl.csv']
for i in sorted(naacl_questions):
    sentences = naacl_questions[i]
    #join by sentence number: dict.values() has no guaranteed order in Python 2
    question_text = ' '.join(sentences[k] for k in sorted(sentences))
    print(answer_question(question_text))

## Exploration of Quiz Bowl Data

### Word Length

In [None]:
#length of every whitespace-separated word in the stored question text
word_len = [
    len(word)
    for doc in storage
    for line in storage[doc]
    for sentence in storage[doc][line]
    for word in storage[doc][line][sentence].split()
]

print(stats.describe(word_len))

x_ticks = [0, 5, 10, 15, 20]
plt.figure(figsize=(20, 10))
plt.hist(word_len)
plt.title("QB Word Length Histogram")
plt.xticks(x_ticks)
plt.show()

### Audio Word Length

In [None]:
#length of every whitespace-separated word in the recognized (audio) sentences
word_len = []
for doc in processed_speech:
    for line in processed_speech[doc]:
        for sentence in processed_speech[doc][line]:
            word_len.extend(len(word) for word in sentence.split())

print(stats.describe(word_len))

plt.figure(figsize=(20, 10))
plt.hist(word_len)
#distinct title so this plot is not mistaken for the text word-length histogram
plt.title("QB Audio Word Length Histogram")
x_ticks = [0, 5, 10, 15, 20]
plt.xticks(x_ticks)
plt.show()

### Word Count

In [None]:
#dump every stored sentence into a single list and run Counter on its words.
#storage[doc][line] maps sentence number -> sentence text, so the text lives in
#the values; the keys are integers and cannot be joined
single_list = []
for doc in storage:
    for line in storage[doc]:
        single_list.extend(storage[doc][line].values())
wordcount = Counter(' '.join(single_list).split())

#rank the 100 most frequent words once and split into words and counts
top_words = wordcount.most_common(100)
a = [word for word, _ in top_words]
y = [count for _, count in top_words]

pprint.pprint(a[:20])

x = np.arange(len(a))

plt.figure(figsize=(20, 10))
plt.title("Zipf's Law Holds for QB")
plt.xlabel("Word Rank")
plt.ylabel("Frequency")
plt.bar(x, y, align = 'edge')
plt.show()