In [140]:
%matplotlib inline
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ipywidgets import interact, interactive, fixed, widgets
from NotebookUtils.ProgressBar import LogProgress as LP
# Notebook-wide matplotlib text style: serif family, light weight, size 14.
plt.rcParams.update({
    'font.family': 'serif',
    'font.weight': 'light',
    'font.size': 14,
})
# Not referenced in the visible cells; presumably a default figure size
# for plots made elsewhere in the notebook -- TODO confirm.
figsize = (12,6)
# Root folder for downloads; one sub-folder per language is created below it.
# NOTE(review): absolute, user-specific path -- adjust before running elsewhere.
dataDir = '/Users/sdporzio/Data/Accento/'

from bs4 import BeautifulSoup
import requests

In [141]:
# Base address of the archive; the helpers below prepend it to the relative
# paths and hrefs they build or scrape.
urlHome = 'http://accent.gmu.edu/'

def CheckOrCreateDir(path):
    """Make sure the directory ``path`` exists, creating parents as needed.

    Parameters
    ----------
    path : str
        Directory to create when it is not already present.

    Raises
    ------
    OSError
        If the directory cannot be created, e.g. because a regular file
        with the same name already exists.
    """
    # exist_ok removes the window between "check" and "create" in which
    # another process could create the directory and make makedirs() fail.
    os.makedirs(path, exist_ok=True)

def GetSoup(url, timeout=30):
    """Download ``url`` and parse the response body with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.get`` (default 30), so a
        stalled server cannot block the notebook indefinitely.

    Returns
    -------
    BeautifulSoup
        The document parsed with the ``html5lib`` parser.

    Raises
    ------
    requests.exceptions.RequestException
        On connection failure, timeout, or an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Stop on 4xx/5xx here rather than parsing an error page and failing
    # later in a caller with an unrelated IndexError/AttributeError.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html5lib')

def GetLanguageURL(language):
    """Return the browse-by-language listing URL for ``language``.

    The value of ``language`` is inserted as-is into the ``language=``
    query parameter and the result is appended to ``urlHome``.
    """
    query = 'browse_language.php?function=find&language=%s' % language
    return urlHome + query

def GetSpeakerURL(language):
    """List the speaker page URLs found on the listing page of ``language``.

    Returns a list of strings, one per ``<p>`` element after the first,
    each built from ``urlHome`` plus the ``href`` of that paragraph's
    first ``<a>``.
    """
    listing = GetSoup(GetLanguageURL(language))
    # The first <p> is the number of results, not a speaker entry.
    paragraphs = listing.find_all('p')[1:]
    return [urlHome + par.find('a').get('href') for par in paragraphs]

def GetSpeakerAudioURL(url):
    """Return the audio address advertised on the speaker page at ``url``.

    Takes the ``src`` of the first ``<source>`` element on the page and
    rewrites the ``http://chnm.gmu.edu/accent/`` prefix to
    ``http://accent.gmu.edu/``.
    """
    page = GetSoup(url)
    first_source = page.find_all('source')[0]
    src = first_source.get('src')
    return src.replace('http://chnm.gmu.edu/accent/','http://accent.gmu.edu/')

def GetSpeakerMetadata(url):
    """Scrape the metadata of the speaker page at ``url`` into a dict.

    Every ``<li>`` that contains an ``<em>`` is read as one field: the text
    of its first child (colons removed) is the key and the string form of
    its second child is the value. Three fields get special treatment:

    * ``"other language(s) "`` is stored under ``'other languages'`` as a
      whitespace-split list;
    * ``'length of english residence'`` has ``' years'`` removed;
    * ``'age, sex'`` is split into separate ``'age'`` and ``'sex'`` entries.

    The dict also carries ``'id'``, the third ``'='``-separated piece of
    ``url``.
    """
    page = GetSoup(url)
    entries = [item for item in page.find_all('li') if item.find('em')]
    info = {}
    info['id'] = url.split('=')[2]
    for item in entries:
        label = str(item.contents[0].text.replace(':',''))
        raw = str(item.contents[1])
        # NOTE(review): the trailing space in this literal is kept exactly as
        # found; confirm it matches the page markup.
        if label == "other language(s) ":
            info['other languages'] = raw.split()
        elif label == 'length of english residence':
            info[label] = raw.replace(' years','')
        elif label == 'age, sex':
            parts = raw.split()
            info['age'] = parts[0].replace(',','')
            info['sex'] = parts[1]
        else:
            info[label] = raw
    return info

def DownloadCompleteData(nFile,language):
    """Download metadata and audio for the speakers of ``language``.

    For each processed speaker two files are written into
    ``dataDir + language.capitalize() + '/'`` (created if missing):
    ``<language><id>.json`` with the scraped metadata and
    ``<language><id>.mp3`` fetched with ``curl``.

    Parameters
    ----------
    nFile : int
        Number of speakers to process, taken from the start of the listing.
        ``0`` means all of them; values larger than the listing are clamped
        to its length.
    language : str
        Language name as used in the archive's listing URL.
    """
    # Local import keeps this block self-contained; only curl needs it.
    import subprocess

    path = dataDir+language.capitalize()+'/'
    CheckOrCreateDir(path)
    url = GetSpeakerURL(language)
    # 0 selects everything; the clamp avoids an IndexError when more files
    # are requested than speakers are listed.
    if nFile == 0 or nFile > len(url):
        nFile = len(url)
    for i in LP(range(nFile)):
        info = pd.Series(GetSpeakerMetadata(url[i]))

        outFile = path+language+info['id']+'.json'
        info.to_json(outFile)

        outFile = path+language+info['id']+'.mp3'
        audioUrl = GetSpeakerAudioURL(url[i])
        # Argument list instead of a shell string: the URL comes from scraped
        # HTML, so quotes or shell metacharacters in it must not be
        # interpreted by a shell.
        subprocess.call(['curl', audioUrl, '-o', outFile])

In [143]:
# Languages to fetch from the archive.
languages = ['russian','german','spanish','french']
for lang in languages:
    # nFile=0 asks DownloadCompleteData for every listed speaker.
    DownloadCompleteData(0,lang)