In [16]:
import urllib
import time
import shutil
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re

In [2]:
# from http://accent.gmu.edu/browse_language.php, return list of languages
def get_languages():
    """ This will return all the languages present in website
        http://accent.gmu.edu"""

    url = "http://accent.gmu.edu/browse_language.php"
    html = get(url)
    soup = BeautifulSoup(html.content, 'html.parser')
    languages = []
    language_lists = soup.findAll('ul', attrs={'class': 'languagelist'})
    for ul in language_lists:
        for li in ul.findAll('li'):
            languages.append(li.text)
    return languages

In [8]:
langs = get_languages()

In [9]:
langs

['aceh',
 'afrikaans',
 'agni',
 'agny',
 'akan',
 'albanian',
 'amazigh',
 'american sign language',
 'amharic',
 'ancient greek',
 'antigua and barbuda creole english',
 'anyin',
 'appolo',
 'arabic',
 'aramaic',
 'armenian',
 'aromanian',
 'ashanti',
 'asl',
 'azerbaijani',
 'azerbaijani, south',
 'azeri turk',
 'babur',
 'bafang',
 'baga',
 'bahasa indonesia',
 'bai',
 'balant',
 'balanta ganja',
 'bamanankan',
 'bambara',
 'bamun',
 'banganthe',
 'bangla',
 'baoule',
 'bari',
 'basque',
 'bassa',
 'bavarian',
 'belarusan',
 'bengali',
 'bikol',
 'bislama',
 'bosnian',
 'bulgarian',
 'burmese',
 'cameroon creole english',
 'cantonese',
 'carolinian',
 'castellano',
 'catalan',
 'cebuano',
 'chaam',
 'chagga',
 'chaldean',
 'chaldean neo aramaic',
 'chamorro',
 'charapa-spanish',
 'chichewa',
 'chin, mizo',
 'chinese',
 'chittagonian',
 'chuukese',
 'classical greek',
 'cotocoli',
 'creole',
 'creole french',
 'crioulo',
 'croatian',
 'czech',
 'danish',
 'dari',
 'darija',
 'dholuo

In [3]:
# from list of languages, return urls of each language landing page
def lang_pages(lst):
    """ This will return all the landing pages for the list of languages obtained
        from the above function"""

    urls=[]
    for lang in lst:
        urls.append('http://accent.gmu.edu/browse_language.php?function=find&language={}'.format(lang))
    return urls

In [10]:
langs_url = lang_pages(langs)

In [11]:
langs_url

['http://accent.gmu.edu/browse_language.php?function=find&language=aceh',
 'http://accent.gmu.edu/browse_language.php?function=find&language=afrikaans',
 'http://accent.gmu.edu/browse_language.php?function=find&language=agni',
 'http://accent.gmu.edu/browse_language.php?function=find&language=agny',
 'http://accent.gmu.edu/browse_language.php?function=find&language=akan',
 'http://accent.gmu.edu/browse_language.php?function=find&language=albanian',
 'http://accent.gmu.edu/browse_language.php?function=find&language=amazigh',
 'http://accent.gmu.edu/browse_language.php?function=find&language=american sign language',
 'http://accent.gmu.edu/browse_language.php?function=find&language=amharic',
 'http://accent.gmu.edu/browse_language.php?function=find&language=ancient greek',
 'http://accent.gmu.edu/browse_language.php?function=find&language=antigua and barbuda creole english',
 'http://accent.gmu.edu/browse_language.php?function=find&language=anyin',
 'http://accent.gmu.edu/browse_language

In [4]:
# from language, get the number of speakers of that language
def get_num(language):
    url = 'http://accent.gmu.edu/browse_language.php?function=find&language=' + language
    html = get(url)
    soup = BeautifulSoup(html.content, 'html.parser')
    test = soup.find_all('div', attrs={'class': 'content'})
    try:
        num = int(test[0].find('h5').text.split()[2])
    except AttributeError:
        num = 0
    return num

In [5]:
# from list of languages, return list of tuples (LANGUAGE, LANGUAGE_NUM_SPEAKERS) for mp3getter, ignoring languages
# with 0 speakers
def get_formatted_languages(languages):
    formatted_languages = []
    for language in languages:
        num = get_num(language)
        if num != 0:
            formatted_languages.append((language,num))
    return pd.DataFrame(formatted_languages, columns=["Language", "No. Of Speakers"]).sort_values(by="No. Of Speakers", ascending=False).reset_index(drop=True)

In [12]:
num_speakers = get_formatted_languages(langs)

In [14]:
num_speakers.head()

Unnamed: 0,Language,No. Of Speakers
0,english,626
1,spanish,216
2,chinese,193
3,arabic,170
4,mandarin,130


In [6]:
#For getting the speaker ids
def get_speaker_id(lang):
    speaker_id = []
    url = "http://accent.gmu.edu/browse_language.php?function=find&language=" + lang
    html = get(url)
    soup = BeautifulSoup(html.content, 'html.parser')
    for link in soup.find_all('a', href=re.compile("^browse_language")):
        href = link['href']
        url = urllib.parse.urlparse(href)
        params = urllib.parse.parse_qs(url.query)
        if 'speakerid' in params:
            speaker_id.append((params['speakerid'][0], "http://chnm.gmu.edu/accent/soundtracks/" + link.contents[0].replace(",", "") + ".mp3"))
    return pd.DataFrame(speaker_id, columns=["Speaker ID", "MP3 URL"]).reset_index(drop=True)

In [17]:
speaker_url = get_speaker_id('english')

In [26]:
speaker_url.head()

Unnamed: 0,Speaker ID,MP3 URL
0,61,http://chnm.gmu.edu/accent/soundtracks/english...
1,77,http://chnm.gmu.edu/accent/soundtracks/english...
2,88,http://chnm.gmu.edu/accent/soundtracks/english...
3,99,http://chnm.gmu.edu/accent/soundtracks/english...
4,110,http://chnm.gmu.edu/accent/soundtracks/english...


In [7]:
def get_mp3(url):
    urllib.request.urlretrieve(url, "./file_name.mp3")

In [24]:
x = speaker_url['MP3 URL'][1]

In [25]:
get_mp3(x)

In [30]:
speaker_ids = list(speaker_url['Speaker ID'].astype(int))

In [31]:
speaker_ids

[61,
 77,
 88,
 99,
 110,
 121,
 132,
 143,
 154,
 62,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 63,
 64,
 65,
 66,
 67,
 407,
 408,
 416,
 419,
 420,
 421,
 426,
 427,
 439,
 442,
 443,
 444,
 445,
 446,
 455,
 456,
 465,
 468,
 469,
 480,
 485,
 487,
 489,
 490,
 492,
 496,
 497,
 503,
 504,
 506,
 507,
 508,
 509,
 510,
 511,
 515,
 516,
 517,
 518,
 519,
 521,
 522,
 523,
 525,
 526,
 527,
 528,
 529,
 533,
 534,
 535,
 536,
 538,
 540,
 541,
 542,
 546,
 548,
 547,
 549,
 550,
 551,
 552,
 554,
 555,
 556,
 563,
 564,
 567,
 

In [35]:
def get_speaker_info():
    '''
    Inputs: two integers, corresponding to min and max speaker id number per language
    Outputs: Pandas Dataframe containing speaker filename, birthplace, native_language, age, sex, age_onset of English
    '''

    user_data = []
    for num in speaker_ids:
        info = {'speakerid': num, 'filename': 0, 'birthplace':1, 'native_language': 2, 'age':3, 'sex':4, 'age_onset':5}
        url = "http://accent.gmu.edu/browse_language.php?function=detail&speakerid={}".format(num)
        html = get(url)
        soup = BeautifulSoup(html.content, 'html.parser')
        body = soup.find_all('div', attrs={'class': 'content'})
        try:
            info['filename']=str(body[0].find('h5').text.split()[0])
            bio_bar = soup.find_all('ul', attrs={'class':'bio'})
            info['birthplace'] = str(bio_bar[0].find_all('li')[0].text)[13:-6]
            info['native_language'] = str(bio_bar[0].find_all('li')[1].text.split()[2])
            info['age'] = float(bio_bar[0].find_all('li')[3].text.split()[2].strip(','))
            info['sex'] = str(bio_bar[0].find_all('li')[3].text.split()[3].strip())
            info['age_onset'] = float(bio_bar[0].find_all('li')[4].text.split()[4].strip())
            user_data.append(info)
        except:
            info['filename'] = ''
            info['birthplace'] = ''
            info['native_language'] = ''
            info['age'] = ''
            info['sex'] = ''
            info['age_onset'] = ''
            user_data.append(info)
        df = pd.DataFrame(user_data)
#         df.to_csv('speaker_info_{}.csv'.format(stop))
    return df

In [36]:
speaker_info = get_speaker_info()

In [40]:
speaker_info

Unnamed: 0,age,age_onset,birthplace,filename,native_language,sex,speakerid
0,42.0,0.0,"pittsburgh, pennsylvania, usa",english1,english,male,61
1,30.0,0.0,"birmingham, uk",english2,english,female,77
2,26.0,0.0,"brisbane, australia",english3,english,female,88
3,53.0,0.0,"saint anne's bay, jamaica",english4,english,female,99
4,62.0,0.0,"fairfax, virginia, usa",english5,english,male,110
5,45.0,0.0,"brooklyn, new york, usa",english6,english,female,121
6,52.0,0.0,"macon, mississippi, usa",english7,english,male,132
7,21.0,0.0,"perth, australia",english8,english,female,143
8,48.0,0.0,"carthage, texas, usa",english9,english,female,154
9,35.0,0.0,"davenport, iowa, usa",english10,english,female,62
