# Get all European languages from Ethnologue

Input
- language codes: https://www.ethnologue.com/sites/default/files/LanguageCodes.tab
- country codes: https://www.ethnologue.com/sites/default/files/CountryCodes.tab
- description: https://www.ethnologue.com/codes/code-table-structure

Output
- LangID: ISO 639-3 standard for language identifiers
- CountryID: Main country where used. Two-letter code from ISO 3166.
- LangStatus: L(iving), (e)X(tinct)
- Name_lang: Primary name in the country where mainly used
- Name_country: Name of the main country where used.
- Area: World area of the main country; only rows where this is "Europe" are kept.

In [1]:
import pandas as pd
import os

In [2]:
# Relative path to the data root; the cells below read from and write to its "general" subfolder.
DATA_FOLDER = "../../data"

In [3]:
# Load the country table (CountryID, Name, Area).
# pandas' default NA markers include the literal string "NA", which is also the
# ISO 3166 code for Namibia. Turn the defaults off so that code stays a string,
# and treat only genuinely empty fields as missing.
countries = pd.read_csv(
    os.path.join(DATA_FOLDER, "general", "country_codes.tsv"),
    sep="\t",
    keep_default_na=False,
    na_values=[""],
)
display(countries)

Unnamed: 0,CountryID,Name,Area
0,AD,Andorra,Europe
1,AE,United Arab Emirates,Asia
2,AF,Afghanistan,Asia
3,AG,Antigua and Barbuda,Americas
4,AI,Anguilla,Americas
...,...,...,...
237,YE,Yemen,Asia
238,YT,Mayotte,Africa
239,ZA,South Africa,Africa
240,ZM,Zambia,Africa


In [4]:
# Keep only the countries whose world area is Europe, then show the subset
# and its column types.
europe = countries[countries["Area"].eq("Europe")]
display(europe)
europe.dtypes

Unnamed: 0,CountryID,Name,Area
0,AD,Andorra,Europe
5,AL,Albania,Europe
10,AT,Austria,Europe
13,AX,Aland Islands,Europe
15,BA,Bosnia and Herzegovina,Europe
18,BE,Belgium,Europe
20,BG,Bulgaria,Europe
33,BY,Belarus,Europe
40,CH,Switzerland,Europe
53,CZ,Czechia,Europe


CountryID    object
Name         object
Area         object
dtype: object

In [5]:
# Load the language table (LangID, CountryID, LangStatus, Name).
# pandas' default NA markers collide with real identifiers here: "nan" is the
# ISO 639-3 code for Min Nan Chinese and "NA" is the ISO 3166 code for Namibia.
# Turn the defaults off so those codes stay strings, and treat only genuinely
# empty fields as missing.
langs = pd.read_csv(
    os.path.join(DATA_FOLDER, "general", "lang_codes.tsv"),
    sep="\t",
    keep_default_na=False,
    na_values=[""],
)
display(langs)
langs.dtypes

Unnamed: 0,LangID,CountryID,LangStatus,Name
0,aaa,NG,L,Ghotuo
1,aab,NG,L,Alumu-Tesu
2,aac,PG,L,Ari
3,aad,PG,L,Amal
4,aae,IT,L,"Albanian, Arbëreshë"
...,...,...,...,...
7481,zyg,CN,L,"Zhuang, Yang"
7482,zyj,CN,L,"Zhuang, Youjiang"
7483,zyn,CN,L,"Zhuang, Yongnan"
7484,zyp,MM,L,"Chin, Zyphe"


LangID        object
CountryID     object
LangStatus    object
Name          object
dtype: object

In [6]:
# Attach country name and area to every language whose main country is European.
# The inner join drops languages from non-European countries; both tables have a
# "Name" column, so the suffixes produce Name_lang and Name_country.
eur_langs = langs.join(
    europe.set_index("CountryID"),
    on="CountryID",
    how="inner",
    lsuffix="_lang",
    rsuffix="_country",
)
display(eur_langs)

Unnamed: 0,LangID,CountryID,LangStatus,Name_lang,Name_country,Area
4,aae,IT,L,"Albanian, Arbëreshë",Italy,Europe
1230,cim,IT,L,Cimbrian,Italy,Europe
1730,egl,IT,L,Emilian,Italy,Europe
1912,fur,IT,L,Friulian,Italy,Europe
2511,ils,IT,L,International Sign,Italy,Europe
...,...,...,...,...,...,...
6605,vgt,BE,L,Flemish Sign Language,Belgium,Europe
6628,vls,BE,L,West Flemish,Belgium,Europe
6779,wln,BE,L,Walloon,Belgium,Europe
5682,slv,SI,L,Slovene,Slovenia,Europe


In [7]:
# Show the European languages whose status is "X" (extinct).
eur_langs[eur_langs["LangStatus"].eq("X")]

Unnamed: 0,LangID,CountryID,LangStatus,Name_lang,Name_country,Area
6901,xas,RU,X,Kamas,Russian Federation,Europe
7270,ysr,RU,X,"Yupik, Sirenik",Russian Federation,Europe
5526,sdt,FR,X,Shuadit,France,Europe
5364,rmd,DK,X,Traveller Danish,Denmark,Europe


In [8]:
# Write the joined table next to the input files; the row index is not exported.
eur_langs.to_csv(
    os.path.join(DATA_FOLDER, "general", "european_languages.csv"),
    index=False,
)