# Extraktion der Sprachkürzel
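Dieses Notebook extrahiert aus dem [IANA-Register](https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry) die Sprachkürzel (Subtags) samt Namen der Sprachen und speichert sie in einer CSV-Datei. Berücksichtigt werden nur Einträge vom Typ `language`, die eine `Suppress-Script`-Angabe besitzen; Sprachsammlungen (`Scope: collection`) werden ausgeschlossen.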


In [1]:
from bs4 import BeautifulSoup
import requests
import pandas

In [2]:
# Download the IANA language subtag registry (a plain-text file).
REGISTRY_URL = "https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry"

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(REGISTRY_URL, timeout=30)

# Fail loudly on an HTTP error instead of parsing an error page as registry data.
response.raise_for_status()

# RFC 5646 defines the registry as UTF-8. Setting the encoding explicitly avoids
# requests' ISO-8859-1 fallback for text/* responses that declare no charset,
# which would garble non-ASCII language names.
response.encoding = "utf-8"

content = response.text

# Show only the beginning; the full registry is far too large for a cell output.
content[:1000]

'File-Date: 2021-02-23\n%%\nType: language\nSubtag: aa\nDescription: Afar\nAdded: 2005-10-16\n%%\nType: language\nSubtag: ab\nDescription: Abkhazian\nAdded: 2005-10-16\nSuppress-Script: Cyrl\n%%\nType: language\nSubtag: ae\nDescription: Avestan\nAdded: 2005-10-16\n%%\nType: language\nSubtag: af\nDescription: Afrikaans\nAdded: 2005-10-16\nSuppress-Script: Latn\n%%\nType: language\nSubtag: ak\nDescription: Akan\nAdded: 2005-10-16\nScope: macrolanguage\n%%\nType: language\nSubtag: am\nDescription: Amharic\nAdded: 2005-10-16\nSuppress-Script: Ethi\n%%\nType: language\nSubtag: an\nDescription: Aragonese\nAdded: 2005-10-16\n%%\nType: language\nSubtag: ar\nDescription: Arabic\nAdded: 2005-10-16\nSuppress-Script: Arab\nScope: macrolanguage\n%%\nType: language\nSubtag: as\nDescription: Assamese\nAdded: 2005-10-16\nSuppress-Script: Beng\n%%\nType: language\nSubtag: av\nDescription: Avaric\nAdded: 2005-10-16\n%%\nType: language\nSubtag: ay\nDescription: Aymara\nAdded: 2005-10-16\nSuppress-Script:

In [3]:
# Records are separated by a line that holds only "%%".
# The first chunk is the "File-Date" header rather than a record, so it is skipped.
entries = content.split("\n%%\n")[1:]

entries

['Type: language\nSubtag: aa\nDescription: Afar\nAdded: 2005-10-16',
 'Type: language\nSubtag: ab\nDescription: Abkhazian\nAdded: 2005-10-16\nSuppress-Script: Cyrl',
 'Type: language\nSubtag: ae\nDescription: Avestan\nAdded: 2005-10-16',
 'Type: language\nSubtag: af\nDescription: Afrikaans\nAdded: 2005-10-16\nSuppress-Script: Latn',
 'Type: language\nSubtag: ak\nDescription: Akan\nAdded: 2005-10-16\nScope: macrolanguage',
 'Type: language\nSubtag: am\nDescription: Amharic\nAdded: 2005-10-16\nSuppress-Script: Ethi',
 'Type: language\nSubtag: an\nDescription: Aragonese\nAdded: 2005-10-16',
 'Type: language\nSubtag: ar\nDescription: Arabic\nAdded: 2005-10-16\nSuppress-Script: Arab\nScope: macrolanguage',
 'Type: language\nSubtag: as\nDescription: Assamese\nAdded: 2005-10-16\nSuppress-Script: Beng',
 'Type: language\nSubtag: av\nDescription: Avaric\nAdded: 2005-10-16',
 'Type: language\nSubtag: ay\nDescription: Aymara\nAdded: 2005-10-16\nSuppress-Script: Latn\nScope: macrolanguage',
 'Type

In [4]:
def _parseEntry(entry):
    """Parse one registry record into a dict mapping field name to value.

    "Description" may occur several times in a record, so it is always collected
    into a list (empty if the record has none). Every other field is stored as a
    plain string; if such a field repeats, the last occurrence wins.

    Lines that start with whitespace are folded continuation lines (RFC 5646,
    section 3.1.1); they are appended to the value of the preceding field,
    joined by a single space.
    """
    dataSet = {
        "Description": []
    }
    lastKey = None

    for line in entry.split("\n"):
        # Continuation of the previous field's value.
        if lastKey is not None and line[:1] in (" ", "\t"):
            continuation = line.strip()
            if continuation:
                if lastKey == "Description":
                    dataSet["Description"][-1] += " " + continuation
                else:
                    dataSet[lastKey] += " " + continuation
            continue

        # Anything else that is not a "name: value" line is ignored.
        if ": " not in line:
            continue

        # Split at the first separator only, so values that contain ": " stay intact.
        key, value = line.split(": ", 1)

        if key == "Description":
            dataSet["Description"].append(value)
        else:
            dataSet[key] = value

        lastKey = key

    return dataSet


data = [_parseEntry(entry) for entry in entries]

data

[{'Added': '2005-10-16',
  'Description': ['Afar'],
  'Subtag': 'aa',
  'Type': 'language'},
 {'Added': '2005-10-16',
  'Description': ['Abkhazian'],
  'Subtag': 'ab',
  'Suppress-Script': 'Cyrl',
  'Type': 'language'},
 {'Added': '2005-10-16',
  'Description': ['Avestan'],
  'Subtag': 'ae',
  'Type': 'language'},
 {'Added': '2005-10-16',
  'Description': ['Afrikaans'],
  'Subtag': 'af',
  'Suppress-Script': 'Latn',
  'Type': 'language'},
 {'Added': '2005-10-16',
  'Description': ['Akan'],
  'Scope': 'macrolanguage',
  'Subtag': 'ak',
  'Type': 'language'},
 {'Added': '2005-10-16',
  'Description': ['Amharic'],
  'Subtag': 'am',
  'Suppress-Script': 'Ethi',
  'Type': 'language'},
 {'Added': '2005-10-16',
  'Description': ['Aragonese'],
  'Subtag': 'an',
  'Type': 'language'},
 {'Added': '2005-10-16',
  'Description': ['Arabic'],
  'Scope': 'macrolanguage',
  'Subtag': 'ar',
  'Suppress-Script': 'Arab',
  'Type': 'language'},
 {'Added': '2005-10-16',
  'Description': ['Assamese'],
  'Su

In [5]:
dataframe = pandas.DataFrame(data)

# Fields to keep, in display order.
columns = [
    "Added",
    "Comments",
    "Deprecated",
    "Description",
    "Macrolanguage",
    "Preferred-Value",
    "Prefix",
    "Scope",
    "Subtag",
    "Suppress-Script",
    "Tag",
    "Type"
]

# reindex selects and orders the columns like .loc[:, columns] does, but a field
# that does not occur in the downloaded registry becomes an all-NaN column
# instead of raising a KeyError.
dataframe = dataframe.reindex(columns=columns)

dataframe.head()

Unnamed: 0,Added,Comments,Deprecated,Description,Macrolanguage,Preferred-Value,Prefix,Scope,Subtag,Suppress-Script,Tag,Type
0,2005-10-16,,,[Afar],,,,,aa,,,language
1,2005-10-16,,,[Abkhazian],,,,,ab,Cyrl,,language
2,2005-10-16,,,[Avestan],,,,,ae,,,language
3,2005-10-16,,,[Afrikaans],,,,,af,Latn,,language
4,2005-10-16,,,[Akan],,,,macrolanguage,ak,,,language


In [6]:
# Drop language collections. Records without a Scope field (NaN) are kept,
# because NaN never compares equal to "collection".
is_collection = dataframe["Scope"] == "collection"
dataframe = dataframe.loc[~is_collection]

dataframe.shape

(9052, 12)

In [7]:
# Keep only records whose Type field is "language".
is_language = dataframe["Type"].eq("language")
dataframe = dataframe.loc[is_language]

dataframe.shape

(8097, 12)

In [8]:
# Keep only languages that carry a Suppress-Script field.
# NOTE(review): this narrows the table from thousands of languages to a small
# subset; presumably intended as a "major languages" filter — confirm.
has_script = dataframe["Suppress-Script"].notna()
dataframe = dataframe.loc[has_script]

dataframe.shape

(134, 12)

In [9]:
# Only the subtag and its description list are needed from here on.
kept_columns = ["Subtag", "Description"]
dataframe = dataframe.loc[:, kept_columns]

dataframe.head()

Unnamed: 0,Subtag,Description
1,ab,[Abkhazian]
3,af,[Afrikaans]
5,am,[Amharic]
7,ar,[Arabic]
8,as,[Assamese]


In [10]:
def countDescriptions(row):
    """Store the number of descriptions of a row under "Counts".

    The row is modified in place and the same object is returned, which lets the
    function be used with a row-wise ``DataFrame.apply``.
    """
    descriptions = row["Description"]
    row["Counts"] = len(descriptions)
    return row
    
# Row-wise apply: each row gains a "Counts" value with its number of descriptions.
counted = dataframe.apply(countDescriptions, axis="columns")
conflict = counted.copy()

conflict.head()

Unnamed: 0,Subtag,Description,Counts
1,ab,[Abkhazian],1
3,af,[Afrikaans],1
5,am,[Amharic],1
7,ar,[Arabic],1
8,as,[Assamese],1


In [11]:
# Keep only the subtags that have more than one description to choose from.
# The explicit copy makes the result an independent frame, so the column that is
# added to it afterwards does not trigger pandas' SettingWithCopyWarning.
conflict = conflict.loc[conflict["Counts"] > 1].copy()

conflict

Unnamed: 0,Subtag,Description,Counts
18,bn,"[Bengali, Bangla]",2
22,ca,"[Catalan, Valencian]",2
33,dv,"[Dhivehi, Divehi, Maldivian]",3
39,es,"[Spanish, Castilian]",2
60,ht,"[Haitian, Haitian Creole]",2
85,kl,"[Kalaallisut, Greenlandic]",2
86,km,"[Khmer, Central Khmer]",2
96,lb,"[Luxembourgish, Letzeburgesch]",2
110,mo,"[Moldavian, Moldovan]",2
120,nl,"[Dutch, Flemish]",2


In [12]:
# Hand-picked position of the description to keep for each conflicting row
# (0 = first description in the list). The values are matched to the rows of
# `conflict` purely by position, so they depend on the row order of the registry
# snapshot this notebook was written against.
# NOTE(review): presumably has to be re-checked whenever the registry changes — confirm.
index = [0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0]

# Assigning a list as a column requires one value per row of `conflict`.
conflict["Index"] = index

conflict

Unnamed: 0,Subtag,Description,Counts,Index
18,bn,"[Bengali, Bangla]",2,0
22,ca,"[Catalan, Valencian]",2,0
33,dv,"[Dhivehi, Divehi, Maldivian]",3,2
39,es,"[Spanish, Castilian]",2,0
60,ht,"[Haitian, Haitian Creole]",2,0
85,kl,"[Kalaallisut, Greenlandic]",2,1
86,km,"[Khmer, Central Khmer]",2,0
96,lb,"[Luxembourgish, Letzeburgesch]",2,0
110,mo,"[Moldavian, Moldovan]",2,0
120,nl,"[Dutch, Flemish]",2,0


In [13]:
def solveConflict(row):
    """Reduce a row's description list to the single entry selected by "Index".

    The chosen description stays wrapped in a one-element list. The row is
    modified in place and returned.
    """
    chosen = row["Description"][row["Index"]]
    row["Description"] = [chosen]
    return row

# Resolve every conflicting row, then keep only the two columns that are
# written back into the main table.
solved = (
    conflict
    .apply(solveConflict, axis="columns")
    .copy()
    .loc[:, ["Subtag", "Description"]]
)

solved

Unnamed: 0,Subtag,Description
18,bn,[Bengali]
22,ca,[Catalan]
33,dv,[Maldivian]
39,es,[Spanish]
60,ht,[Haitian]
85,kl,[Greenlandic]
86,km,[Khmer]
96,lb,[Luxembourgish]
110,mo,[Moldavian]
120,nl,[Dutch]


In [14]:
def unwrap(value):
    """Return the first element of a description list.

    A plain string is returned unchanged. Without this guard, running the cell a
    second time would index into the already unwrapped names and cut each one
    down to its first character.
    """
    if isinstance(value, str):
        return value
    return value[0]
    

# Overwrite the conflicting rows with their resolved single-description version.
resolved_rows = solved.index
dataframe.loc[resolved_rows, :] = solved

# Every description is a one-element list by now; replace it with its element.
dataframe["Description"] = dataframe["Description"].map(unwrap)

dataframe.head()

Unnamed: 0,Subtag,Description
1,ab,Abkhazian
3,af,Afrikaans
5,am,Amharic
7,ar,Arabic
8,as,Assamese


In [15]:
def removeBrackets(value):
    """Strip a parenthesised suffix from a description.

    Everything from the first " (" onwards is removed; a value without " (" is
    returned unchanged.
    """
    if " (" not in value:
        return value

    head, _, _ = value.partition(" (")
    return head
        
# Remove parenthesised additions from every language name.
cleaned_names = dataframe["Description"].map(removeBrackets)
dataframe["Description"] = cleaned_names

dataframe

Unnamed: 0,Subtag,Description
1,ab,Abkhazian
3,af,Afrikaans
5,am,Amharic
7,ar,Arabic
8,as,Assamese
10,ay,Aymara
13,be,Belarusian
14,bg,Bulgarian
18,bn,Bengali
21,bs,Bosnian


In [16]:
# Column names used in the exported file.
headers = dict(Subtag="code", Description="name")

dataframe = dataframe.rename(columns=headers)

dataframe.head()

Unnamed: 0,code,name
1,ab,Abkhazian
3,af,Afrikaans
5,am,Amharic
7,ar,Arabic
8,as,Assamese


In [17]:
# Export the code/name table without the pandas row index.
# NOTE(review): the target is an absolute Windows path — confirm it exists on the
# machine running this notebook.
csvPath = "C:\\temp\\languages.csv"
dataframe.to_csv(csvPath, index=False)