# Data Preparation


In [1]:
# --------------------------------------------------------------------------------------------------------
# ciaFactBook.ipynb -- data preparation, inspection, and visualization 
# --------------------------------------------------------------------------------------------------------
# 0.01|03.01.22|AB|creation
# 0.02|16.01.22|AB|filter Europe, cleanse data, save as csv
# 0.03| WIP
# --------------------------------------------------------------------------------------------------------

# initialize

import numpy as np
import pandas as pd
#import glob
import os
import matplotlib.pyplot as plt

#from datetime import datetime, date 
import re # regular expression
import seaborn as sns

# %matplotlib inline
# %matplotlib notebook

PATH_DATA = r'../../data/'
INTERACTIVE = False
VALUE_COLNAME = 'language'  # name for new column containing value

if INTERACTIVE:
    %matplotlib notebook
else:
    %matplotlib inline 

## Languages per country (official, minority)

_The number of persons having their usual residence in a country on 1 January of the respective year. When usually resident population is not available, countries may report legal or registered residents._

https://www.cia.gov/the-world-factbook/about/archives/


In [2]:
fileName = PATH_DATA + 'countries_languages_all.csv'
# header = 1st line
data = pd.read_csv(fileName, sep=',', header=0) 

In [3]:
data

Unnamed: 0,Country,Recognized Languages
0,Afghanistan,Afghan Persian or Dari (official) 77% (Dari fu...
1,Albania,Albanian 98.8% (official - derived from Tosk d...
2,Algeria,"Arabic (official), French (lingua franca), Ber..."
3,Andorra,"Catalan (official), French, Castilian, Portuguese"
4,Angola,"Portuguese 71.2% (official), Umbundu 23%, Kiko..."
...,...,...
209,Vietnam,"Vietnamese (official), English (increasingly f..."
210,Western Sahara (proposed state),"Standard Arabic, Hassaniya Arabic, Moroccan Ar..."
211,Yemen,Arabic (official)
212,Zambia,"Bemba 33.4%, Nyanja 14.7%, Tonga 11.4%, Lozi 5..."


Non-European countries to be  filtered out.
Second (unstructured) column is transformed as follows:
- Split text into 1 column per language
- add one column for share / official (yes/no)

In [4]:
# data['Recognized Languages'].str.extract([A-Z][a-z]+ )

# https://stackoverflow.com/questions/14745022/how-to-split-a-dataframe-string-column-into-two-columns/21296915#21296915
data = data.join(data['Recognized Languages'].str.split(',', expand=True))

In [5]:
# drop split column
data.pop('Recognized Languages')
data

Unnamed: 0,Country,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,Afghanistan,Afghan Persian or Dari (official) 77% (Dari fu...,Pashto (official) 48%,Uzbek 11%,English 6%,Turkmen 3%,Urdu 3%,Pashayi 1%,Nuristani 1%,Arabic 1%,Balochi 1% (2017 est.),,,,,,,,,
1,Albania,Albanian 98.8% (official - derived from Tosk d...,Greek 0.5%,other 0.6% (including Macedonian,Romani,Vlach,Turkish,Italian,and Serbo-Croatian),unspecified 0.1% (2011 est.),,,,,,,,,,
2,Algeria,Arabic (official),French (lingua franca),Berber or Tamazight (official); dialects incl...,Shawiya Berber (Tacawit),Mzab Berber,Tuareg Berber (Tamahaq),,,,,,,,,,,,,
3,Andorra,Catalan (official),French,Castilian,Portuguese,,,,,,,,,,,,,,,
4,Angola,Portuguese 71.2% (official),Umbundu 23%,Kikongo 8.2%,Kimbundu 7.8%,Chokwe 6.5%,Nhaneca 3.4%,Nganguela 3.1%,Fiote 2.4%,Kwanhama 2.3%,Muhumbi 2.1%,Luvale 1%,other 3.6% (2014 est.),,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209,Vietnam,Vietnamese (official),English (increasingly favored as a second lan...,some French,Chinese,and Khmer,mountain area languages (Mon-Khmer and Malayo...,,,,,,,,,,,,,
210,Western Sahara (proposed state),Standard Arabic,Hassaniya Arabic,Moroccan Arabic,Berber,Spanish,French,,,,,,,,,,,,,
211,Yemen,Arabic (official),,,,,,,,,,,,,,,,,,
212,Zambia,Bemba 33.4%,Nyanja 14.7%,Tonga 11.4%,Lozi 5.5%,Chewa 4.5%,Nsenga 2.9%,Tumbuka 2.5%,Lunda (North Western) 1.9%,Kaonde 1.8%,Lala 1.8%,Lamba 1.8%,English (official) 1.7%,Luvale 1.5%,Mambwe 1.3%,Namwanga 1.2%,Lenje 1.1%,Bisa 1%,other 9.7%,unspecified 0.2% (2010 est.)


In [6]:
# create a row for each country-language combination (pivot wide -> long)
# key is country (for each language and country, a row is to be created)
#data = 
data = data.melt(['Country'], value_name = 'languageRaw', var_name = 'langId')
#, var_name = 'langId', value_vars = None, value_name = None)   #, value_name = VALUE_COLNAME)
data

Unnamed: 0,Country,langId,languageRaw
0,Afghanistan,0,Afghan Persian or Dari (official) 77% (Dari fu...
1,Albania,0,Albanian 98.8% (official - derived from Tosk d...
2,Algeria,0,Arabic (official)
3,Andorra,0,Catalan (official)
4,Angola,0,Portuguese 71.2% (official)
...,...,...,...
4061,Vietnam,18,
4062,Western Sahara (proposed state),18,
4063,Yemen,18,
4064,Zambia,18,unspecified 0.2% (2010 est.)


In [7]:
# remove empty languages
data = data.dropna()
data
# note: the number of languages spoken in a given country is equal to max(langId)+1; 
# most languages are spoken in Zambia (18)

Unnamed: 0,Country,langId,languageRaw
0,Afghanistan,0,Afghan Persian or Dari (official) 77% (Dari fu...
1,Albania,0,Albanian 98.8% (official - derived from Tosk d...
2,Algeria,0,Arabic (official)
3,Andorra,0,Catalan (official)
4,Angola,0,Portuguese 71.2% (official)
...,...,...,...
3572,Panama,16,Japanese)
3636,Zambia,16,Bisa 1%
3637,Zimbabwe,16,and Xhosa)
3850,Zambia,17,other 9.7%


In [8]:
# European languages only
# join with european_languages.csv
eur_lang = pd.read_csv("../../data/general/european_languages.csv")
eur_lang

Unnamed: 0,LangID,CountryID,LangStatus,Name_lang,Name_country,Area
0,aae,IT,L,"Albanian, Arbëreshë",Italy,Europe
1,cim,IT,L,Cimbrian,Italy,Europe
2,egl,IT,L,Emilian,Italy,Europe
3,fur,IT,L,Friulian,Italy,Europe
4,ils,IT,L,International Sign,Italy,Europe
...,...,...,...,...,...,...
288,vgt,BE,L,Flemish Sign Language,Belgium,Europe
289,vls,BE,L,West Flemish,Belgium,Europe
290,wln,BE,L,Walloon,Belgium,Europe
291,slv,SI,L,Slovene,Slovenia,Europe


In [9]:
# semi join with European languages
data = data[data.Country.isin(eur_lang.Name_country)]
data

Unnamed: 0,Country,langId,languageRaw
1,Albania,0,Albanian 98.8% (official - derived from Tosk d...
9,Austria,0,German (official nationwide) 88.6%
15,Belarus,0,Russian (official) 70.2%
16,Belgium,0,Dutch (official) 60%
21,Bosnia and Herzegovina,0,Bosnian (official) 52.9%
...,...,...,...
1821,Luxembourg,8,other 8.4% (2011 est.)
1889,Spain,8,000 speakers)
1894,Switzerland,8,Romansh (official) 0.5%
1989,France,9,Picard)


In [10]:
# -- create an index
data.set_index(['Country', 'langId'], inplace = True)

In [11]:
# mark official languages (new column official [Boolean])
pattern = "official"  
# data[data['languageRaw'].str.contains(pattern)]
data['official'] = data['languageRaw'].str.contains(pattern)
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['official'] = data['languageRaw'].str.contains(pattern)


Unnamed: 0_level_0,Unnamed: 1_level_0,languageRaw,official
Country,langId,Unnamed: 2_level_1,Unnamed: 3_level_1
Albania,0,Albanian 98.8% (official - derived from Tosk d...,True
Austria,0,German (official nationwide) 88.6%,True
Belarus,0,Russian (official) 70.2%,True
Belgium,0,Dutch (official) 60%,True
Bosnia and Herzegovina,0,Bosnian (official) 52.9%,True
...,...,...,...
Luxembourg,8,other 8.4% (2011 est.),False
Spain,8,000 speakers),False
Switzerland,8,Romansh (official) 0.5%,True
France,9,Picard),False


In [12]:
# Extract actual language name, identified as the first capitalized word (this heuristic introduces some NA's)
pattern = r'(?P<language>[A-Z][a-z]+)'  # named group becomes new column 'language'
data['languageRaw'].str.extract(pattern)
# data = data[data[colName].str.fullmatch(pattern)]

Unnamed: 0_level_0,Unnamed: 1_level_0,language
Country,langId,Unnamed: 2_level_1
Albania,0,Albanian
Austria,0,German
Belarus,0,Russian
Belgium,0,Dutch
Bosnia and Herzegovina,0,Bosnian
...,...,...
Luxembourg,8,
Spain,8,
Switzerland,8,Romansh
France,9,Picard


In [13]:
data = data.join(data['languageRaw'].str.extract(pattern))
data = data.dropna()

In [14]:
data.sort_values(by = ["Country", "langId"], inplace=True)

In [15]:
# data cleansing

data.drop(("Austria", 5), axis=0, inplace = True)
data.drop(("Ireland", 3), axis=0, inplace = True)
data.drop(("Ireland", 4), axis=0, inplace = True)
data.loc["United Kingdom", "official"] = True
data.loc[("Albania", 7), "language"] = "Croatian"
data

Unnamed: 0_level_0,Unnamed: 1_level_0,languageRaw,official,language
Country,langId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Albania,0,Albanian 98.8% (official - derived from Tosk d...,True,Albanian
Albania,1,Greek 0.5%,False,Greek
Albania,2,other 0.6% (including Macedonian,False,Macedonian
Albania,3,Romani,False,Romani
Albania,4,Vlach,False,Vlach
...,...,...,...,...
Ukraine,1,Russian (regional language) 29.6%,False,Russian
Ukraine,2,other (includes small Crimean Tatar-,False,Crimean
Ukraine,3,Moldovan/Romanian-,False,Moldovan
Ukraine,4,and Hungarian-speaking minorities) 2.9% (2001...,False,Hungarian


In [16]:
# save processed data
data.to_csv(os.path.join(PATH_DATA, "general", "countries_languages_eur.csv"), index=True)