In [6]:
import pandas as pd

# Load the 'Extended_Dataset' sheet. The context manager closes the workbook's
# file handle as soon as the sheet has been parsed.
with pd.ExcelFile('./data_sets/Endangered_Languages.xlsx') as data:
    endangered_languages = data.parse('Extended_Dataset')

necessary_data = ['Name in English', 'Countries', 'Country codes alpha 3', 'Degree of endangerment',
                  'Number of speakers', 'Latitude', 'Longitude']
# .copy() gives an independent frame, so the column assignments below modify it
# directly instead of a possible view (avoids SettingWithCopyWarning).
endangered_languages = endangered_languages[necessary_data].copy()

# 'Country codes alpha 3' holds comma-separated codes. Only the main country
# where the language is spoken is wanted, so keep the first listed code.
# Rows without any code get the placeholder string "None" so they can be
# located below.
# NOTE(review): this trusts the first code in the source data. Belarusian ends
# up with 'BRB' in the head() output further down, which looks like a dataset
# typo for 'BLR' -- confirm against the source and correct if needed.
endangered_languages['Country codes alpha 3'] = (
    endangered_languages['Country codes alpha 3']
    .fillna("None")
    .str.split(',')
    .str[0]
)

# Locate the row in 'Country codes alpha 3' that has no country code.
noneLook = endangered_languages['Country codes alpha 3'] == "None"
# The manual fix below was researched for a single language; fail loudly if the
# data no longer matches that assumption rather than patching the wrong rows.
assert noneLook.sum() == 1, "expected exactly one language without a country code"

# After some research using the coordinates, the country could be identified:
# https://en.wikipedia.org/wiki/Shinasha_language
# Fill in 'Countries' and 'Country codes alpha 3' for that row. Selecting by
# mask and column label (instead of a hard-coded row/column position) keeps
# the fix correct if the sheet's row order or column order ever changes.
endangered_languages.loc[noneLook, 'Countries'] = 'Ethiopia'
endangered_languages.loc[noneLook, 'Country codes alpha 3'] = 'ETH'

# Show the patched row to confirm the fix.
endangered_languages.loc[noneLook].iloc[0]

Name in English                            Boro
Countries                              Ethiopia
Country codes alpha 3                       ETH
Degree of endangerment    Definitely endangered
Number of speakers                        19878
Latitude                                10.3581
Longitude                               35.0024
Name: 405, dtype: object

In [7]:
# Summarise the frame: per-column non-null counts, dtypes and memory usage.
endangered_languages.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2722 entries, 0 to 2721
Data columns (total 7 columns):
Name in English           2722 non-null object
Countries                 2722 non-null object
Country codes alpha 3     2722 non-null object
Degree of endangerment    2722 non-null object
Number of speakers        2539 non-null float64
Latitude                  2719 non-null float64
Longitude                 2719 non-null float64
dtypes: float64(3), object(4)
memory usage: 148.9+ KB


In [10]:
# Store 'Degree of endangerment' as a pandas categorical instead of plain
# Python strings; every other column keeps its current dtype.
endangered_languages = endangered_languages.astype({'Degree of endangerment': 'category'})

In [11]:
# Re-inspect dtypes and memory usage after the category conversion.
endangered_languages.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2722 entries, 0 to 2721
Data columns (total 7 columns):
Name in English           2722 non-null object
Countries                 2722 non-null object
Country codes alpha 3     2722 non-null object
Degree of endangerment    2722 non-null category
Number of speakers        2539 non-null float64
Latitude                  2719 non-null float64
Longitude                 2719 non-null float64
dtypes: category(1), float64(3), object(3)
memory usage: 130.5+ KB


In [12]:
# Boolean mask: True for rows where 'Number of speakers' is missing.
missing_speakers = pd.isnull(endangered_languages['Number of speakers'])
# Display the languages that have no speaker count.
endangered_languages.loc[missing_speakers]

Unnamed: 0,Name in English,Countries,Country codes alpha 3,Degree of endangerment,Number of speakers,Latitude,Longitude
2539,A'tong,India,IND,Severely endangered,,25.2844,91.1755
2540,Aché,Paraguay,PRY,Definitely endangered,,-25.6613,-55.6787
2541,Akie,United Republic of Tanzania,TZA,Critically endangered,,-4.7735,37.3315
2542,Akuriyo,Suriname,SUR,Critically endangered,,2.8442,-55.7556
2543,Alemannic,"Germany, Austria, France, Italy, Liechtenstein...",DEU,Vulnerable,,47.2792,7.6904
2544,Amami,Japan,JPN,Definitely endangered,,28.3672,129.4958
2545,Amok,"China, Lao People's Democratic Republic, Myanm...",CHN,Critically endangered,,21.3610,100.6457
2546,Angku,Myanmar,MMR,Critically endangered,,21.4888,100.8325
2547,Auvergnat,France,FRA,Severely endangered,,45.1278,3.1311
2548,Balti,"India, Pakistan",IND,Vulnerable,,35.0659,76.1901


In [13]:
# Boolean mask: True for rows where 'Latitude' is missing.
missing_lat = pd.isnull(endangered_languages['Latitude'])
# Display the languages that have no latitude recorded.
endangered_languages.loc[missing_lat]

Unnamed: 0,Name in English,Countries,Country codes alpha 3,Degree of endangerment,Number of speakers,Latitude,Longitude
742,Romani (Colombia),Colombia,COL,Vulnerable,4858.0,,
2200,Umbrul,Vanuatu,VUT,Critically endangered,5.0,,
2335,Dorasque,Panama,PAN,Extinct,0.0,,


In [14]:
# Mapping from the source sheet's column headers to the names used in the
# cleaned data set.
col_dict = {'Name in English': 'Language', 'Countries': 'Countries Where Spoken', 'Country codes alpha 3': 'Country Code',
           'Degree of endangerment': 'Degree of Endangerment', 'Number of speakers': 'Speakers', 'Latitude': 'Latitude',
           'Longitude': 'Longitude'}

# DataFrame.rename leaves any column that is not a key of col_dict unchanged,
# which is the same fallback the previous col_dict.get(x, x) comprehension
# implemented by hand.
endangered_languages = endangered_languages.rename(columns=col_dict)
endangered_languages.head()

Unnamed: 0,Language,Countries Where Spoken,Country Code,Degree of Endangerment,Speakers,Latitude,Longitude
0,South Italian,Italy,ITA,Vulnerable,7500000.0,40.9798,15.249
1,Sicilian,Italy,ITA,Vulnerable,5000000.0,37.4399,14.5019
2,Low Saxon,"Germany, Denmark, Netherlands, Poland, Russian...",DEU,Vulnerable,4800000.0,53.4029,10.3601
3,Belarusian,"Belarus, Latvia, Lithuania, Poland, Russian Fe...",BRB,Vulnerable,4000000.0,53.956,27.5756
4,Lombard,"Italy, Switzerland",ITA,Definitely endangered,3500000.0,45.7215,9.3273


In [15]:
# Persist the cleaned frame as CSV. to_csv is called with its defaults, so the
# DataFrame index is written as an unnamed first column of the file.
endangered_languages.to_csv('./data_sets/Endangered_Languages_Clean.csv')