# preprocessing

In [1]:
import pandas as pd

### Lang_ID to Lang name map

In [2]:
mp = pd.read_csv('../data/language_names.csv')

In [3]:
mp.head(5)

Unnamed: 0,ID,Language_ID,Name,Provider
0,1,xoo,!Xóõ,ethnologue
1,2,arx,'Are'are,ethnologue
2,3,apk gan,A-Pucikwar,ethnologue
3,4,aar,Aari,ethnologue
4,5,aba,Abau,ethnologue


In [4]:
mp.drop(['ID', 'Provider'], axis = 1)
langs = {}
i = 0
for i in range(len(mp.index)):
    abbv, name = mp.iloc[i][1], mp.iloc[i][2]
    langs[abbv] = name

### langID to family

In [5]:
fam = pd.read_csv('../data/languagesMSD.csv')

In [6]:
fam = fam.drop(['NameOLD', 'NameNEW', 'MacroareaOLD', 'MacroareaNEW',
       'LatitudeOLD', 'LatitudeNEW', 'LongitudeOLD', 'LongitudeNEW',
       'GlottocodeOLD', 'GlottocodeNEW', 'ISO639P3code',
       'FamilyNEW', 'SubfamilyOLD', 'SubfamilyNEW', 'GenusOLD', 'GenusNEW',
       'GenusIcon', 'ISO_codesOLD', 'ISO_codesNEW', 'Samples_100',
       'Samples_200', 'Country_IDOLD', 'Country_IDNEW', 'Source'], axis = 1)

In [7]:
fam.columns = ['Language_ID', 'Familiy']

In [8]:
fams = {}
i = 0
for i in range(len(fam.index)):
    abbv, family = fam.iloc[i][0], fam.iloc[i][1]
    fams[abbv] = family

### Loading data set

In [9]:
df = pd.read_csv('../data/values.csv')

In [10]:
df['Name'] = df['Language_ID']
df['Family'] = df['Language_ID']

In [11]:
df['Name'] = df['Name'].map(langs)
df['Family'] = df['Family'].map(fams)

In [12]:
df = df.drop(['ID','Code_ID', 'Comment', 'Source', 'Example_ID' ],axis=1)

In [13]:
df.dropna()

Unnamed: 0,Language_ID,Parameter_ID,Value,Name,Family
22,aar,26A,2,Ari,Afro-Asiatic
23,aar,33A,9,Ari,Afro-Asiatic
24,aar,37A,3,Ari,Afro-Asiatic
25,aar,38A,4,Ari,Afro-Asiatic
26,aar,51A,1,Ari,Afro-Asiatic
...,...,...,...,...,...
76470,zzo,144B,3,"Zapotec, Zoogocho",Oto-Manguean
76471,zzo,144T,1,"Zapotec, Zoogocho",Oto-Manguean
76472,zzo,144V,1,"Zapotec, Zoogocho",Oto-Manguean
76473,zzo,144W,6,"Zapotec, Zoogocho",Oto-Manguean


### Preprocessing for U30

29A Syncretism in Verbal Person/Number Marking, 30A Number of Genders, 21B Exponence of Tense-Aspect-Mood Inflection, 69A Position of Tense-Aspect Affixes

In [14]:
A29 = df[df['Parameter_ID'] == '29A']
A30 = df[df['Parameter_ID'] == '30A']
B21 = df[df['Parameter_ID'] == '21B']
A69 = df[df['Parameter_ID'] == '69A']

In [15]:
A29.head(3)

Unnamed: 0,Language_ID,Parameter_ID,Value,Name,Family
100,abi,29A,2,Abipon,Guaicuruan
185,abk,29A,3,Abkhaz,Northwest Caucasian
728,aco,29A,3,Keres (Western),Keresan


29A:
- Value 1 = No subject person/number marking
- Value 2 = Syncretic
- Value 3 = Not syncretic

In [17]:
A29.Value = A29.Value.map({1:'No subject person/number marking', 2:'Syncretic', 3:'Not syncretic'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A29.Value = A29.Value.map({1:'No subject person/number marking', 2:'Syncretic', 3:'Not syncretic'})


In [18]:
A29.Value.value_counts()

Value
Not syncretic                       81
Syncretic                           60
No subject person/number marking    57
Name: count, dtype: int64

In [19]:
A29 = A29.dropna(axis=0)

In [20]:
A29.info()

<class 'pandas.core.frame.DataFrame'>
Index: 177 entries, 100 to 76222
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Language_ID   177 non-null    object
 1   Parameter_ID  177 non-null    object
 2   Value         177 non-null    object
 3   Name          177 non-null    object
 4   Family        177 non-null    object
dtypes: object(5)
memory usage: 8.3+ KB


In [21]:
import os
A29.to_csv('../Processed/29A.csv', index = False)

In [22]:
A30.head(3)

Unnamed: 0,Language_ID,Parameter_ID,Value,Name,Family
186,abk,30A,3,Abkhaz,Northwest Caucasian
330,abn,30A,1,Arabana,Pama-Nyungan
365,abo,30A,2,Arbore,Afro-Asiatic


30A:
- Value 1 = None
- Value 2 = Two
- Value 3 = Three
- Value 4 = Four
- Value 5 = Five or more


In [24]:
A30.Value = A30.Value.map({1:'None', 2:'Two', 3:'Three', 4:'Four', 5: 'Five or more'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A30.Value = A30.Value.map({1:'None', 2:'Two', 3:'Three', 4:'Four', 5: 'Five or more'})


In [25]:
A30.Value.value_counts()

Value
None            145
Two              50
Three            26
Five or more     24
Four             12
Name: count, dtype: int64

In [26]:
A30 = A30.dropna(axis=0)

In [27]:
A30.info()

<class 'pandas.core.frame.DataFrame'>
Index: 237 entries, 186 to 76223
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Language_ID   237 non-null    object
 1   Parameter_ID  237 non-null    object
 2   Value         237 non-null    object
 3   Name          237 non-null    object
 4   Family        237 non-null    object
dtypes: object(5)
memory usage: 11.1+ KB


In [28]:
import os
A30.to_csv('../Processed/30A.csv', index = False)

In [29]:
B21.head(3)

Unnamed: 0,Language_ID,Parameter_ID,Value,Name,Family
97,abi,21B,1,Abipon,Guaicuruan
176,abk,21B,1,Abkhaz,Northwest Caucasian
720,aco,21B,2,Keres (Western),Keresan


21B:
- Value 1 = monoexponential TAM
- Value 2 = TAM+agreement
- Value 3 = TAM+agreement+diathesis
- Value 4 = TAM+agreement+construct
- Value 5 = TAM+polarity
- Value 6 = no TAM

In [31]:
B21.Value = B21.Value.map({1:'monoexponential TAM', 2:'TAM+agreement', 3:'TAM+agreement+diathesis', 4:'TAM+agreement+construct', 5: 'TAM+polarity', 6: 'no TAM'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  B21.Value = B21.Value.map({1:'monoexponential TAM', 2:'TAM+agreement', 3:'TAM+agreement+diathesis', 4:'TAM+agreement+construct', 5: 'TAM+polarity', 6: 'no TAM'})


In [32]:
B21.Value.value_counts()

Value
monoexponential TAM        127
TAM+agreement               19
TAM+polarity                 5
TAM+agreement+diathesis      4
no TAM                       4
TAM+agreement+construct      1
Name: count, dtype: int64

In [33]:
B21 = B21.dropna(axis=0)

In [34]:
B21.info()

<class 'pandas.core.frame.DataFrame'>
Index: 141 entries, 97 to 76213
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Language_ID   141 non-null    object
 1   Parameter_ID  141 non-null    object
 2   Value         141 non-null    object
 3   Name          141 non-null    object
 4   Family        141 non-null    object
dtypes: object(5)
memory usage: 6.6+ KB


In [35]:
import os
B21.to_csv('../Processed/21B.csv', index = False)

In [36]:
A69.head(3)

Unnamed: 0,Language_ID,Parameter_ID,Value,Name,Family
28,aar,69A,2,Ari,Afro-Asiatic
115,abi,69A,2,Abipon,Guaicuruan
223,abk,69A,2,Abkhaz,Northwest Caucasian


69A:
- Value 1 = Tense-aspect prefixes
- Value 2 = Tense-aspect suffixes
- Value 3 = Tense-aspect tone
- Value 4 = Mixed type
- Value 5 = No tense-aspect inflection

In [38]:
A69.Value = A69.Value.map({1:'Tense-aspect prefixes', 2:'Tense-aspect suffixes', 3:'Tense-aspect tone', 4:'Mixed type', 5: 'No tense-aspect inflection'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A69.Value = A69.Value.map({1:'Tense-aspect prefixes', 2:'Tense-aspect suffixes', 3:'Tense-aspect tone', 4:'Mixed type', 5: 'No tense-aspect inflection'})


In [39]:
A69.Value.value_counts()

Value
Tense-aspect suffixes         667
Tense-aspect prefixes         153
No tense-aspect inflection    152
Mixed type                    146
Tense-aspect tone              13
Name: count, dtype: int64

In [40]:
A69 = A69.dropna(axis=0)

In [41]:
A69.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1054 entries, 28 to 76447
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Language_ID   1054 non-null   object
 1   Parameter_ID  1054 non-null   object
 2   Value         1054 non-null   object
 3   Name          1054 non-null   object
 4   Family        1054 non-null   object
dtypes: object(5)
memory usage: 49.4+ KB


In [42]:
import os
A69.to_csv('../Processed/69A.csv', index = False)