# preprocessing

In [14]:
import pandas as pd

### Lang_ID to Lang name map

In [15]:
mp = pd.read_csv('../data/language_names.csv')

In [16]:
mp.head(5)

Unnamed: 0,ID,Language_ID,Name,Provider
0,1,xoo,!Xóõ,ethnologue
1,2,arx,'Are'are,ethnologue
2,3,apk gan,A-Pucikwar,ethnologue
3,4,aar,Aari,ethnologue
4,5,aba,Abau,ethnologue


In [17]:
mp.drop(['ID', 'Provider'], axis = 1)
langs = {}
i = 0
for i in range(len(mp.index)):
    abbv, name = mp.iloc[i][1], mp.iloc[i][2]
    langs[abbv] = name

### langID to family

In [18]:
fam = pd.read_csv('../data/languagesMSD.csv')

In [19]:
fam = fam.drop(['NameOLD', 'NameNEW', 'MacroareaOLD', 'MacroareaNEW',
       'LatitudeOLD', 'LatitudeNEW', 'LongitudeOLD', 'LongitudeNEW',
       'GlottocodeOLD', 'GlottocodeNEW', 'ISO639P3code',
       'FamilyNEW', 'SubfamilyOLD', 'SubfamilyNEW', 'GenusOLD', 'GenusNEW',
       'GenusIcon', 'ISO_codesOLD', 'ISO_codesNEW', 'Samples_100',
       'Samples_200', 'Country_IDOLD', 'Country_IDNEW', 'Source'], axis = 1)

In [20]:
fam.columns = ['Language_ID', 'Familiy']

In [21]:
fams = {}
i = 0
for i in range(len(fam.index)):
    abbv, family = fam.iloc[i][0], fam.iloc[i][1]
    fams[abbv] = family

### Loading data set

In [22]:
df = pd.read_csv('../data/values.csv')

In [23]:
df['Name'] = df['Language_ID']
df['Family'] = df['Language_ID']

In [24]:
df['Name'] = df['Name'].map(langs)
df['Family'] = df['Family'].map(fams)

In [25]:
df = df.drop(['ID','Code_ID', 'Comment', 'Source', 'Example_ID' ],axis=1)

In [26]:
df.dropna()

Unnamed: 0,Language_ID,Parameter_ID,Value,Name,Family
22,aar,26A,2,Ari,Afro-Asiatic
23,aar,33A,9,Ari,Afro-Asiatic
24,aar,37A,3,Ari,Afro-Asiatic
25,aar,38A,4,Ari,Afro-Asiatic
26,aar,51A,1,Ari,Afro-Asiatic
...,...,...,...,...,...
76470,zzo,144B,3,"Zapotec, Zoogocho",Oto-Manguean
76471,zzo,144T,1,"Zapotec, Zoogocho",Oto-Manguean
76472,zzo,144V,1,"Zapotec, Zoogocho",Oto-Manguean
76473,zzo,144W,6,"Zapotec, Zoogocho",Oto-Manguean


### preprocessing for U2 (see new notebook "Preprocessing for U2")

86A Order of Genitive and Noun
85A Order of Adposition and Noun Phrase

In [28]:
A86 = df[df['Parameter_ID'] == '86A']
A85 = df[df['Parameter_ID'] == '85A']

In [29]:
A86.head(10)

Unnamed: 0,Language_ID,Parameter_ID,Value,Name,Family
32,aar,86A,1,Ari,Afro-Asiatic
55,aba,86A,1,Abau,Sepik
123,abi,86A,3,Abipon,Guaicuruan
241,abk,86A,1,Abkhaz,Northwest Caucasian
349,abn,86A,3,Arabana,Pama-Nyungan
372,abo,86A,2,Arbore,Afro-Asiatic
400,abu,86A,1,Yimbun,West Papuan
450,abv,86A,1,Abui,Timor-Alor-Pantar
515,ace,86A,2,Achinese,Austronesian
570,acg,86A,1,Achagua,Arawakan


#### 86A:
- Value 1 = Genitive-Noun
- Value 2 = Noun-Genitive
- value 3 = No dominant order

In [30]:
A86.Value = A86.Value.map({1:'Genetive-Noun', 2:'Noun_Genitive', 3:'No dominant genitive noun order'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A86.Value = A86.Value.map({1:'Genetive-Noun', 2:'Noun_Genitive', 3:'No dominant genitive noun order'})


In [31]:
A86.Value.value_counts()

Value
Genetive-Noun                      685
Noun_Genitive                      468
No dominant genitive noun order     96
Name: count, dtype: int64

In [32]:
A86 = A86.dropna(axis=0)

In [33]:
A86.Value.value_counts()

Value
Genetive-Noun                      627
Noun_Genitive                      444
No dominant genitive noun order     89
Name: count, dtype: int64

In [34]:
A86.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1160 entries, 32 to 76434
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Language_ID   1160 non-null   object
 1   Parameter_ID  1160 non-null   object
 2   Value         1160 non-null   object
 3   Name          1160 non-null   object
 4   Family        1160 non-null   object
dtypes: object(5)
memory usage: 54.4+ KB


In [35]:
A86.Parameter_ID = 'A86'


In [36]:
import os
A86.to_csv('../Processed/86A.csv', index = False)

#### A85


value 1 = postpositions
value 2 = prepositions
value 3 = inpositions
value 4 = no dominant order
value 5 = no adpositions

In [37]:
A85.head(10)

Unnamed: 0,Language_ID,Parameter_ID,Value,Name,Family
31,aar,85A,1,Ari,Afro-Asiatic
54,aba,85A,1,Abau,Sepik
122,abi,85A,2,Abipon,Guaicuruan
240,abk,85A,1,Abkhaz,Northwest Caucasian
399,abu,85A,2,Yimbun,West Papuan
449,abv,85A,5,Abui,Timor-Alor-Pantar
514,ace,85A,2,Achinese,Austronesian
569,acg,85A,1,Achagua,Arawakan
620,acl,85A,2,Acholi,Eastern Sudanic
665,acm,85A,2,Achumawi,Hokan


In [38]:
A85.Value.value_counts()

Value
1    577
2    511
4     58
5     30
3      8
Name: count, dtype: int64

In [39]:
A85.Value = A85.Value.map({1:'Postpositions', 2:'Prepositions', 3:'Inpositions', 4: 'No dominant adposition order', 5:'No adpositions'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A85.Value = A85.Value.map({1:'Postpositions', 2:'Prepositions', 3:'Inpositions', 4: 'No dominant adposition order', 5:'No adpositions'})


In [40]:
A85.Value.value_counts()

Value
Postpositions                   577
Prepositions                    511
No dominant adposition order     58
No adpositions                   30
Inpositions                       8
Name: count, dtype: int64

In [41]:
A85 = A85.dropna(axis=0)

In [42]:
A85.Parameter_ID = 'A85'

In [43]:
A85.to_csv('../Processed/85A.csv', index = False)

### pre processing for U3

### pre processing for U4