# preprocessing

In [27]:
import pandas as pd

### Mapping


In [28]:
fam = pd.read_csv('../data/languagesMSD.csv')

fam = fam.drop(['NameOLD', 'MacroareaOLD', 'MacroareaNEW',
       'LatitudeOLD', 'LatitudeNEW', 'LongitudeOLD', 'LongitudeNEW',
       'GlottocodeOLD', 'GlottocodeNEW', 'ISO639P3code',
       'FamilyNEW', 'SubfamilyOLD', 'SubfamilyNEW', 'GenusOLD', 'GenusNEW',
       'GenusIcon', 'ISO_codesOLD', 'ISO_codesNEW', 'Samples_100',
       'Samples_200', 'Country_IDOLD', 'Country_IDNEW', 'Source'], axis = 1)
fam.columns = ['Language_ID','Name', 'Familiy']

fams = {}
names = {}
i = 0
for i in range(len(fam.index)):
    abbv, family, name = fam.iloc[i][0], fam.iloc[i][2], fam.iloc[i][1]
    fams[abbv] = family
    names[abbv] = name

  abbv, family, name = fam.iloc[i][0], fam.iloc[i][2], fam.iloc[i][1]


### Loading data set

In [29]:
df = pd.read_csv('../data/values.csv')

In [30]:
df['Name'] = df['Language_ID']
df['Family'] = df['Language_ID']

In [31]:
df['Name'] = df['Name'].map(names)
df['Family'] = df['Family'].map(fams)

In [32]:
df = df.drop(['ID','Code_ID', 'Comment', 'Source', 'Example_ID' ],axis=1)

In [33]:
df.dropna()

Unnamed: 0,Language_ID,Parameter_ID,Value,Name,Family
0,aab,81A,2,Arapesh (Abu),Torricelli
1,aab,82A,1,Arapesh (Abu),Torricelli
2,aab,83A,2,Arapesh (Abu),Torricelli
3,aab,87A,2,Arapesh (Abu),Torricelli
4,aab,88A,2,Arapesh (Abu),Torricelli
...,...,...,...,...,...
76470,zzo,144B,3,Zapotec (Zoogocho),Oto-Manguean
76471,zzo,144T,1,Zapotec (Zoogocho),Oto-Manguean
76472,zzo,144V,1,Zapotec (Zoogocho),Oto-Manguean
76473,zzo,144W,6,Zapotec (Zoogocho),Oto-Manguean


### preprocessing for U2

86A Order of Genitive and Noun
85A Order of Adposition and Noun Phrase

In [34]:
A86 = df[df['Parameter_ID'] == '86A']
A85 = df[df['Parameter_ID'] == '85A']

In [35]:
A86.head(10)

Unnamed: 0,Language_ID,Parameter_ID,Value,Name,Family
32,aar,86A,1,Aari,Afro-Asiatic
55,aba,86A,1,Abau,Sepik
123,abi,86A,3,Abipón,Guaicuruan
241,abk,86A,1,Abkhaz,Northwest Caucasian
349,abn,86A,3,Arabana,Pama-Nyungan
372,abo,86A,2,Arbore,Afro-Asiatic
400,abu,86A,1,Abun,West Papuan
450,abv,86A,1,Abui,Timor-Alor-Pantar
515,ace,86A,2,Acehnese,Austronesian
570,acg,86A,1,Achagua,Arawakan


#### 86A:
- Value 1 = Genitive-Noun
- Value 2 = Noun-Genitive
- Value 3 = No dominant order

In [36]:
A86.Value = A86.Value.map({1:'Genitive-Noun', 2:'Noun-Genitive', 3:'No_dominant_genitive_noun_order'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A86.Value = A86.Value.map({1:'Genitive-Noun', 2:'Noun-Genitive', 3:'No_dominant_genitive_noun_order'})


In [37]:
A86.Value.value_counts()

Value
Genitive-Noun                      685
Noun-Genitive                      468
No_dominant_genitive_noun_order     96
Name: count, dtype: int64

In [38]:
A86 = A86.dropna(axis=0)

In [39]:
A86.Value.value_counts()

Value
Genitive-Noun                      648
Noun-Genitive                      460
No_dominant_genitive_noun_order     92
Name: count, dtype: int64

In [40]:
A86.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1200 entries, 32 to 76434
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Language_ID   1200 non-null   object
 1   Parameter_ID  1200 non-null   object
 2   Value         1200 non-null   object
 3   Name          1200 non-null   object
 4   Family        1200 non-null   object
dtypes: object(5)
memory usage: 56.2+ KB


In [41]:
A86.Parameter_ID = 'A86'


In [42]:
import os
A86.to_csv('../Processed/86A.csv', index = False)

#### A85:
- Value 1 = postpositions
- Value 2 = prepositions
- Value 3 = inpositions
- Value 4 = no dominant order
- Value 5 = no adpositions

In [43]:
A85.head(10)

Unnamed: 0,Language_ID,Parameter_ID,Value,Name,Family
31,aar,85A,1,Aari,Afro-Asiatic
54,aba,85A,1,Abau,Sepik
122,abi,85A,2,Abipón,Guaicuruan
240,abk,85A,1,Abkhaz,Northwest Caucasian
399,abu,85A,2,Abun,West Papuan
449,abv,85A,5,Abui,Timor-Alor-Pantar
514,ace,85A,2,Acehnese,Austronesian
569,acg,85A,1,Achagua,Arawakan
620,acl,85A,2,Acholi,Eastern Sudanic
665,acm,85A,2,Achumawi,Hokan


In [44]:
A85.Value.value_counts()

Value
1    577
2    511
4     58
5     30
3      8
Name: count, dtype: int64

In [45]:
A85.Value = A85.Value.map({1:'Postpositions', 2:'Prepositions', 3:'Inpositions', 4: 'No dominant adposition order', 5:'No adpositions'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A85.Value = A85.Value.map({1:'Postpositions', 2:'Prepositions', 3:'Inpositions', 4: 'No dominant adposition order', 5:'No adpositions'})


In [46]:
A85.Value.value_counts()

Value
Postpositions                   577
Prepositions                    511
No dominant adposition order     58
No adpositions                   30
Inpositions                       8
Name: count, dtype: int64

In [47]:
A85 = A85.dropna(axis=0)

In [48]:
A85.Parameter_ID = 'A85'

In [49]:
A85.to_csv('../Processed/85A.csv', index = False)