# preprocessing

In [1]:
import pandas as pd

### Mapping

In [2]:
fam = pd.read_csv('../data/languagesMSD.csv')

fam = fam.drop(['NameOLD', 'MacroareaOLD', 'MacroareaNEW',
       'LatitudeOLD', 'LatitudeNEW', 'LongitudeOLD', 'LongitudeNEW',
       'GlottocodeOLD', 'GlottocodeNEW', 'ISO639P3code',
       'FamilyNEW', 'SubfamilyOLD', 'SubfamilyNEW', 'GenusOLD', 'GenusNEW',
       'GenusIcon', 'ISO_codesOLD', 'ISO_codesNEW', 'Samples_100',
       'Samples_200', 'Country_IDOLD', 'Country_IDNEW', 'Source'], axis = 1)
fam.columns = ['Language_ID','Name', 'Familiy']

fams = {}
names = {}
i = 0
for i in range(len(fam.index)):
    abbv, family, name = fam.iloc[i][0], fam.iloc[i][2], fam.iloc[i][1]
    fams[abbv] = family
    names[abbv] = name

  abbv, family, name = fam.iloc[i][0], fam.iloc[i][2], fam.iloc[i][1]


### Loading data set

In [3]:
df = pd.read_csv('../data/values.csv')

In [4]:
df['Name'] = df['Language_ID']
df['Family'] = df['Language_ID']

In [5]:
df['Name'] = df['Name'].map(names)
df['Family'] = df['Family'].map(fams)

In [6]:
df = df.drop(['ID','Code_ID', 'Comment', 'Source', 'Example_ID' ],axis=1)

In [7]:
df.dropna()

Unnamed: 0,Language_ID,Parameter_ID,Value,Name,Family
0,aab,81A,2,Arapesh (Abu),Torricelli
1,aab,82A,1,Arapesh (Abu),Torricelli
2,aab,83A,2,Arapesh (Abu),Torricelli
3,aab,87A,2,Arapesh (Abu),Torricelli
4,aab,88A,2,Arapesh (Abu),Torricelli
...,...,...,...,...,...
76470,zzo,144B,3,Zapotec (Zoogocho),Oto-Manguean
76471,zzo,144T,1,Zapotec (Zoogocho),Oto-Manguean
76472,zzo,144V,1,Zapotec (Zoogocho),Oto-Manguean
76473,zzo,144W,6,Zapotec (Zoogocho),Oto-Manguean


### Preprocessing for U41

81A Order of Subject, Object and Verb,
49A Number of Cases

In [8]:
A81 = df[df['Parameter_ID'] == '81A']
A49 = df[df['Parameter_ID'] == '49A']

In [9]:
A81.head(10)

Unnamed: 0,Language_ID,Parameter_ID,Value,Name,Family
0,aab,81A,2,Arapesh (Abu),Torricelli
51,aba,81A,1,Abau,Sepik
118,abi,81A,2,Abipón,Guaicuruan
236,abk,81A,1,Abkhaz,Northwest Caucasian
346,abn,81A,1,Arabana,Pama-Nyungan
369,abo,81A,1,Arbore,Afro-Asiatic
395,abu,81A,2,Abun,West Papuan
446,abv,81A,1,Abui,Timor-Alor-Pantar
510,ace,81A,7,Acehnese,Austronesian
566,acg,81A,2,Achagua,Arawakan


81A:
- Value 1 = SOV
- Value 2 = SVO
- Value 3 = VSO
- Value 4 = VOS
- Value 5 = OVS
- Value 6 = OSV
- Value 7 = No dominant order

In [10]:
A81.Value = A81.Value.map({1:'SOV', 2:'SVO', 3:'VSO', 4:'VOS', 5: 'OVS', 6: 'OSV', 7:'No dominant order'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A81.Value = A81.Value.map({1:'SOV', 2:'SVO', 3:'VSO', 4:'VOS', 5: 'OVS', 6: 'OSV', 7:'No dominant order'})


In [11]:
A81.Value.value_counts()

Value
SOV                  564
SVO                  488
No dominant order    189
VSO                   95
VOS                   25
OVS                   11
OSV                    4
Name: count, dtype: int64

In [12]:
A81 = A81.dropna(axis=0)

In [13]:
A81.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1320 entries, 0 to 76448
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Language_ID   1320 non-null   object
 1   Parameter_ID  1320 non-null   object
 2   Value         1320 non-null   object
 3   Name          1320 non-null   object
 4   Family        1320 non-null   object
dtypes: object(5)
memory usage: 61.9+ KB


In [14]:
import os
A81.to_csv('../Processed/81A.csv', index = False)

In [15]:
A49.head(10)

Unnamed: 0,Language_ID,Parameter_ID,Value,Name,Family
107,abi,49A,1,Abipón,Guaicuruan
203,abk,49A,2,Abkhaz,Northwest Caucasian
745,aco,49A,1,Acoma,Keresan
1077,aeg,49A,1,Arabic (Egyptian),Afro-Asiatic
1505,ain,49A,9,Ainu,
1849,ala,49A,7,Alamblak,Sepik
1957,alb,49A,4,Albanian,Indo-European
2048,ale,49A,2,Aleut,Eskimo-Aleut
2433,ame,49A,1,Amele,Trans-New Guinea
2560,amh,49A,2,Amharic,Afro-Asiatic


49A:
- Value 1 = No morphological case-marking
- Value 2 = 2 cases
- Value 3 = 3 cases
- Value 4 = 4 cases
- Value 5 = 5 cases
- Value 6 = 6-7 cases
- Value 7 = 8-9 cases
- Value 8 = 10 or more cases
- Value 9 = Exclusively borderline case-marking

In [16]:
A49.Value = A49.Value.map({1:'No morphological case-marking', 2:'2 cases', 3:'3 cases', 4:'4 cases', 5: '5 cases', 6: '6-7 cases', 7:'8-9 cases', 8:'10 or more cases', 9:'Exclusively borderline case-marking'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A49.Value = A49.Value.map({1:'No morphological case-marking', 2:'2 cases', 3:'3 cases', 4:'4 cases', 5: '5 cases', 6: '6-7 cases', 7:'8-9 cases', 8:'10 or more cases', 9:'Exclusively borderline case-marking'})


In [17]:
A49.Value.value_counts()

Value
No morphological case-marking          100
6-7 cases                               37
Exclusively borderline case-marking     24
10 or more cases                        24
2 cases                                 23
8-9 cases                               23
5 cases                                 12
4 cases                                  9
3 cases                                  9
Name: count, dtype: int64

In [18]:
A49 = A49.dropna(axis=0)

In [19]:
A49.info()

<class 'pandas.core.frame.DataFrame'>
Index: 242 entries, 107 to 76239
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Language_ID   242 non-null    object
 1   Parameter_ID  242 non-null    object
 2   Value         242 non-null    object
 3   Name          242 non-null    object
 4   Family        242 non-null    object
dtypes: object(5)
memory usage: 11.3+ KB


In [20]:
import os
A49.to_csv('../Processed/49A.csv', index = False)