# preprocessing

In [2]:
import pandas as pd

### Lang_ID to Lang name map

In [3]:
mp = pd.read_csv('../data/language_names.csv')

In [4]:
mp.head(5)

Unnamed: 0,ID,Language_ID,Name,Provider
0,1,xoo,!Xóõ,ethnologue
1,2,arx,'Are'are,ethnologue
2,3,apk gan,A-Pucikwar,ethnologue
3,4,aar,Aari,ethnologue
4,5,aba,Abau,ethnologue


In [5]:
mp.drop(['ID', 'Provider'], axis = 1)
langs = {}
i = 0
for i in range(len(mp.index)):
    abbv, name = mp.iloc[i][1], mp.iloc[i][2]
    langs[abbv] = name

### langID to family

In [6]:
fam = pd.read_csv('../data/languagesMSD.csv')

In [7]:
fam = fam.drop(['NameOLD', 'NameNEW', 'MacroareaOLD', 'MacroareaNEW',
       'LatitudeOLD', 'LatitudeNEW', 'LongitudeOLD', 'LongitudeNEW',
       'GlottocodeOLD', 'GlottocodeNEW', 'ISO639P3code',
       'FamilyNEW', 'SubfamilyOLD', 'SubfamilyNEW', 'GenusOLD', 'GenusNEW',
       'GenusIcon', 'ISO_codesOLD', 'ISO_codesNEW', 'Samples_100',
       'Samples_200', 'Country_IDOLD', 'Country_IDNEW', 'Source'], axis = 1)

In [8]:
fam.columns = ['Language_ID', 'Familiy']

In [9]:
fams = {}
i = 0
for i in range(len(fam.index)):
    abbv, family = fam.iloc[i][0], fam.iloc[i][1]
    fams[abbv] = family

### Loading data set

In [10]:
df = pd.read_csv('../data/values.csv')

In [11]:
df['Name'] = df['Language_ID']
df['Family'] = df['Language_ID']

In [12]:
df['Name'] = df['Name'].map(langs)
df['Family'] = df['Family'].map(fams)

In [13]:
df = df.drop(['ID','Code_ID', 'Comment', 'Source', 'Example_ID' ],axis=1)

In [14]:
df.dropna()

Unnamed: 0,Language_ID,Parameter_ID,Value,Name,Family
22,aar,26A,2,Ari,Afro-Asiatic
23,aar,33A,9,Ari,Afro-Asiatic
24,aar,37A,3,Ari,Afro-Asiatic
25,aar,38A,4,Ari,Afro-Asiatic
26,aar,51A,1,Ari,Afro-Asiatic
...,...,...,...,...,...
76470,zzo,144B,3,"Zapotec, Zoogocho",Oto-Manguean
76471,zzo,144T,1,"Zapotec, Zoogocho",Oto-Manguean
76472,zzo,144V,1,"Zapotec, Zoogocho",Oto-Manguean
76473,zzo,144W,6,"Zapotec, Zoogocho",Oto-Manguean


### Preprocessing for U24

90A Order of Relative Clause and Noun, 
85A Order of Adposition and Noun Phrase, 
87A Order of Adjective and Noun

In [15]:
A90 = df[df['Parameter_ID'] == '90A']
A87 = df[df['Parameter_ID'] == '87A']

In [16]:
A90.head(3)

Unnamed: 0,Language_ID,Parameter_ID,Value,Name,Family
36,aar,90A,1,Ari,Afro-Asiatic
126,abi,90A,1,Abipon,Guaicuruan
245,abk,90A,2,Abkhaz,Northwest Caucasian


90A:
- Value 1 = Noun-Relative clause
- Value 2 = Relative clause-Noun
- Value 3 = Internally headed
- Value 4 = Correlative
- Value 5 = Adjoined
- Value 6 = Doubly headed
- Value 7 = Mixed

In [19]:
A90.Value = A90.Value.map({1:'Noun-Relative clause', 2:'Relative clause-Noun', 3:'Internally headed', 4:'Correlative', 5:'Adjoined', 6:'Doubly headed', 7:'Mixed'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A90.Value = A90.Value.map({1:'Noun-Relative clause', 2:'Relative clause-Noun', 3:'Internally headed', 4:'Correlative', 5:'Adjoined', 6:'Doubly headed', 7:'Mixed'})


In [20]:
A90.Value.value_counts()

Value
Noun-Relative clause    579
Relative clause-Noun    141
Mixed                    64
Internally headed        24
Adjoined                  8
Correlative               7
Doubly headed             1
Name: count, dtype: int64

In [21]:
A90 = A90.dropna(axis=0)

In [22]:
A90.info()

<class 'pandas.core.frame.DataFrame'>
Index: 769 entries, 36 to 76454
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Language_ID   769 non-null    object
 1   Parameter_ID  769 non-null    object
 2   Value         769 non-null    object
 3   Name          769 non-null    object
 4   Family        769 non-null    object
dtypes: object(5)
memory usage: 36.0+ KB


In [23]:
import os
A90.to_csv('../Processed/90A.csv', index = False)

In [24]:
A87.head(3)

Unnamed: 0,Language_ID,Parameter_ID,Value,Name,Family
3,aab,87A,2,,Torricelli
33,aar,87A,2,Ari,Afro-Asiatic
124,abi,87A,3,Abipon,Guaicuruan


87A:
- Value 1 = Adjective-Noun
- Value 2 = Noun-Adjective
- Value 3 = No dominant order
- Value 4 = Only internally-headed relative clauses

In [25]:
A87.Value = A87.Value.map({1:'Adjective-Noun', 2:'Noun-Adjective', 3:'No dominant order', 4:'Only internally-headed relative clauses'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A87.Value = A87.Value.map({1:'Adjective-Noun', 2:'Noun-Adjective', 3:'No dominant order', 4:'Only internally-headed relative clauses'})


In [26]:
A87.Value.value_counts()

Value
Noun-Adjective                             879
Adjective-Noun                             373
No dominant order                          110
Only internally-headed relative clauses      5
Name: count, dtype: int64

In [27]:
A87 = A87.dropna(axis=0)

In [28]:
A87.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1264 entries, 33 to 76451
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Language_ID   1264 non-null   object
 1   Parameter_ID  1264 non-null   object
 2   Value         1264 non-null   object
 3   Name          1264 non-null   object
 4   Family        1264 non-null   object
dtypes: object(5)
memory usage: 59.2+ KB


In [29]:
import os
A87.to_csv('../Processed/87A.csv', index = False)