# preprocessing

In [1]:
import pandas as pd

### Lang_ID to Lang name map

In [2]:
mp = pd.read_csv('../data/language_names.csv')

In [3]:
mp.head(5)

Unnamed: 0,ID,Language_ID,Name,Provider
0,1,xoo,!Xóõ,ethnologue
1,2,arx,'Are'are,ethnologue
2,3,apk gan,A-Pucikwar,ethnologue
3,4,aar,Aari,ethnologue
4,5,aba,Abau,ethnologue


In [4]:
mp.drop(['ID', 'Provider'], axis = 1)
langs = {}
i = 0
for i in range(len(mp.index)):
    abbv, name = mp.iloc[i][1], mp.iloc[i][2]
    langs[abbv] = name

### langID to family

In [5]:
fam = pd.read_csv('../data/languagesMSD.csv')

In [6]:
fam = fam.drop(['NameOLD', 'NameNEW', 'MacroareaOLD', 'MacroareaNEW',
       'LatitudeOLD', 'LatitudeNEW', 'LongitudeOLD', 'LongitudeNEW',
       'GlottocodeOLD', 'GlottocodeNEW', 'ISO639P3code',
       'FamilyNEW', 'SubfamilyOLD', 'SubfamilyNEW', 'GenusOLD', 'GenusNEW',
       'GenusIcon', 'ISO_codesOLD', 'ISO_codesNEW', 'Samples_100',
       'Samples_200', 'Country_IDOLD', 'Country_IDNEW', 'Source'], axis = 1)

In [7]:
fam.columns = ['Language_ID', 'Familiy']

In [8]:
fams = {}
i = 0
for i in range(len(fam.index)):
    abbv, family = fam.iloc[i][0], fam.iloc[i][1]
    fams[abbv] = family

### Loading data set

In [9]:
df = pd.read_csv('../data/values.csv')

In [10]:
df['Name'] = df['Language_ID']
df['Family'] = df['Language_ID']

In [11]:
df['Name'] = df['Name'].map(langs)
df['Family'] = df['Family'].map(fams)

In [12]:
df = df.drop(['ID','Code_ID', 'Comment', 'Source', 'Example_ID' ],axis=1)

In [13]:
df.dropna()

Unnamed: 0,Language_ID,Parameter_ID,Value,Name,Family
22,aar,26A,2,Ari,Afro-Asiatic
23,aar,33A,9,Ari,Afro-Asiatic
24,aar,37A,3,Ari,Afro-Asiatic
25,aar,38A,4,Ari,Afro-Asiatic
26,aar,51A,1,Ari,Afro-Asiatic
...,...,...,...,...,...
76470,zzo,144B,3,"Zapotec, Zoogocho",Oto-Manguean
76471,zzo,144T,1,"Zapotec, Zoogocho",Oto-Manguean
76472,zzo,144V,1,"Zapotec, Zoogocho",Oto-Manguean
76473,zzo,144W,6,"Zapotec, Zoogocho",Oto-Manguean


### Preprocessing for U45

44A Gender Distinctions in Independent Personal Pronouns

In [14]:
A44 = df[df['Parameter_ID'] == '44A']

In [15]:
A44.head(10)

Unnamed: 0,Language_ID,Parameter_ID,Value,Name,Family
105,abi,44A,6,Abipon,Guaicuruan
199,abk,44A,1,Abkhaz,Northwest Caucasian
338,abn,44A,6,Arabana,Pama-Nyungan
384,abu,44A,3,Yimbun,West Papuan
497,ace,44A,6,Achinese,Austronesian
655,acm,44A,6,Achumawi,Hokan
976,adz,44A,6,Adzera,Austronesian
1073,aeg,44A,1,Egyptian Colloquial Arabic,Afro-Asiatic
1501,ain,44A,6,Ainu,
1845,ala,44A,3,Alamblak,Sepik


44A:
- Value 1 = In 3rd person + 1st and/or 2nd person
- Value 2 = 3rd person only, but also non-singular
- Value 3 = 3rd person singular only
- Value 4 = 1st or 2nd person but not 3rd
- Value 5 = 3rd person non-singular only
- Value 6 = No gender distinctions

In [17]:
A44.Value = A44.Value.map({1:'In 3rd person + 1st and/or 2nd person', 2:'3rd person only, but also non-singular', 3:'3rd person singular only', 4:'1st or 2nd person but not 3rd', 5:'3rd person non-singular only', 6: 'No gender distinctions'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A44.Value = A44.Value.map({1:'In 3rd person + 1st and/or 2nd person', 2:'3rd person only, but also non-singular', 3:'3rd person singular only', 4:'1st or 2nd person but not 3rd', 5:'3rd person non-singular only', 6: 'No gender distinctions'})


In [18]:
A44.Value.value_counts()

Value
No gender distinctions                    254
3rd person singular only                   61
3rd person only, but also non-singular     42
In 3rd person + 1st and/or 2nd person      18
1st or 2nd person but not 3rd               2
3rd person non-singular only                1
Name: count, dtype: int64

In [19]:
A44 = A44.dropna(axis=0)

In [20]:
A44.info()

<class 'pandas.core.frame.DataFrame'>
Index: 338 entries, 105 to 76235
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Language_ID   338 non-null    object
 1   Parameter_ID  338 non-null    object
 2   Value         338 non-null    object
 3   Name          338 non-null    object
 4   Family        338 non-null    object
dtypes: object(5)
memory usage: 15.8+ KB


In [21]:
import os
A44.to_csv('../Processed/44A.csv', index = False)