In [1]:
import os, sys
import pandas as pd
import numpy as np
import re

# Raise the display limits so wide/long frames are shown (up to 999 rows/columns).
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)
# Make the project's ../src folder importable (needed by the `fixtures` import below).
sys.path.append('../src')

from fixtures import df_to_json_list, write_fixture_to_json

In [2]:
def load_sheet_from_xl(fname, sheet_name):
    """
    Load one sheet of an Excel workbook stored under ``../data/raw``.

    Parameters
    ----------
    fname : str
        File name of the workbook; it is joined onto ``../data/raw``.
    sheet_name : str
        Sheet to read; passed unchanged to ``pd.ExcelFile.parse``.

    Returns
    -------
    pandas.DataFrame
        The parsed sheet, exactly as returned by ``ExcelFile.parse``.
    """
    fpath = os.path.join('..', 'data', 'raw', fname)

    # Use the workbook as a context manager so its file handle is closed
    # even when parsing raises (the handle was previously left open).
    with pd.ExcelFile(fpath) as xl:
        return xl.parse(sheet_name)

In [21]:
def clean_df_resources_languages(df):
    """
    Clean the ``languages`` sheet of the ``resources`` workbook.

    Keeps only the columns needed for the fixture, casts them, moves the
    ``index`` column into the frame index and renames the remaining columns.
    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``index``, ``language``, ``index_language``
        and ``equivalentClasses``; any other columns are dropped.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``index`` with columns ``name``, ``parent_language``
        (nullable ``Int64``) and ``equivalentClasses`` (cast with ``str``).
    """
    # language hyperlinks example url https://bioportal.bioontology.org/ontologies/SNOMEDCT/?p=classes&conceptid=http%3A%2F%2Fpurl.bioontology.org%2Fontology%2FSNOMEDCT%2F297301005
    wanted_columns = ['index', 'language', 'index_language', 'equivalentClasses']

    column_dtypes = {
        # nullable integer dtype, so rows without a parent can stay missing
        'index_language': pd.Int64Dtype(),
        # NOTE(review): casting with str also stringifies missing cells
        # (NaN may end up as the text 'nan') - confirm this is intended.
        'equivalentClasses': str,
    }

    # renamed to avoid naming clashes in django
    django_names = {
        'language': 'name',
        'index_language': 'parent_language',
    }

    return (
        df.loc[:, wanted_columns]
        .astype(column_dtypes)
        .set_index('index')
        .rename(columns=django_names)
    )

In [4]:

def cleaning_func_factory(fname, sheet_name):
    """
    Return the cleaning function registered for a workbook/sheet pair.

    The lookup key is ``"<file stem>_<sheet_name>"``, where the file stem is
    the part of the file's base name before its first dot.

    Parameters
    ----------
    fname : str
        Workbook file name, optionally with a directory prefix.
    sheet_name : str
        Name of the sheet inside the workbook.

    Returns
    -------
    callable
        The registered cleaning function for that key.

    Raises
    ------
    KeyError
        If no cleaning function is registered for the resulting key.
    """
    # Strip any directory part first: splitting the raw path on '.' breaks
    # for values such as '../data/resources.xlsx' (the stem would be '').
    file_stem = os.path.basename(fname).split('.')[0]
    file_sheet = file_stem + "_" + sheet_name

    cleaning_funcs = {
        "resources_languages": clean_df_resources_languages,
    }

    try:
        return cleaning_funcs[file_sheet]
    except KeyError:
        # Same exception type as a plain dict lookup, but with a message
        # that says what was looked up and which keys exist.
        raise KeyError(
            "no cleaning function registered for '{}' (known: {})".format(
                file_sheet, sorted(cleaning_funcs)
            )
        ) from None

In [22]:
# Workbook (looked up under ../data/raw by load_sheet_from_xl) and the sheet to convert.
sheet_name = 'languages'
fname = 'resources.xlsx'
# Django app and model names, passed to df_to_json_list in a later cell.
model_name = 'Language'
app_name = 'resources'

# Read the raw sheet into a DataFrame.
df = load_sheet_from_xl(fname, sheet_name)
# Look up the cleaning function from the file and sheet name, then apply it.
# `df` is rebound to the cleaned frame; the raw frame is not kept.
df = cleaning_func_factory(fname, sheet_name)(df)

In [23]:
# Build the fixture records from the cleaned frame.
# NOTE(review): `fname` is rebound here - it no longer holds the Excel file
# name ('resources.xlsx') but the first value returned by df_to_json_list,
# presumably the fixture file name; confirm against src/fixtures.
fname, fixture_lst = df_to_json_list(df,
                                    app_name,
                                    model_name,
                                    file_name_modifier='',
                                    use_df_index_as_pk=True,
                                    create_datetimefield_name=None,
                                    created_by_field_name=None)

In [24]:
# Write the fixture list to disk under the name returned by df_to_json_list.
# NOTE(review): the meaning of output_folder='default' is defined in
# src/fixtures and is not visible here - confirm the target folder.
write_fixture_to_json(fixture_lst, fname, output_folder='default')

In [25]:
# Preview the first rows of the cleaned frame as a sanity check.
df.head()

Unnamed: 0_level_0,name,parent_language,equivalentClasses
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100,Afro-Asiatic language,,SNOMEDCT:297290004
200,Berber language,100.0,SNOMEDCT:297291000
300,Kabyle language,200.0,SNOMEDCT:297292007
400,Riffian language,200.0,SNOMEDCT:297293002
500,Shawia language,200.0,SNOMEDCT:297294008


In [20]:
# NOTE(review): this looks like a leftover scratch cell. Its execution count
# (In [20]) is older than the cells above, the result is not assigned, and
# clean_df_resources_languages already applies the same rename (rename()
# ignores keys that are absent by default). Consider removing it.
df.rename(columns={'language':'name','index_language':'parent_language'})

Unnamed: 0_level_0,name,parent_language,equivalentClasses
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100,Afro-Asiatic language,,SNOMEDCT:297290004
200,Berber language,100.0,SNOMEDCT:297291000
300,Kabyle language,200.0,SNOMEDCT:297292007
400,Riffian language,200.0,SNOMEDCT:297293002
500,Shawia language,200.0,SNOMEDCT:297294008
600,Shluh language,200.0,SNOMEDCT:297295009
700,Tamazight language,200.0,SNOMEDCT:297296005
800,Tuareg language,200.0,SNOMEDCT:297297001
900,Chadic language,100.0,SNOMEDCT:297298006
1000,Hausa language,900.0,SNOMEDCT:297299003
