In [6]:
import os, sys
import pandas as pd
import numpy as np
import re

# Raise pandas' display limits so long/wide frames are not truncated when inspected.
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)
# Put ../src on the import path; the `fixtures` import that follows presumably lives there.
sys.path.append('../src')

from fixtures import df_to_json_list, write_fixture_to_json

In [7]:
def load_sheet_from_xl(fname, sheet_name):
    """
    Load one sheet of an Excel workbook stored under ../data/raw.

    Parameters
    ----------
    fname : str
        File name of the workbook, relative to the ../data/raw folder.
    sheet_name
        Sheet to read; passed unchanged to ``pd.ExcelFile.parse``.

    Returns
    -------
    Whatever ``pd.ExcelFile.parse`` returns for ``sheet_name``
    (a DataFrame when a single sheet is requested).
    """
    fpath = os.path.join('..', 'data', 'raw', fname)

    # Use the workbook as a context manager so its file handle is closed
    # as soon as the sheet has been read, even if parsing raises.
    with pd.ExcelFile(fpath) as xl:
        return xl.parse(sheet_name)

In [8]:
def clean_df_resources_languages(df):
    """
    Prepare the raw ``languages`` sheet for export as a fixture.

    Steps:
      * keep only ``index``, ``language``, ``index_language`` and
        ``equivalentClasses``
      * cast ``index_language`` to the nullable ``Int64`` dtype so the column
        can hold missing values alongside integers
      * cast ``equivalentClasses`` to ``str``
      * use ``index`` as the frame's index
      * rename ``language`` -> ``name`` and ``index_language`` ->
        ``parent_language`` (to avoid naming clashes in django)

    Returns a new DataFrame; the frame passed in is left untouched.

    NOTE(review): on older pandas ``astype(str)`` turns missing values into
    the text ``'nan'`` rather than keeping them missing -- confirm that is
    what the fixture should contain.
    """
    # language hyperlinks example url:
    # https://bioportal.bioontology.org/ontologies/SNOMEDCT/?p=classes&conceptid=http%3A%2F%2Fpurl.bioontology.org%2Fontology%2FSNOMEDCT%2F297301005
    wanted_cols = ['index', 'language', 'index_language', 'equivalentClasses']

    target_dtypes = {
        'index_language': pd.Int64Dtype(),  # nullable integer, tolerates NaNs
        'equivalentClasses': str,
    }

    new_names = {
        'language': 'name',
        'index_language': 'parent_language',
    }

    return (
        df.loc[:, wanted_cols]
        .astype(target_dtypes)
        .set_index('index')
        .rename(columns=new_names)
    )

In [9]:
def clean_df_disorders_disorder_categories(df):
    """
    Prepare the raw ``disorder_categories`` sheet for export as a fixture.

    Keeps only the ``index`` and ``disorder_category`` columns, drops every
    row that has a missing value in either of them, uses ``index`` as the
    frame's index and renames ``disorder_category`` to ``name`` (to avoid
    naming clashes in django).

    Returns a new DataFrame; the frame passed in is left untouched.
    """
    wanted_cols = ['index', 'disorder_category']

    return (
        df.loc[:, wanted_cols]
        .dropna()  # rows with a NaN in a kept column are discarded
        .set_index('index')
        .rename(columns={'disorder_category': 'name'})
    )
    

In [10]:
def cleaning_func_factory(fname, sheet_name):
    """
    Return the DataFrame cleaning function registered for a file/sheet pair.

    Functions are looked up in a dictionary keyed ``FileName__SheetName``,
    where ``FileName`` is the part of ``fname`` before its first dot
    (``'resources.xlsx'`` + ``'languages'`` -> ``'resources__languages'``).

    Parameters
    ----------
    fname : str
        Workbook file name, e.g. ``'resources.xlsx'``.
    sheet_name : str
        Name of the sheet inside that workbook.

    Returns
    -------
    callable
        The cleaning function registered under the derived key.

    Raises
    ------
    KeyError
        If nothing is registered for the derived key; the message lists the
        keys that are registered.
    """
    cleaning_funcs = {
        "resources__languages": clean_df_resources_languages,
        "disorders__disorder_categories": clean_df_disorders_disorder_categories,
    }

    file_sheet = fname.split('.')[0] + "__" + sheet_name

    try:
        return cleaning_funcs[file_sheet]
    except KeyError:
        # Same exception type as a plain dict lookup, but say what was
        # looked up and which keys would have worked.
        raise KeyError(
            "no cleaning function registered for '{}'; known keys: {}".format(
                file_sheet, sorted(cleaning_funcs)
            )
        ) from None

In [13]:
def process_one_sheet(fname, sheet_name, app_name, model_name):
    """
    Load one sheet, clean it and write it out as a fixture file.

    The sheet is read with ``load_sheet_from_xl``, cleaned with the function
    that ``cleaning_func_factory`` returns for this file/sheet, converted
    with ``df_to_json_list`` (the frame's index is used as the primary key)
    and finally written with ``write_fixture_to_json``.

    Parameters
    ----------
    fname : str
        Workbook file name.
    sheet_name : str
        Sheet to process.
    app_name, model_name : str
        Passed on to ``df_to_json_list``.

    Returns
    -------
    None

    NOTE(review): the calling cell catches ``FileExistsError`` around this
    function, so ``write_fixture_to_json`` presumably raises it when the
    output file already exists -- confirm in ``fixtures``.
    """
    raw_df = load_sheet_from_xl(fname, sheet_name)

    # look up the cleaning function based on file and sheet name
    clean = cleaning_func_factory(fname, sheet_name)
    cleaned_df = clean(raw_df)

    # df_to_json_list hands back the fixture's file name plus its records
    fixture_fname, fixture_lst = df_to_json_list(
        cleaned_df,
        app_name,
        model_name,
        file_name_modifier='',
        use_df_index_as_pk=True,
        create_datetimefield_name=None,
        created_by_field_name=None,
    )

    write_fixture_to_json(fixture_lst, fixture_fname, output_folder='default')

In [12]:
# Settings for a single sheet. The cells visible in this notebook only read
# `inputs_lst` below; these names are kept in case something else uses them.
sheet_name = 'languages'
fname = 'resources.xlsx'
# variables for Django schema
model_name = 'Language'
app_name = 'resources'

# One entry per sheet to convert: where to read it from (fname / sheet_name)
# and which app / model the resulting fixture is for.
inputs_lst = [
    dict(
        sheet_name='languages',
        fname='resources.xlsx',
        model_name='Language',
        app_name='resources',
    ),
    dict(
        sheet_name='disorder_categories',
        fname='disorders.xlsx',
        model_name='DisorderCategory',
        app_name='disorders',
    ),
]

In [15]:
# Convert every configured sheet into a fixture file.
for d in inputs_lst:
    try:
        process_one_sheet(
            fname=d['fname'],
            sheet_name=d['sheet_name'],
            app_name=d['app_name'],
            model_name=d['model_name'],
        )
    except FileExistsError as err:
        # An existing fixture is reported and skipped on purpose, so one
        # already-written file does not stop the remaining sheets.
        print(err)

did not save, file already exists at: ../data/processed/fixtures/Language.json
