# Create a Phovea Import Configuration

*Author: Alexander Lex; alex@sci.utah.edu*

This script creates an index.json file containing configurations to load/parse all the CSV files stored in the path specified below.


In [None]:
import pandas as pd
import numpy as np
import json
import math
from os import listdir
from os.path import isfile, join

## Importing files as DFs

In [None]:
# ---------------------------------------------------------------------------
# Defaults. The branch for the selected case below overrides what it needs.
# ---------------------------------------------------------------------------

# Directory that contains the CSV files.
path = ""
# File names (strings) of the CSV files inside `path`.
files = []
# Column headers that must be treated as IDs; they cannot be told apart from
# ordinary integer columns automatically.
idtypes = []
# Field delimiter used in the files.
delimiter = ","
# Manually specified numeric ranges: "column header" -> [from, to].
man_range = {}
# Manually specified ranges for every column whose header contains the key:
# wildcard string -> [from, to].
wildcard_man_range = {}

# The data set to configure: "suicide", "artists", "companies" or "AIDS".
case = "suicide"

if case == "suicide":
    path = "data/suicide/"
    files = [
        "AllFamiliesAttributes.csv",
        "AllFamiliesDescend.csv",
        "TenFamiliesAttr.csv",
        "TenFamiliesAttrAnon.csv",
        "TenFamiliesDescend.csv",
        "TenFamiliesDescendAnon.csv",
        "AllAutismFamiliesDescend.csv",
        "AllAutismFamiliesAttributes.csv",
    ]
    idtypes = {
        "ID",
        "STATENUM",
        "KindredID",
        "RelativeID",
        "LabID",
        "MaID",
        "PaID",
        "OMEDID",
        "ArchivePersonID",
    }
    man_range.update({"FirstBMI": [15, 45], "MaxBMI": [15, 45]})
    wildcard_man_range.update({"Nr.Diag_": [0, 10]})
elif case == "artists":
    path = "data/artists/"
    files = ["number_one_artists.csv"]
    delimiter = "\t"
elif case == "companies":
    path = "data/companies/"
    files = ["Company Data  - Company Core Infor.csv", "Company Data  - Links.csv"]
elif case == "AIDS":
    path = "data/AIDS/"
    files = ["AIDS_Countries.csv", "AIDS_Years.csv"]

print(files)

In [None]:
# Load every configured CSV into a DataFrame, keyed by its file name, and
# report the dtypes pandas inferred so mis-detected columns are easy to spot.
file_map = {}
for file in files:
    frame = pd.read_csv(join(path, file), delimiter=delimiter)
    file_map[file] = frame
    print(file)
    print(frame.dtypes)

## Manual settings

There are some things that the script can't guess: 

In [None]:
# Cut-off between "categorical" and "string" columns: a column that is neither
# an ID nor numeric and has more than this many unique values is typed as free
# text ("string") instead of getting an explicit list of categories.
categorical_label_threshold = 10

## Writing Configuration

In [None]:
def _is_integral(column):
    """Return True when every non-missing value of *column* is a whole number."""
    values = column.dropna().to_numpy()
    return bool(np.all(values == np.round(values)))


def _numeric_range(column_name, column):
    """Return the ``[from, to]`` range of a numeric column.

    Precedence: an exact entry in ``man_range``, then the first key of
    ``wildcard_man_range`` that occurs in the column name, then ``[0, max]``.
    """
    if column_name in man_range:
        # Copy so the emitted configuration never aliases the settings dict.
        return list(man_range[column_name])
    for wildcard, wildcard_range in wildcard_man_range.items():
        if wildcard in column_name:
            print("Tada", column_name)
            return list(wildcard_range)
    upper = column.max()
    # An all-missing column has no maximum; NaN would be written as invalid JSON.
    return [0, 0.0 if pd.isna(upper) else float(upper)]


def _category_descriptions(labels):
    """Return one ``{"name", "color"}`` entry per non-missing label."""
    return [
        {"name": str(label), "color": "red"}
        for label in labels
        if not pd.isna(label)
    ]


def createColumnConfiguration(df):
    """Describe every column of *df* except the first one.

    The first column is skipped here; the caller treats it as the row ID.
    Reads the module-level settings ``idtypes``, ``man_range``,
    ``wildcard_man_range`` and ``categorical_label_threshold``.

    Args:
        df: DataFrame loaded from one CSV file.

    Returns:
        A list of ``{"name": <header>, "value": {...}}`` dicts, where
        ``value["type"]`` is one of:

        * ``"idtype"``      - header listed in ``idtypes``;
        * ``"int"``/``"real"`` - int64/float64 column, with a ``"range"``;
        * ``"string"``      - more unique values than the threshold;
        * ``"categorical"`` - otherwise, with a ``"categories"`` list.
    """
    columns = []
    for column_name in df.columns.values[1:]:
        column = df[column_name]
        value = {}
        # IDTypes have to be listed manually.
        if column_name in idtypes:
            value["type"] = "idtype"
        # Because of missing values, pandas stores integer columns as floats,
        # so the dtype alone cannot distinguish "int" from "real".
        elif column.dtype == "float64" or column.dtype == "int64":
            # Inspect each value: checking only the column sum mislabels
            # columns whose fractional parts happen to add up to an integer.
            value["type"] = "int" if _is_integral(column) else "real"
            value["range"] = _numeric_range(column_name, column)
        else:
            labels = column.unique()
            # Too many distinct labels to be useful as categories.
            if len(labels) > categorical_label_threshold:
                value["type"] = "string"
            else:
                value["type"] = "categorical"
                value["categories"] = _category_descriptions(labels)
        columns.append({"name": column_name, "value": value})
    return columns
        
    #print(json.dumps(columns, separators=(',', ':')))
        
        

In [None]:
# Skeleton of a single dataset entry in index.json. createJsonForFile() parses
# a fresh copy of this JSON text for every file and overwrites "name", "id",
# "separator", "quotechar", "path", "idtype", "size" and "columns"; the other
# fields ("type", "description", "creator", "idcolumn") are kept as written here.
import_template = """
{"name": "",
  "type": "table",
  "id": "",
  "separator": ",",
  "quotechar": "",
  "description": "autogenerated",
  "creator": "autogenerated",
  "path": "",
  "idcolumn": 0,
  "idtype": "",
  "size": [],
  "columns": []
}"""


In [None]:
def createJsonForFile(filename):
    """Build the index.json entry for one loaded CSV file.

    Reads the DataFrame for *filename* from ``file_map`` and fills a fresh
    copy of ``import_template``.

    Args:
        filename: Key into ``file_map``; also stored as the entry's "path".

    Returns:
        A dict ready for JSON serialization. "id" and "name" are the file
        name without its extension, "idtype" is the header of the first
        column, and "size" is ``[rows, columns excluding the first]``.
    """
    # Local import: splitext is not among the file's top-level imports.
    from os.path import splitext

    df = file_map[filename]
    # Strip only the final extension. Splitting on the first dot would give
    # "data.v1.csv" and "data.v2.csv" the same id.
    name = splitext(filename)[0]

    json_config = json.loads(import_template)
    json_config["separator"] = delimiter
    json_config["quotechar"] = '"'
    json_config["columns"] = createColumnConfiguration(df)
    json_config["id"] = name
    json_config["path"] = filename
    json_config["name"] = name
    json_config["idtype"] = df.columns.values[0]
    # The first column is the row ID, so it does not count as a data column.
    json_config["size"] = [df.shape[0], df.shape[1] - 1]
    return json_config

In [None]:
# Build one configuration entry per loaded file, in loading order.
configs = [createJsonForFile(filename) for filename in file_map]

json_dump = json.dumps(configs, separators=(',', ':'), sort_keys=True, indent=2)
# join() puts index.json inside `path` even when `path` has no trailing slash;
# the context manager closes the file even if the write fails.
with open(join(path, "index.json"), 'w') as f:
    f.write(json_dump)
print(json_dump)

