# Create a Phovea Import Configuration

*Author: Alexander Lex; alex@sci.utah.edu*

This script creates an index.json file containing configurations to load/parse all the CSV files stored in the path specified below.


In [40]:
import pandas as pd
import numpy as np
import json
import math
from os import listdir
from os.path import isfile, join

## Importing files as DataFrames

In [41]:
# Directory that holds the input CSV files
path = "data/"

# Files to describe, and the delimiter used inside them
files = ["AIDS_Countries.csv", "AIDS_Years.csv"]
delimiter = ","

# Alternative: a single tab-separated file
#files = ["number_one_artists.csv"]
#delimiter = "\t"

# Alternative: automatically pick up every CSV file found in `path`
#files = [f for f in listdir(path) if (isfile(join(path, f)) and ".csv" in f)]

print(files)


['AIDS_Countries.csv', 'AIDS_Years.csv']


In [42]:
# Load every configured CSV into a DataFrame, keyed by its file name,
# and show the dtypes pandas inferred so they can be sanity-checked.
file_map = {}
for csv_name in files:
    frame = pd.read_csv(join(path, csv_name), delimiter=delimiter)
    file_map[csv_name] = frame
    print(csv_name)
    print(frame.dtypes)

AIDS_Countries.csv
Country                                           object
Sex before 15 (15-24, %, 2015)                   float64
Condom use at last sex (%, 2015)                 float64
N. ppl on ART (2015)                             float64
% ppl. on ART ( 2015)                            float64
Ppl knowing they have HIV (%, 2015)              float64
HIV prevention knowledge (age 15-24, %, 2015)    float64
Discriminatory attitude (%)                      float64
Discriminatory attitude scale                     object
Human devel. index                                object
Continent                                         object
HIV restrictions on entry, stay, or residence     object
Population (2017)                                  int64
Yearly change (%)                                float64
Net change                                         int64
Density (P/SqKm)                                   int64
Land Area (SqKm)                                   int64
Migrants (ne

## Manual settings

There are some things that the script can't guess: 

In [43]:
# Columns to declare as "idtype": their values look like plain ints,
# so they have to be listed by name
idtypes = {
    "ID",
    "STATENUM",
    "KindredID",
    "RelativeID",
    "LabID",
    "MaID",
    "PaID",
    "OMEDID",
    "ArchivePersonID",
}

# A column with more distinct labels than this is treated as text, not as a category
categorical_label_threshold = 10

# Ranges to use for specific columns instead of the computed one
man_range = {
    "FirstBMI": [15, 45],
    "MaxBMI": [15, 45],
}

## Writing Configuration

In [44]:
def _describeNumericColumn(column_name, column, manual_ranges):
    """Build the "value" entry (type and range) for an int64/float64 column."""
    present = column.dropna()
    value = {}
    # Because of missing values, pandas may store integers as floats, so the
    # dtype alone is not enough: the column is an "int" column only when every
    # non-missing value is integral. (Testing the column *sum* instead would
    # misclassify fractional columns whose sum is large or happens to be whole.)
    if bool((present % 1 == 0).all()):
        value["type"] = "int"
    else:
        value["type"] = "real"

    if column_name in manual_ranges:
        # Copy so the emitted configuration does not alias the settings dict
        value["range"] = list(manual_ranges[column_name])
    else:
        # The range starts at 0 and goes to the maximum; it is only extended
        # downwards when the column actually contains negative values.
        lower, upper = 0, 0.0
        if len(present) > 0:
            upper = float(present.max())
            smallest = float(present.min())
            if smallest < 0:
                lower = smallest
        value["range"] = [lower, upper]
    return value


def _describeCategories(column):
    """List one {"name", "color"} entry per distinct non-missing label."""
    categories = []
    for category in column.unique():
        # pd.isna covers NaN as well as None, neither of which is a label
        if pd.isna(category):
            continue
        # bool-dtype columns yield numpy booleans, which json cannot serialise
        if isinstance(category, (bool, np.bool_)):
            label = "True" if category else "False"
        else:
            label = category
        categories.append({"name": label, "color": "red"})
    return categories


def createColumnConfiguration(df, id_columns=None, manual_ranges=None, label_threshold=None):
    """Describe every column of ``df`` except the first one.

    The first column is skipped here; the surrounding notebook uses it as the
    ID column of the table.

    Args:
        df: DataFrame holding the parsed CSV file.
        id_columns: Column names to declare as "idtype". Defaults to the
            module-level ``idtypes``.
        manual_ranges: Mapping of column name to an explicit range that
            replaces the computed one. Defaults to the module-level
            ``man_range``.
        label_threshold: A non-numeric column with more distinct labels than
            this is declared "string" instead of "categorical". Defaults to
            the module-level ``categorical_label_threshold``.

    Returns:
        A list with one ``{"name": ..., "value": {...}}`` dict per column,
        in column order.
    """
    if id_columns is None:
        id_columns = idtypes
    if manual_ranges is None:
        manual_ranges = man_range
    if label_threshold is None:
        label_threshold = categorical_label_threshold

    columns = []
    for column_name in df.columns.values[1:]:
        column = df[column_name]
        # IDTypes have to be listed manually
        if column_name in id_columns:
            value = {"type": "idtype"}
        elif column.dtype == "float64" or column.dtype == "int64":
            value = _describeNumericColumn(column_name, column, manual_ranges)
        # Missing values are not labels, so they do not count towards the threshold
        elif column.nunique() > label_threshold:
            value = {"type": "string"}
        else:
            value = {"type": "categorical", "categories": _describeCategories(column)}
        columns.append({"name": column_name, "value": value})
    return columns
        
    #print(json.dumps(columns, separators=(',', ':')))
        
        

In [45]:
# JSON skeleton for one table entry of index.json.
# createJsonForFile parses it with json.loads and then overwrites "name",
# "id", "path", "separator", "quotechar", "columns", "idtype" and "size"
# for each file; "type", "description", "creator" and "idcolumn" keep the
# values given here.
import_template = """
{"name": "",
  "type": "table",
  "id": "",
  "separator": ",",
  "quotechar": "",
  "description": "autogenerated",
  "creator": "autogenerated",
  "path": "",
  "idcolumn": 0,
  "idtype": "",
  "size": [],
  "columns": []
}"""


In [46]:
def createJsonForFile(filename):
    """Build the import configuration for one loaded CSV file.

    Args:
        filename: Key of the file in ``file_map``; it is also written to the
            configuration as the file's "path".

    Returns:
        A dict parsed from ``import_template`` with the file-specific fields
        filled in: separator, quotechar, columns, id, path, name, idtype
        (label of the first column) and size ([rows, columns minus the first
        column]).

    Raises:
        KeyError: If ``filename`` has not been loaded into ``file_map``.
    """
    df = file_map[filename]
    # Strip only the final extension, so "survey.v2.csv" keeps the stem
    # "survey.v2" instead of being cut at the first dot
    name = filename.rsplit('.', 1)[0]
    row_count, column_count = df.shape

    json_config = json.loads(import_template)
    json_config["separator"] = delimiter
    json_config["quotechar"] = '"'
    json_config["columns"] = createColumnConfiguration(df)
    json_config["id"] = name
    json_config["path"] = filename
    json_config["name"] = name
    # str() keeps the entry JSON-serialisable even for non-string column labels
    json_config["idtype"] = str(df.columns[0])
    # The first column is the ID column and is not counted
    json_config["size"] = [row_count, column_count - 1]
    return json_config

In [47]:
# Build one import configuration per loaded file, in load order
configs = [createJsonForFile(filename) for filename in file_map]

json_dump = json.dumps(configs, separators=(',', ':'), sort_keys=True, indent=2)

# join() works whether or not `path` ends with a separator, and the context
# manager closes index.json even if the write fails
with open(join(path, "index.json"), 'w') as f:
    f.write(json_dump)
print(json_dump)



[
  {
    "columns":[
      {
        "name":"Decades",
        "value":{
          "categories":[
            {
              "color":"red",
              "name":"1990s"
            },
            {
              "color":"red",
              "name":"2000s"
            },
            {
              "color":"red",
              "name":"2010s"
            }
          ],
          "type":"categorical"
        }
      },
      {
        "name":"HAART availability",
        "value":{
          "categories":[
            {
              "color":"red",
              "name":"pre HAART period"
            },
            {
              "color":"red",
              "name":"A Decade of HAART"
            },
            {
              "color":"red",
              "name":"multiple medications available"
            }
          ],
          "type":"categorical"
        }
      }
    ],
    "creator":"autogenerated",
    "description":"autogenerated",
    "id":"AIDS_Years",
    "idcolumn":0,
    "i