In [1]:
import preprocessing.utils.preprocessing_utils

# Preprocessing basics
1. Enter filename
2. Import file
3. Convert data to tensor

In [2]:
# A correct example: import the sample policing workbook and build a tensor from it.
filename = "data/sample_policing_input.xlsx"
# import_excel returns two values; id_to_covariates is not used by the cells visible here.
df, id_to_covariates = preprocessing.utils.preprocessing_utils.import_excel(filename)
tensor = preprocessing.utils.preprocessing_utils.convert_to_tensor(df) # Tensor instance used by the cells below

tensor is an instance of the Tensor class with
- data: (N × T × D × M) Numpy array
- method print_info: basic summary of data
- method print_table: table view of data

## Method print_info(verbose)
- verbose = False by default 
- prints information about number and list of units, timestamps, interventions and outcomes
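- for each outcome variable: if verbose = False, show statistics of the number of units with data available under each timestamp; if verbose = True, list those units for each timestamp. A unit counts as having data under a timestamp if it has a value under any intervention at that timestamp. The same summary is given per intervention, where a unit counts as having data under an intervention if it has a value under any timestamp for that intervention.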

In [3]:
# verbose defaults to False, so this prints the succinct summary:
# counts and lists for each dimension, then per-outcome statistics.
tensor.print_info()

                               Summary of Data                                               
No. units: 3    List of units: [0, 1, 2]
No. measurements: 6    List of measurements: [1, 2, 5, 0, 3, 4]
No. interventions: 4    List of interventions: [1, 2, 5, 0]
No. outcomes: 2    List of outcomes: [force, complaint]
                           Units under Measurements                                                   
------------------------------------------------------------------------------
                                    force                                          
------------------------------------------------------------------------------
Statistics of number of units under a measurement:
Max: 3 units    Median: 1 units    Min: 1 units    Mean: 1.67 units
------------------------------------------------------------------------------
                                  complaint                                            
------------------------------------------------------

## Method print_table(x_axis, y_axis, constant, entry, show_data):
- show_data is set to be False by default
- entry needs to be an outcome variable in data
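- x_axis and y_axis must each be one of unit/intervention/timestamp, and must differ from each other (the examples below pass the time dimension as "measurement")
- constant must be a value from the remaining dimension, i.e. whichever of unit/intervention/timestamp is used for neither x_axis nor y_axis (e.g. constant = 2 with unit and intervention axes selects measurement 2)
- rows and columns with the largest numbers of values are shown toward the upper-left corner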

In [4]:
# Availability view (show_data defaults to False): '*' marks a cell that has data.
# constant = 2 fixes the measurement, the dimension used for neither axis.
tensor.print_table(x_axis = 'unit', y_axis = 'intervention', constant = 2, entry = 'force')

Under Outcome force and Measurement 2 (X axis: unit, Y axis: intervention)
+---+---+---+---+
|   | 1 | 0 | 2 |
+---+---+---+---+
| 1 | * | * | * |
| 2 | * | * |   |
| 0 | * |   | * |
| 5 |   |   |   |
+---+---+---+---+


In [5]:
# Measurement 2, outcome 'force': show_data=True prints the stored values instead of '*'.
tensor.print_table(x_axis = 'unit', y_axis = 'intervention', constant = 2, entry = 'force', show_data=True)

Under Outcome force and Measurement 2 (X axis: unit, Y axis: intervention)
+---+------+------+------+
|   |  1   |  0   |  2   |
+---+------+------+------+
| 1 | 33.0 | 15.0 | 51.0 |
| 2 | 35.0 | 17.0 |      |
| 0 | 31.0 |      | 49.0 |
| 5 |      |      |      |
+---+------+------+------+


In [6]:
# Measurement 0 is sparse: only the (unit 1, intervention 2) cell has data.
tensor.print_table(x_axis = 'unit', y_axis = 'intervention', constant = 0, entry = 'force')

Under Outcome force and Measurement 0 (X axis: unit, Y axis: intervention)
+---+---+---+---+
|   | 1 | 0 | 2 |
+---+---+---+---+
| 2 | * |   |   |
| 1 |   |   |   |
| 5 |   |   |   |
| 0 |   |   |   |
+---+---+---+---+


In [7]:
# Measurement 0 with show_data = True: prints the single stored value for outcome 'force'.
tensor.print_table(x_axis = 'unit', y_axis = 'intervention', constant = 0, entry = 'force', show_data = True)

Under Outcome force and Measurement 0 (X axis: unit, Y axis: intervention)
+---+------+---+---+
|   |  1   | 0 | 2 |
+---+------+---+---+
| 2 | 23.0 |   |   |
| 1 |      |   |   |
| 5 |      |   |   |
| 0 |      |   |   |
+---+------+---+---+


In [8]:
# Another correct example.
# NOTE: this rebinds filename, df, id_to_covariates and tensor, so every cell
# below operates on the data/sample_input.xlsx data, not the policing sample.
filename = "data/sample_input.xlsx"
df, id_to_covariates = preprocessing.utils.preprocessing_utils.import_excel(filename)
tensor = preprocessing.utils.preprocessing_utils.convert_to_tensor(df)

In [10]:
# verbose is left at its default (False), so only the succinct summary is printed.
tensor.print_info()

                               Summary of Data                                               
No. units: 6    List of units: [1, 2, 3, 4, 5, a]
No. measurements: 5    List of measurements: [0, 1, 2, 3, 4]
No. interventions: 5    List of interventions: [0, 2, 3, 1, 4]
No. outcomes: 2    List of outcomes: [outcome_1, outcome_2]
                           Units under Measurements                                                   
------------------------------------------------------------------------------
                                  outcome_1                                            
------------------------------------------------------------------------------
Statistics of number of units under a measurement:
Max: 6 units    Median: 6 units    Min: 6 units    Mean: 6.00 units
------------------------------------------------------------------------------
                                  outcome_2                                            
-------------------------------------

In [11]:
# Set verbose = True to see more detailed information:
# a per-measurement listing replaces the max/median/min/mean statistics.
tensor.print_info(verbose = True)

                               Summary of Data                                               
No. units: 6    List of units: [1, 2, 3, 4, 5, a]
No. measurements: 5    List of measurements: [0, 1, 2, 3, 4]
No. interventions: 5    List of interventions: [0, 2, 3, 1, 4]
No. outcomes: 2    List of outcomes: [outcome_1, outcome_2]
                           Units under Measurements                                                   
------------------------------------------------------------------------------
                                  outcome_1                                            
------------------------------------------------------------------------------
Measurement 0: 6 measurements    List of measurements: [1, 2, 3, 4, 5, a]
Measurement 1: 6 measurements    List of measurements: [1, 2, 3, 4, 5, a]
Measurement 2: 6 measurements    List of measurements: [1, 2, 3, 4, 5, a]
Measurement 3: 6 measurements    List of measurements: [1, 2, 3, 4, 5, a]
Measurement 4: 6 measuremen

In [12]:
# Positional form: x_axis, y_axis, constant, entry.
# With unit and measurement as axes, constant = 0 selects intervention 0.
tensor.print_table("unit", "measurement", 0, 'outcome_1')

Under Outcome outcome_1 and Intervention 0 (X axis: unit, Y axis: measurement)
+---+---+---+---+---+---+---+
|   | 1 | 2 | 3 | 4 | a | 5 |
+---+---+---+---+---+---+---+
| 0 | * | * | * | * | * |   |
| 1 | * | * | * | * | * |   |
| 2 | * | * | * | * | * |   |
| 3 |   |   |   |   |   |   |
| 4 |   |   |   |   |   |   |
+---+---+---+---+---+---+---+


In [13]:
# With measurement and intervention as axes, constant = 'a' selects unit a.
tensor.print_table("measurement", "intervention", 'a', 'outcome_1')

Under Outcome outcome_1 and Unit a (X axis: measurement, Y axis: intervention)
+---+---+---+---+---+---+
|   | 0 | 1 | 2 | 3 | 4 |
+---+---+---+---+---+---+
| 0 | * | * | * |   |   |
| 1 |   |   |   | * | * |
| 2 |   |   |   |   |   |
| 3 |   |   |   |   |   |
| 4 |   |   |   |   |   |
+---+---+---+---+---+---+


# Error handling

## 1. if filename is incorrect/file does not exist
show error message "data/sample_policing_inputsss.xlsx file not found. Please check and try again."

In [14]:
# File does not exist: import_excel raises ValueError. The error is the point of
# this demo, so it is caught and printed rather than left uncaught; an uncaught
# exception would stop "Restart Kernel -> Run All" at this cell.
filename = "data/sample_policing_inputsss.xlsx"
try:
    df, id_to_covariates = preprocessing.utils.preprocessing_utils.import_excel(filename)
except ValueError as err:
    # Same text as the uncaught traceback's final line.
    print("ValueError:", err)

ValueError: data/sample_policing_inputsss.xlsx file not found. Please check and try again.

## 2. if data worksheet is missing or named incorrectly
show error message "Worksheet data not found in file. Please refer to https://github.com/AnonTendim/si-library#readme."

In [15]:
# Data worksheet not in file: import_excel raises ValueError. The error is the
# point of this demo, so it is caught and printed rather than left uncaught; an
# uncaught exception would stop "Restart Kernel -> Run All" at this cell.
filename = "data/sample_no_data.xlsx"
try:
    df, id_to_covariates = preprocessing.utils.preprocessing_utils.import_excel(filename)
except ValueError as err:
    # Same text as the uncaught traceback's final line.
    print("ValueError:", err)

ValueError: Worksheet data not found in file. Please refer to https://github.com/AnonTendim/si-library#readme.

## 3. if a column X is missing or named incorrectly
show error message "Missing or incorrect naming of X column in Data worksheet. Please refer to https://github.mit.edu/xyhan/si-library/blob/master/README.md#preprocessing."

In [9]:
# Time column named incorrectly: import_excel raises ValueError. The error is the
# point of this demo, so it is caught and printed rather than left uncaught; an
# uncaught exception would stop "Restart Kernel -> Run All" at this cell.
filename = "data/sample_incorrect_naming.xlsx"
try:
    df, id_to_covariates = preprocessing.utils.preprocessing_utils.import_excel(filename)
except ValueError as err:
    # Same text as the uncaught traceback's final line.
    print("ValueError:", err)

ValueError: Missing or incorrect naming of time column in Data worksheet. Please refer to https://github.mit.edu/xyhan/si-library/blob/master/README.md#preprocessing.

## 4. if non-numeric values found in outcome columns
show error message "Non-numeric data found in column force of Data. Please refer to https://github.mit.edu/xyhan/si-library/blob/master/README.md#preprocessing."

Note that unit/time/intervention columns can hold integer/string/float data, but outcome columns can only hold numeric data.

In [10]:
# String data in an outcome column: import_excel raises ValueError. The error is
# the point of this demo, so it is caught and printed rather than left uncaught;
# an uncaught exception would stop "Restart Kernel -> Run All" at this cell.
filename = "data/sample_string_outcome.xlsx"
try:
    df, id_to_covariates = preprocessing.utils.preprocessing_utils.import_excel(filename)
except ValueError as err:
    # Same text as the uncaught traceback's final line.
    print("ValueError:", err)

ValueError: Non-numeric data found in column force of Data. Please refer to https://github.mit.edu/xyhan/si-library/blob/master/README.md#preprocessing.