The first order of business is to match the columns of a given `csv` file to the column names stored in a data dictionary.

In [1]:
import pandas as pd

In [2]:
import os

In [3]:
import json

In [4]:
import numpy as np

In [27]:
from pandas.core.arrays.floating import Float64Dtype
from pandas.core.arrays.integer import Int64Dtype
from pandas.core.arrays.string_ import StringDtype

In [5]:
# Show the current working directory: the relative './...' paths used below to
# load the CSV and the data dictionary are resolved against it.
os.getcwd()

'/home/trainingmontage/dev/projects/barrios'

In [13]:
# Name of the data-dictionary entry whose columns match the CSV. It stays None
# until the matching loop further down finds a table with identical columns.
table_name = None

In [6]:
# Load the CSV under inspection; the path is relative to the working directory.
df = pd.read_csv('./iss-data/csv/stored_items_only_inventory_mgmt_system_consumables_20220101-20230905.csv')

In [7]:
# CSV column names as a NumPy array, so they can be compared against the data
# dictionary entries with np.array_equal.
dfcols = df.columns.to_numpy()

In [8]:
# Load the data dictionary. The matching loop below treats it as a mapping of
# table name -> column names (assumed to be a list per table; confirm against
# the JSON file).
# The encoding is given explicitly because JSON text is UTF-8, whereas open()
# would otherwise fall back to the platform's locale encoding.
with open('./src/lib/data_dictionary.json', encoding='utf-8') as json_file:
    data_dictionary = json.load(json_file)

In [14]:
# Find the data-dictionary table whose column names are identical to the CSV
# header. np.array_equal is True only when both arrays have the same shape and
# the same elements position by position, so column order matters.
# There is no early exit: if several tables match, the last one wins, and if
# none match, table_name keeps its previous value.
for key in data_dictionary:
    np_arr = np.asarray(data_dictionary[key])
    matches = np.array_equal(np_arr, dfcols)
    if matches:
        print(f'CSV file matches {key}')
        table_name = key

CSV file matches inventory_mgmt_system_consumables


In [15]:
table_name

'inventory_mgmt_system_consumables'

Okay, we've successfully matched the column names. Now we need to find a way to figure out the type of each column. I'm guessing there's a good way to get that information from pandas.

In [16]:
df.dtypes

datedim                      object
id                            int64
id_parent                     int64
id_path                      object
tree_depth                    int64
tree                         object
part_number                  object
serial_number                object
location_name                object
original_ip_owner            object
current_ip_owner             object
operational_nomenclature     object
russian_name                 object
english_name                 object
barcode                      object
quantity                      int64
width                       float64
height                      float64
length                      float64
diameter                    float64
calculated_volume           float64
stwg_ovrrd_vol              float64
children_volume             float64
stwg_ovrrd_chldrn_vol       float64
ovrrd_notes                 float64
volume_notes                 object
expire_date                  object
launch                      

Welp, that was simple enough. Now let's try to convert from pandas `dtype`s to regular Python types.

In [20]:
# convert_dtypes() returns a new frame whose columns use pandas' nullable
# extension dtypes (string, Int64, Float64, ...) instead of object/int64/float64.
dfn = df.convert_dtypes()
# One dtype object per column, in column order.
type_list = dfn.dtypes.to_list()

In [21]:
type_list

[string[python],
 Int64Dtype(),
 Int64Dtype(),
 string[python],
 Int64Dtype(),
 string[python],
 string[python],
 string[python],
 string[python],
 string[python],
 string[python],
 string[python],
 string[python],
 string[python],
 string[python],
 Int64Dtype(),
 Float64Dtype(),
 Float64Dtype(),
 Float64Dtype(),
 Float64Dtype(),
 Float64Dtype(),
 Int64Dtype(),
 Float64Dtype(),
 Int64Dtype(),
 Int64Dtype(),
 string[python],
 string[python],
 string[python],
 string[python],
 string[python],
 string[python],
 string[python],
 Int64Dtype(),
 Int64Dtype(),
 string[python],
 string[python],
 string[python],
 string[python],
 string[python],
 Int64Dtype(),
 string[python]]

In [22]:
# Print the concrete dtype class behind each column's dtype object, one per
# line, to see which classes the type mapping has to recognise.
for entry in type_list:
    print(entry.__class__)

<class 'pandas.core.arrays.string_.StringDtype'>
<class 'pandas.core.arrays.integer.Int64Dtype'>
<class 'pandas.core.arrays.integer.Int64Dtype'>
<class 'pandas.core.arrays.string_.StringDtype'>
<class 'pandas.core.arrays.integer.Int64Dtype'>
<class 'pandas.core.arrays.string_.StringDtype'>
<class 'pandas.core.arrays.string_.StringDtype'>
<class 'pandas.core.arrays.string_.StringDtype'>
<class 'pandas.core.arrays.string_.StringDtype'>
<class 'pandas.core.arrays.string_.StringDtype'>
<class 'pandas.core.arrays.string_.StringDtype'>
<class 'pandas.core.arrays.string_.StringDtype'>
<class 'pandas.core.arrays.string_.StringDtype'>
<class 'pandas.core.arrays.string_.StringDtype'>
<class 'pandas.core.arrays.string_.StringDtype'>
<class 'pandas.core.arrays.integer.Int64Dtype'>
<class 'pandas.core.arrays.floating.Float64Dtype'>
<class 'pandas.core.arrays.floating.Float64Dtype'>
<class 'pandas.core.arrays.floating.Float64Dtype'>
<class 'pandas.core.arrays.floating.Float64Dtype'>
<class 'pandas.c

In [34]:
# Translate every column dtype into a Python type name and a database type
# name. Both result lists are index-aligned with type_list (one entry per
# column, in column order).
#
# Each row is (pandas dtype class, Python type name, database type name).
# The first matching row wins.
dtype_name_table = (
    (StringDtype, 'string', 'text'),
    (Int64Dtype, 'int', 'int'),
    (Float64Dtype, 'float', 'float'),
)

python_type_strings = []
db_type_strings = []
for position, entry in enumerate(type_list):
    for dtype_class, python_name, db_name in dtype_name_table:
        if isinstance(entry, dtype_class):
            python_type_strings.append(python_name)
            db_type_strings.append(db_name)
            break
    else:
        # Silently skipping an unmapped dtype (e.g. boolean) would leave both
        # lists shorter than the column list and shift every later entry onto
        # the wrong column, so fail loudly instead.
        raise TypeError(
            f'No type mapping for dtype {entry!r} at column position {position}'
        )

In [35]:
python_type_strings

['string',
 'int',
 'int',
 'string',
 'int',
 'string',
 'string',
 'string',
 'string',
 'string',
 'string',
 'string',
 'string',
 'string',
 'string',
 'int',
 'float',
 'float',
 'float',
 'float',
 'float',
 'int',
 'float',
 'int',
 'int',
 'string',
 'string',
 'string',
 'string',
 'string',
 'string',
 'string',
 'int',
 'int',
 'string',
 'string',
 'string',
 'string',
 'string',
 'int',
 'string']

In [36]:
db_type_strings

['text',
 'int',
 'int',
 'text',
 'int',
 'text',
 'text',
 'text',
 'text',
 'text',
 'text',
 'text',
 'text',
 'text',
 'text',
 'int',
 'float',
 'float',
 'float',
 'float',
 'float',
 'int',
 'float',
 'int',
 'int',
 'text',
 'text',
 'text',
 'text',
 'text',
 'text',
 'text',
 'int',
 'int',
 'text',
 'text',
 'text',
 'text',
 'text',
 'int',
 'text']