# Import DROID File data into SQLite DB

In [5]:
from sqlalchemy import create_engine
from sqlalchemy import Table, Column, Integer, String, DateTime, Date, MetaData

In [6]:
engine = create_engine("sqlite:///test.db", echo=True)

In [7]:
metadata = MetaData()

In [8]:
droid_ids = Table('droid_ids', metadata,
                Column('id', Integer, primary_key=True),
                 Column('parent_id', Integer),
                 Column('uri', String),
                 Column('file_path', String),
                 Column('filename', String),
                 Column('id_method', String),
                 Column('status', String),
                 Column('size', Integer),
                 Column('type', String),
                 Column('file_extension', String),
                 Column('last_modified', DateTime),
                 Column('ext_mis_warning', String),
                 Column('hash', String),
                 Column('file_format_count', Integer),
                 Column('pronom_id', String),
                 Column('mime_type', String),
                 Column('file_format_name', String),
                 Column('file_format_version', String),
                 Column('project_name', String))

metadata.create_all(engine)

2021-06-23 09:43:40,758 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2021-06-23 09:43:40,759 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("droid_ids")
2021-06-23 09:43:40,759 INFO sqlalchemy.engine.Engine [raw sql] ()
2021-06-23 09:43:40,760 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("droid_ids")
2021-06-23 09:43:40,760 INFO sqlalchemy.engine.Engine [raw sql] ()
2021-06-23 09:43:40,761 INFO sqlalchemy.engine.Engine 
CREATE TABLE droid_ids (
	id INTEGER NOT NULL, 
	parent_id INTEGER, 
	uri VARCHAR, 
	file_path VARCHAR, 
	filename VARCHAR, 
	id_method VARCHAR, 
	status VARCHAR, 
	size INTEGER, 
	type VARCHAR, 
	file_extension VARCHAR, 
	last_modified DATETIME, 
	hash VARCHAR, 
	file_format_count INTEGER, 
	pronom_id VARCHAR, 
	mime_type VARCHAR, 
	file_format_name VARCHAR, 
	file_format_version VARCHAR, 
	project_name VARCHAR, 
	PRIMARY KEY (id)
)


2021-06-23 09:43:40,762 INFO sqlalchemy.engine.Engine [no key 0.00039s] ()
2021-06-23 09:43:40,901 INFO sqlalchemy.engin

In [9]:
import csv
import datetime

skyrates_droid_csv = 'Skyrates_DROID_Analysis_20191009.csv'
jukebox_droid_csv = 'Jukebox_DROID_Analysis_20191009.csv'
cert_droid_csv = 'CERT_DROID_Analysis_20191009.csv'
granny_droid_csv = 'ETC_Granny_DROID_Analysis_20191211.csv'

droid_headers = ["id","parent_id","uri",
                 "file_path","filename","id_method",
                 "status","size","type","file_extension",
                 "last_modified","ext_mis_warning",
                 "hash","file_format_count","pronom_id",
                 "mime_type","file_format_name","file_format_version"]
droid_int_headers = ["id", "parent_id", "size"]
droid_date_headers = ["last_modified"]

In [10]:
def map_droid_dict_values(row_dict):
    new_row_dict = {}
    for k in row_dict.keys():
        if k in droid_int_headers:
            new_row_dict[k] = int(row_dict[k]) if row_dict[k] else 0
        elif k in droid_date_headers:
            new_row_dict[k] = datetime.datetime.strptime(row_dict[k], '%Y-%m-%dT%H:%M:%S') if row_dict[k] else datetime.datetime.today()
        else:
            new_row_dict[k] = row_dict[k]
    return new_row_dict
                
def insert_dict_list(csv_file, project_name):
    insert_list = []
    with open(csv_file, 'r') as f:
        first_row = True
        dict_reader = csv.DictReader(f, fieldnames=droid_headers)
        for row in dict_reader:
            if not first_row:
                insert_list.append(map_droid_dict_values(row))
            first_row = False
    for x in insert_list:
        x['project_name'] = project_name
    return insert_list

def max_in_dict_list(ins_list):
    return max([ x['id'] for x in ins_list])

def modify_ids(ins_list, max_start=0):
    max_id = max_in_dict_list(ins_list) + max_start
    for d in ins_list:
        d['id'] = d['id'] + max_id
        d['parent_id'] = d['parent_id'] + max_id
    return ins_list

In [11]:

droid_ids_ins_list = []
ins = droid_ids.insert()

skyrates_dict_list = insert_dict_list(skyrates_droid_csv, 'skyrates')
                 

In [12]:
skyrates_dict_list[0]

{'id': 2,
 'parent_id': 0,
 'uri': 'file:/Users/ekaltman/Desktop/2006_semester_3/Skyrates/',
 'file_path': '/Users/ekaltman/Desktop/2006_semester_3/Skyrates',
 'filename': 'Skyrates',
 'id_method': '',
 'status': 'Done',
 'size': 0,
 'type': 'Folder',
 'file_extension': '',
 'last_modified': datetime.datetime(2019, 9, 20, 13, 44, 19),
 'hash': '',
 'file_format_count': '',
 'pronom_id': '',
 'mime_type': '',
 'file_format_name': '',
 'file_format_version': '',
 'project_name': 'skyrates'}

In [13]:
jukebox_dict_list = modify_ids(insert_dict_list(jukebox_droid_csv, 'jukebox'), max_start=max_in_dict_list(skyrates_dict_list))

In [14]:
jukebox_dict_list[0]

{'id': 27137,
 'parent_id': 27135,
 'uri': 'file:/Users/ekaltman/Desktop/2006_semester_3/Jukebox/',
 'file_path': '/Users/ekaltman/Desktop/2006_semester_3/Jukebox',
 'filename': 'Jukebox',
 'id_method': '',
 'status': 'Done',
 'size': 0,
 'type': 'Folder',
 'file_extension': '',
 'last_modified': datetime.datetime(2019, 9, 30, 12, 35, 27),
 'hash': '',
 'file_format_count': '',
 'pronom_id': '',
 'mime_type': '',
 'file_format_name': '',
 'file_format_version': '',
 'project_name': 'jukebox'}

In [15]:
cert_dict_list = modify_ids(insert_dict_list(cert_droid_csv, 'cert'), max_start=max_in_dict_list(jukebox_dict_list))

In [16]:
granny_dict_list = modify_ids(insert_dict_list(granny_droid_csv, 'granny'), max_start=max_in_dict_list(cert_dict_list))

In [17]:
droid_ids_ins_list.extend(skyrates_dict_list)
droid_ids_ins_list.extend(jukebox_dict_list)
droid_ids_ins_list.extend(cert_dict_list)     
droid_ids_ins_list.extend(granny_dict_list)

In [18]:
len(droid_ids_ins_list)

142658

In [19]:
conn = engine.connect()

In [20]:
conn.execute(ins, droid_ids_ins_list[:30000])

2021-06-23 09:43:48,884 INFO sqlalchemy.engine.Engine [generated in 0.34679s] ((2, 0, 'file:/Users/ekaltman/Desktop/2006_semester_3/Skyrates/', '/Users/ekaltman/Desktop/2006_semester_3/Skyrates', 'Skyrates', '', 'Done', 0, 'Folder', '', '2019-09-20 13:44:19.000000', 'false', '', '', '', '', '', '', 'skyrates'), (4, 2, 'file:/Users/ekaltman/Desktop/2006_semester_3/Skyrates/ChrisArt/', '/Users/ekaltman/Desktop/2006_semester_3/Skyrates/ChrisArt', 'ChrisArt', '', 'Done', 0, 'Folder', '', '2013-01-16 12:15:56.000000', 'false', '', '', '', '', '', '', 'skyrates'), (6, 4, 'file:/Users/ekaltman/Desktop/2006_semester_3/Skyrates/ChrisArt/Thumbs.db', '/Users/ekaltman/Desktop/2006_semester_3/Skyrates/ChrisArt/Thumbs.db', 'Thumbs.db', 'Signature', 'Done', 70656, 'File', 'db', '2013-01-16 12:16:04.000000', 'false', '60e3ebf7b2b24576d2f6adeafd60b284', '1', 'fmt/111', '', 'OLE2 Compound Document Format', '', 'skyrates'), (75, 4, 'file:/Users/ekaltman/Desktop/2006_semester_3/Skyrates/ChrisArt/bear.png'

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x28f9e985520>

In [21]:
conn.execute(ins, droid_ids_ins_list[30000:60000])

2021-06-23 09:43:50,326 INFO sqlalchemy.engine.Engine [cached since 1.79s ago] ((152475, 152355, 'file:/D:/2006_semester_3/Granny/Granny/Animation/Shots/01/Frames/Shot_01.206.png', 'D:\\2006_semester_3\\Granny\\Granny\\Animation\\Shots\\01\\Frames\\Shot_01.206.png', 'Shot_01.206.png', 'Signature', 'Done', 218583, 'File', 'png', '2006-12-05 20:38:01.000000', 'false', '', '1', 'fmt/11', 'image/png', 'Portable Network Graphics', '1.0', 'granny'), (152476, 152355, 'file:/D:/2006_semester_3/Granny/Granny/Animation/Shots/01/Frames/Shot_01.207.png', 'D:\\2006_semester_3\\Granny\\Granny\\Animation\\Shots\\01\\Frames\\Shot_01.207.png', 'Shot_01.207.png', 'Signature', 'Done', 218583, 'File', 'png', '2006-12-05 20:38:02.000000', 'false', '', '1', 'fmt/11', 'image/png', 'Portable Network Graphics', '1.0', 'granny'), (152477, 152355, 'file:/D:/2006_semester_3/Granny/Granny/Animation/Shots/01/Frames/Shot_01.208.png', 'D:\\2006_semester_3\\Granny\\Granny\\Animation\\Shots\\01\\Frames\\Shot_01.208.png

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x28fae4b8ee0>

In [22]:
conn.execute(ins, droid_ids_ins_list[60000:90000])

2021-06-23 09:43:51,307 INFO sqlalchemy.engine.Engine [cached since 2.77s ago] ((182475, 182397, 'file:/D:/2006_semester_3/Granny/Granny/From%20Locksmith-Muster/MayaProject/particles/Shot_76-sim/heroBodyCementParticlesShape.25750.pdc', 'D:\\2006_semester_3\\Granny\\Granny\\From Locksmith-Muster\\MayaProject\\particles\\Shot_76-sim\\heroBodyCementParticlesShape.25750.pdc', 'heroBodyCementParticlesShape.25750.pdc', '', 'Done', 3105134, 'File', 'pdc', '2006-11-02 18:31:32.000000', 'false', '', '0', '', '', '', '', 'granny'), (182476, 182397, 'file:/D:/2006_semester_3/Granny/Granny/From%20Locksmith-Muster/MayaProject/particles/Shot_76-sim/heroBodyCementParticlesShape.26000.pdc', 'D:\\2006_semester_3\\Granny\\Granny\\From Locksmith-Muster\\MayaProject\\particles\\Shot_76-sim\\heroBodyCementParticlesShape.26000.pdc', 'heroBodyCementParticlesShape.26000.pdc', '', 'Done', 3105134, 'File', 'pdc', '2006-11-02 18:31:39.000000', 'false', '', '0', '', '', '', '', 'granny'), (182477, 182397, 'file:/

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x28fa116fa90>

In [23]:
conn.execute(ins, droid_ids_ins_list[90000:120000])

2021-06-23 09:43:52,389 INFO sqlalchemy.engine.Engine [cached since 3.852s ago] ((212476, 212412, 'file:/D:/2006_semester_3/Granny/Granny/MayaProject/particles/Shot_71-sim/heroBodyCementDrippyShape.2750.pdc', 'D:\\2006_semester_3\\Granny\\Granny\\MayaProject\\particles\\Shot_71-sim\\heroBodyCementDrippyShape.2750.pdc', 'heroBodyCementDrippyShape.2750.pdc', '', 'Done', 439780, 'File', 'pdc', '2006-12-11 13:29:59.000000', 'false', '', '0', '', '', '', '', 'granny'), (212477, 212412, 'file:/D:/2006_semester_3/Granny/Granny/MayaProject/particles/Shot_71-sim/heroBodyCementDrippyShape.3000.pdc', 'D:\\2006_semester_3\\Granny\\Granny\\MayaProject\\particles\\Shot_71-sim\\heroBodyCementDrippyShape.3000.pdc', 'heroBodyCementDrippyShape.3000.pdc', '', 'Done', 441700, 'File', 'pdc', '2006-12-11 13:30:00.000000', 'false', '', '0', '', '', '', '', 'granny'), (212478, 212412, 'file:/D:/2006_semester_3/Granny/Granny/MayaProject/particles/Shot_71-sim/heroBodyCementDrippyShape.3250.pdc', 'D:\\2006_semes

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x28f9e36ea30>

In [24]:
conn.execute(ins, droid_ids_ins_list[120000:])

2021-06-23 09:43:53,331 INFO sqlalchemy.engine.Engine [cached since 4.794s ago] ((242475, 242384, 'file:/D:/2006_semester_3/Granny/Granny/Output/Play%20Blasts/HeroCementDrying/HeroCementDrying.0089.iff', 'D:\\2006_semester_3\\Granny\\Granny\\Output\\Play Blasts\\HeroCementDrying\\HeroCementDrying.0089.iff', 'HeroCementDrying.0089.iff', 'Signature', 'Done', 1230216, 'File', 'iff', '2006-11-03 12:37:45.000000', 'false', '', '1', 'fmt/1169', '', 'Maya IFF Image File', '', 'granny'), (242476, 242384, 'file:/D:/2006_semester_3/Granny/Granny/Output/Play%20Blasts/HeroCementDrying/HeroCementDrying.0090.iff', 'D:\\2006_semester_3\\Granny\\Granny\\Output\\Play Blasts\\HeroCementDrying\\HeroCementDrying.0090.iff', 'HeroCementDrying.0090.iff', 'Signature', 'Done', 1230216, 'File', 'iff', '2006-11-03 12:37:47.000000', 'false', '', '1', 'fmt/1169', '', 'Maya IFF Image File', '', 'granny'), (242477, 242384, 'file:/D:/2006_semester_3/Granny/Granny/Output/Play%20Blasts/HeroCementDrying/HeroCementDrying

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x28f9e911760>

## Gather Common Statistics on the Project Data

In [25]:
from sqlalchemy.sql import select, text

In [26]:
s = text('select count(*) from droid_ids')

In [27]:
conn.execute(s).fetchall()

2021-06-23 09:43:55,555 INFO sqlalchemy.engine.Engine select count(*) from droid_ids
2021-06-23 09:43:55,556 INFO sqlalchemy.engine.Engine [generated in 0.00066s] ()


[(142658,)]

In [33]:
s = text('select count(distinct file_format_name) from droid_ids')

In [34]:
conn.execute(s).fetchall()

2021-06-23 09:46:19,302 INFO sqlalchemy.engine.Engine select count(distinct file_format_name) from droid_ids
2021-06-23 09:46:19,302 INFO sqlalchemy.engine.Engine [cached since 142.3s ago] ()


[(134,)]

In [36]:
s = text('select count(*) from droid_ids where file_extension = "png" and hash = ""')
conn.execute(s).fetchall()

In [39]:
s = text('select * from droid_ids where file_extension = "png" and hash = "" limit 20')
conn.execute(s).fetchall()

2021-06-23 09:51:07,600 INFO sqlalchemy.engine.Engine select * from droid_ids where file_extension = "png" and hash = "" limit 20
2021-06-23 09:51:07,600 INFO sqlalchemy.engine.Engine [cached since 71.35s ago] ()


[(152023, 151988, 'file:/D:/2006_semester_3/Granny/Granny/Administrative/Presentation%20Material/Media/Images/buildings.png', 'D:\\2006_semester_3\\Granny\\Granny\\Administrative\\Presentation Material\\Media\\Images\\buildings.png', 'buildings.png', 'Signature', 'Done', 1107695, 'File', 'png', '2006-08-30 17:37:10.000000', 'false', '', 1, 'fmt/11', 'image/png', 'Portable Network Graphics', '1.0', 'granny'),
 (152024, 151988, 'file:/D:/2006_semester_3/Granny/Granny/Administrative/Presentation%20Material/Media/Images/chair.png', 'D:\\2006_semester_3\\Granny\\Granny\\Administrative\\Presentation Material\\Media\\Images\\chair.png', 'chair.png', 'Signature', 'Done', 1742437, 'File', 'png', '2006-09-21 14:09:33.000000', 'false', '', 1, 'fmt/11', 'image/png', 'Portable Network Graphics', '1.0', 'granny'),
 (152026, 151988, 'file:/D:/2006_semester_3/Granny/Granny/Administrative/Presentation%20Material/Media/Images/grass.png', 'D:\\2006_semester_3\\Granny\\Granny\\Administrative\\Presentation

select file_format_name, count(*) from droid_ids group by file_format_name order by count(*) desc limit 20;

select file_format_name, file_format_version, file_extension, sum(size) from droid_ids where project_name in ('granny','skyrates','cert','jukebox') group by file_format_name, file_format_version order by sum(size) desc limit 20;


