In [1]:
import sqlite3
import pandas as pd
import json

In [2]:
# Read the nested JSON export once; the cells below work from `data`.
with open('nested_data.json', mode='r', encoding='utf-8') as json_file:
    data = json.loads(json_file.read())

In [3]:
# Flatten the nested program structure into one list of row dicts per table.
programs, concerts, works, soloists = [], [], [], []

for program in data['programs']:
    program_id = program['programID']

    programs.append({
        'program_id': program_id,
        'season': program['season'],
        'orchestra': program['orchestra'],
    })

    # One row per concert, tagged with the program it belongs to.
    concerts.extend(
        {
            'program_id': program_id,
            'date': concert['Date'],
            'event_type': concert['eventType'],
            'venue': concert['Venue'],
            'location': concert['Location'],
            'time': concert['Time'],
        }
        for concert in program['concerts']
    )

    for work in program['works']:
        work_id = work['ID']

        # Descriptive fields are optional on a work; absent keys become ''.
        works.append({
            'work_id': work_id,
            'program_id': program_id,
            'work_title': work.get('workTitle', ''),
            'composer_name': work.get('composerName', ''),
            'conductor_name': work.get('conductorName', ''),
            'movement': work.get('movement', ''),
            'interval': work.get('interval', ''),
        })

        # A work without a 'soloists' key contributes no soloist rows.
        soloists.extend(
            {
                'work_id': work_id,
                'soloist_name': soloist['soloistName'],
                'soloist_roles': soloist['soloistRoles'],
                'soloist_instrument': soloist['soloistInstrument'],
            }
            for soloist in work.get('soloists', [])
        )

In [4]:
# Turn each record list into a DataFrame. Programs and works keep only the
# first row seen per id; concerts and soloists keep every row.
# NOTE(review): works are de-duplicated on work_id alone, so if one work ID can
# appear under several programs only the first program's row survives — confirm
# this is intended.
programs_df = pd.DataFrame(programs)
programs_df = programs_df.drop_duplicates(subset=['program_id'])

concerts_df = pd.DataFrame(concerts)

works_df = pd.DataFrame(works)
works_df = works_df.drop_duplicates(subset=['work_id'])

soloists_df = pd.DataFrame(soloists)

In [5]:
# Convert every `works` value to text, but keep missing values as None so they
# are stored as SQL NULL. A blanket astype(str) would turn None/NaN into the
# literal strings 'None'/'nan', hiding them from later null checks.
def _as_sql_text(value):
    """Return `value` as a string, passing None/NaN through as None."""
    if value is None or (isinstance(value, float) and value != value):
        return None
    return value if isinstance(value, str) else str(value)


for col in works_df.columns:
    works_df[col] = works_df[col].map(_as_sql_text)

In [6]:
# Open the SQLite database file (created if it does not exist yet); the
# cursor used by the following cells is taken from this connection.
conn = sqlite3.connect('orchestra.db')

In [7]:
# Create the tables
# `cur` is the shared cursor that the following cells reuse.
cur = conn.cursor()

# Parent table: one row per program, keyed by program_id.
programs_ddl = """
CREATE TABLE IF NOT EXISTS programs (
    program_id TEXT PRIMARY KEY,
    season TEXT,
    orchestra TEXT
)
"""
cur.execute(programs_ddl)

<sqlite3.Cursor at 0x1c6742a1140>

In [8]:
# Concerts get a generated integer key and point back at their program.
concerts_ddl = """
CREATE TABLE IF NOT EXISTS concerts (
    concert_id INTEGER PRIMARY KEY AUTOINCREMENT,
    program_id TEXT,
    date TIMESTAMP,
    event_type TEXT,
    venue TEXT,
    location TEXT,
    time TEXT,
    FOREIGN KEY (program_id) REFERENCES programs(program_id)
)
"""
cur.execute(concerts_ddl)

<sqlite3.Cursor at 0x1c6742a1140>

In [9]:

# Works are keyed by work_id and reference the program they were read from.
works_ddl = """
CREATE TABLE IF NOT EXISTS works (
    work_id TEXT PRIMARY KEY,
    program_id TEXT,
    work_title TEXT,
    composer_name TEXT,
    conductor_name TEXT,
    movement TEXT,
    interval TEXT,
    FOREIGN KEY (program_id) REFERENCES programs(program_id)
)
"""
cur.execute(works_ddl)

<sqlite3.Cursor at 0x1c6742a1140>

In [10]:
# Soloists get a generated integer key and reference a work by work_id.
soloists_ddl = """
CREATE TABLE IF NOT EXISTS soloists (
    soloist_id INTEGER PRIMARY KEY AUTOINCREMENT,
    work_id TEXT,
    soloist_name TEXT,
    soloist_roles TEXT,
    soloist_instrument TEXT,
    FOREIGN KEY (work_id) REFERENCES works(work_id)
)
"""
cur.execute(soloists_ddl)

<sqlite3.Cursor at 0x1c6742a1140>

In [11]:
def insert_data(df, table_name, db_path='orchestra.db'):
    """Bulk-insert the rows of ``df`` into ``table_name``.

    Rows are written with ``INSERT OR IGNORE``: a row that conflicts with a
    PRIMARY KEY or UNIQUE constraint is skipped instead of raising. A table
    whose only key is an auto-generated id never conflicts, so inserting the
    same frame twice stores its rows twice.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to insert. Its column names are used as the target column list.
    table_name : str
        Target table. Interpolated into the SQL text (as are the column
        names), so it must come from trusted code, never from user input.
    db_path : str, optional
        Path of the SQLite database file. Defaults to ``'orchestra.db'``.

    Returns
    -------
    None
    """
    # Nothing to insert; also avoids building "() VALUES ()" for a frame
    # without columns, which is not valid SQL.
    if df.empty:
        return

    columns = ', '.join(df.columns)
    placeholders = ', '.join('?' for _ in df.columns)
    sql = f"INSERT OR IGNORE INTO {table_name} ({columns}) VALUES ({placeholders})"

    # name=None makes itertuples yield plain tuples, ready for executemany.
    rows = list(df.itertuples(index=False, name=None))

    conn = sqlite3.connect(db_path)
    try:
        # The connection context manager commits on success and rolls back on
        # error, but does not close the connection.
        with conn:
            conn.executemany(sql, rows)
    finally:
        conn.close()

In [12]:
# Insert programs first, then the tables that reference them.
for frame, target in (
    (programs_df, 'programs'),
    (concerts_df, 'concerts'),
    (works_df, 'works'),
    (soloists_df, 'soloists'),
):
    insert_data(frame, target)

In [13]:
# Spot-check the load: print the first five rows of every table.
for table in ('programs', 'concerts', 'works', 'soloists'):
    print(f"\nData from table {table}:")
    for row in cur.execute(f"SELECT * FROM {table} LIMIT 5").fetchall():
        print(row)


Data from table programs:
('3853', '1842-43', 'New York Philharmonic')
('5178', '1842-43', 'New York Philharmonic')
('10785', '1842-43', 'Musicians from the New York Philharmonic')
('5887', '1842-43', 'New York Philharmonic')
('305', '1843-44', 'New York Philharmonic')

Data from table concerts:
(1, '3853', '1842-12-07T05:00:00Z', 'Subscription Season', 'Apollo Rooms', 'Manhattan, NY', '8:00PM')
(2, '5178', '1843-02-18T05:00:00Z', 'Subscription Season', 'Apollo Rooms', 'Manhattan, NY', '8:00PM')
(3, '10785', '1843-04-07T05:00:00Z', 'Special', 'Apollo Rooms', 'Manhattan, NY', '8:00PM')
(4, '5887', '1843-04-22T05:00:00Z', 'Subscription Season', 'Apollo Rooms', 'Manhattan, NY', '8:00PM')
(5, '305', '1843-11-18T05:00:00Z', 'Subscription Season', 'Apollo Rooms', 'Manhattan, NY', 'None')

Data from table works:
('52446*', '3853', 'SYMPHONY NO. 5 IN C MINOR, OP.67', 'Beethoven,  Ludwig  van', 'Hill, Ureli Corelli', '', '')
('8834*4', '3853', 'OBERON', 'Weber,  Carl  Maria Von', 'Timm, Henry 

In [14]:
import sqlite3
import pandas as pd

In [15]:
# Read each table back from the database into its own DataFrame.
conn = sqlite3.connect('orchestra.db')
table_queries = (
    "SELECT * FROM programs",
    "SELECT * FROM concerts",
    "SELECT * FROM works",
    "SELECT * FROM soloists",
)
programs_df, concerts_df, works_df, soloists_df = (
    pd.read_sql_query(query, conn) for query in table_queries
)

In [16]:
# Print the per-column null count of each reloaded table.
null_reports = (
    ("Programs DataFrame missing values:", programs_df),
    ("\nConcerts DataFrame missing values:", concerts_df),
    ("\nWorks DataFrame missing values:", works_df),
    ("\nSoloists DataFrame missing values:", soloists_df),
)
for heading, frame in null_reports:
    print(heading)
    print(frame.isnull().sum())

Programs DataFrame missing values:
program_id    0
season        0
orchestra     0
dtype: int64

Concerts DataFrame missing values:
concert_id    0
program_id    0
date          0
event_type    0
venue         0
location      0
time          0
dtype: int64

Works DataFrame missing values:
work_id           0
program_id        0
work_title        0
composer_name     0
conductor_name    0
movement          0
interval          0
dtype: int64

Soloists DataFrame missing values:
soloist_id              0
work_id                 0
soloist_name           66
soloist_roles          80
soloist_instrument    105
dtype: int64
