Generate Provenance JSON and Diagram

In [1]:
import prov.model as prov
from prov.dot import prov_to_dot
from datetime import datetime
from pathlib import Path

Initialize

In [2]:
# Single provenance document that every later cell adds records to.
doc = prov.ProvDocument()

# (prefix, base URI) pairs, registered in the order listed. These are the
# prefixes that appear in the qualified identifiers ('cfb:...', 'file:...', ...)
# passed to doc.agent / doc.entity / doc.activity elsewhere in the notebook.
namespace_specs = [
    ('cfb', 'https://github.com/bryanthou/cs598-project#'),
    ('kaggle', 'https://www.kaggle.com/datasets/'),
    ('data.gov', 'https://catalog.data.gov/dataset/'),
    ('cfb-api', 'https://collegefootballdata.com/'),
    ('open-meteo', 'https://open-meteo.com/'),
    ('file', 'file://'),
]
registered_namespaces = [doc.add_namespace(prefix, uri) for prefix, uri in namespace_specs]

# Trailing expression: the cell echoes the last namespace registered ('file').
registered_namespaces[-1]

<Namespace: file {file://}>

Source Data

In [3]:
def _typed(prov_type, label):
    """Return the attribute dict shared by the source records in this cell.

    prov_type: name looked up in the PROV namespace (e.g. 'Person', 'Dataset').
    label: human-readable text stored under 'prov:label'.
    """
    return {prov.PROV_TYPE: prov.PROV[prov_type], 'prov:label': label}


# --- Agents: the publisher each upstream source is attributed to ---
kaggle_user = doc.agent(
    'kaggle:cviaxmiwnptr',
    _typed('Person', 'Kaggle User cviaxmiwnptr'),
)
us_dept_ed = doc.agent(
    'cfb:us_dept_ed',
    _typed('Organization', 'U.S. Department of Education'),
)
cfb_data_org = doc.agent(
    'cfb-api:organization',
    _typed('Organization', 'CollegeFootballData.com'),
)
open_meteo_org = doc.agent(
    'open-meteo:organization',
    _typed('Organization', 'Open-Meteo'),
)

# --- Entities: the upstream datasets / services themselves ---
cfb_kaggle_data = doc.entity(
    'kaggle:college-football-team-stats',
    _typed('Dataset', 'College Football Team Stats 2002-2024 (Kaggle)'),
)
school_locations_data = doc.entity(
    'data.gov:postsecondary-school-locations',
    _typed('Dataset', 'Postsecondary School Locations 2020-21'),
)
cfb_api_data = doc.entity(
    'cfb-api:venues',
    _typed('Dataset', 'College Football Data API - Venues'),
)
# The weather source is typed as a Service rather than a Dataset.
weather_api_data = doc.entity(
    'open-meteo:weather-api',
    _typed('Service', 'Open-Meteo Weather API'),
)

# --- Attribution: link every source entity to its publishing agent ---
source_attributions = [
    doc.wasAttributedTo(source, publisher)
    for source, publisher in [
        (cfb_kaggle_data, kaggle_user),
        (school_locations_data, us_dept_ed),
        (cfb_api_data, cfb_data_org),
        (weather_api_data, open_meteo_org),
    ]
]

# Trailing expression: the cell echoes the last attribution record created.
source_attributions[-1]

<ProvAttribution: (open-meteo:weather-api, open-meteo:organization)>

Activities

In [4]:
def _register_step(identifier, label):
    """Add one pipeline step to `doc` as a prov Activity and return the record.

    identifier: qualified name of the activity (e.g. 'cfb:step1_fetch_data').
    label: human-readable text stored under 'prov:label'.

    NOTE(review): the timestamp passed to doc.activity is the wall-clock time
    at which this cell executes, not the time the pipeline step actually ran -
    confirm that this is the intended meaning.
    """
    return doc.activity(
        identifier,
        datetime.now().isoformat(),  # evaluated anew for every activity
        None,                        # no second timestamp is supplied
        {prov.PROV_TYPE: prov.PROV['Activity'], 'prov:label': label},
    )


# One activity per pipeline step, created in step order.
fetch_data_activity = _register_step(
    'cfb:step1_fetch_data', 'Step 1: Fetch Data')
validate_data_activity = _register_step(
    'cfb:step2_validate_data', 'Step 2: Validate Data')
preprocess_matching_activity = _register_step(
    'cfb:step3_preprocess_matching', 'Step 3: Preprocess School Matching')
fetch_venue_locations_activity = _register_step(
    'cfb:step4_fetch_venues', 'Step 4: Fetch Venue Locations')
combine_locations_activity = _register_step(
    'cfb:step5_combine_locations', 'Step 5: Combine CFB and Locations')
enrich_weather_activity = _register_step(
    'cfb:step6_enrich_weather', 'Step 6: Enrich Weather Data')
enrich_school_weather_activity = _register_step(
    'cfb:step7_enrich_school_weather', 'Step 7: Enrich School Weather Data')
generate_codebooks_activity = _register_step(
    'cfb:step8_generate_codebooks', 'Step 8: Generate Codebooks')
analyze_data_activity = _register_step(
    'cfb:step9_analyze_data', 'Step 9: Analyze Data')

# All nine activities, in step order.
activities = [
    fetch_data_activity,
    validate_data_activity,
    preprocess_matching_activity,
    fetch_venue_locations_activity,
    combine_locations_activity,
    enrich_weather_activity,
    enrich_school_weather_activity,
    generate_codebooks_activity,
    analyze_data_activity,
]

Intermediate Files

In [5]:
def _dataset_entity(identifier, label):
    """Add an entity typed PROV['Dataset'] to `doc` and return the record.

    identifier: qualified name of the entity (the 'file:' prefix is used for
        every dataset in this cell).
    label: human-readable text stored under 'prov:label'.
    """
    return doc.entity(
        identifier,
        {prov.PROV_TYPE: prov.PROV['Dataset'], 'prov:label': label},
    )


# Raw downloads
cfb_raw_data = _dataset_entity(
    'file:cfb_box-scores_2002-2024.csv', 'Raw CFB Box Scores Data')
school_locations_raw = _dataset_entity(
    'file:postsecondary_school_locations_2020-21.csv', 'Raw School Locations Data')

# Validation and school-matching outputs
cfb_validated = _dataset_entity(
    'file:cfb_validation_results.csv', 'Validated CFB Data')
llm_matched_schools = _dataset_entity(
    'file:llm_matched_schools.csv', 'LLM Matched Schools')

# Location and weather outputs
venue_locations = _dataset_entity(
    'file:collegefootballdata_venues.csv', 'Venue Locations Data')
cfb_with_locations = _dataset_entity(
    'file:cfb_with_locations.csv', 'CFB Data with Locations')
cfb_with_weather = _dataset_entity(
    'file:cfb_with_weather.csv', 'CFB Data with Weather')
schools_with_weather = _dataset_entity(
    'file:schools_with_weather.csv', 'Schools with Weather Data')

# The codebooks are the one non-Dataset entity here: a Collection identified
# in the 'cfb' namespace instead of by a 'file:' name.
codebooks = doc.entity(
    'cfb:codebooks',
    {prov.PROV_TYPE: prov.PROV['Collection'], 'prov:label': 'Generated Codebooks'},
)

# Final analysis output
analysis_results = _dataset_entity(
    'file:weather_analysis_results.txt', 'Weather Analysis Results')

Relationships

In [6]:
# The three relation tables below are applied in the order listed, so the
# records are added to `doc` in the same sequence as one call per line would.

# wasGeneratedBy: (output entity, activity that produced it)
generation_pairs = [
    (cfb_raw_data, fetch_data_activity),
    (school_locations_raw, fetch_data_activity),
    (cfb_validated, validate_data_activity),
    (llm_matched_schools, preprocess_matching_activity),
    (venue_locations, fetch_venue_locations_activity),
    (cfb_with_locations, combine_locations_activity),
    (cfb_with_weather, enrich_weather_activity),
    (schools_with_weather, enrich_school_weather_activity),
    (codebooks, generate_codebooks_activity),
    (analysis_results, analyze_data_activity),
]
for produced, step in generation_pairs:
    doc.wasGeneratedBy(produced, step)

# used: (activity, input entity it consumed)
usage_pairs = [
    (fetch_data_activity, cfb_kaggle_data),
    (fetch_data_activity, school_locations_data),
    (validate_data_activity, cfb_raw_data),
    (preprocess_matching_activity, cfb_raw_data),
    (preprocess_matching_activity, school_locations_raw),
    (fetch_venue_locations_activity, cfb_api_data),
    (combine_locations_activity, cfb_raw_data),
    (combine_locations_activity, llm_matched_schools),
    (combine_locations_activity, venue_locations),
    (enrich_weather_activity, cfb_with_locations),
    (enrich_weather_activity, weather_api_data),
    (enrich_school_weather_activity, llm_matched_schools),
    (enrich_school_weather_activity, weather_api_data),
    (generate_codebooks_activity, cfb_with_weather),
    (generate_codebooks_activity, schools_with_weather),
    (analyze_data_activity, cfb_with_weather),
]
for step, consumed in usage_pairs:
    doc.used(step, consumed)

# wasDerivedFrom: (derived entity, entity it was derived from)
# NOTE(review): no derivation is listed for codebooks, analysis_results,
# venue_locations or the two raw files, although each has generation/usage
# records above - confirm whether that omission is intentional.
derivation_pairs = [
    (cfb_validated, cfb_raw_data),
    (llm_matched_schools, cfb_raw_data),
    (llm_matched_schools, school_locations_raw),
    (cfb_with_locations, cfb_raw_data),
    (cfb_with_locations, llm_matched_schools),
    (cfb_with_locations, venue_locations),
    (cfb_with_weather, cfb_with_locations),
    (schools_with_weather, llm_matched_schools),
]
derivations = [doc.wasDerivedFrom(derived, origin) for derived, origin in derivation_pairs]

# Trailing expression: the cell echoes the last derivation record created.
derivations[-1]

<ProvDerivation: (file:schools_with_weather.csv, file:llm_matched_schools.csv)>

In [7]:
# Artifacts are written to an 'output' folder located two levels above the
# current working directory; the folder is created if it is missing.
output_dir = Path.cwd().parent.parent / 'output'
output_dir.mkdir(exist_ok=True)

# Target paths for the two artifacts.
json_file = output_dir / 'provenance.json'
png_file = output_dir / 'provenance_diagram.png'

# 1) The provenance document serialized as JSON text.
with json_file.open('w') as json_out:
    json_out.write(doc.serialize(format='json'))
print(f"Provenance JSON saved to: {json_file}")

# 2) The same document rendered as a PNG graph via its dot representation.
dot = prov_to_dot(doc)
dot.write_png(str(png_file))
print(f"Provenance diagram saved to: {png_file}")

Provenance JSON saved to: /Users/bryanthou/Desktop/CS598/cs598-project/cs598-project/output/provenance.json


Provenance diagram saved to: /Users/bryanthou/Desktop/CS598/cs598-project/cs598-project/output/provenance_diagram.png
