# In [18]:
import csv
from datetime import datetime
import math  # For handling NaN checks

# Surrogate-key counters, one per generated table; every sequence starts at 1.
# (Chained assignment is safe here because ints are immutable.)
cause_id_counter = date_id_counter = weather_id_counter = 1
geography_id_counter = crash_id_counter = vehicle_id_counter = 1
fact_crash_id_counter = 1

# Lookup tables for the dimension tables, keyed by each dimension's natural
# key, plus the set of person ids already written. Each name gets its own
# fresh container.
cause_dict, date_dict, weather_dict = {}, {}, {}
geography_dict, crash_dict, vehicle_dict = {}, {}, {}
person_id_set = set()

# Per-crash lookups keyed by RD_NO.
rd_no_to_num_units, rd_no_to_participant_count = {}, {}
rd_no_to_date_id, rd_no_to_weather_id = {}, {}
rd_no_to_geography_id, rd_no_to_cause_id = {}, {}
rd_no_to_crash_id, rd_no_to_incident_severity = {}, {}
# Temporary storage for police_beat, latitude, longitude
rd_no_to_geography_details = {}

# Process Crashes dataset
def _visibility_condition(weather_condition, lighting_condition):
    """Return the DimWeather visibility label for one crash.

    Both arguments are compared exactly (case-sensitive); the caller strips
    surrounding whitespace first. Combinations that match no rule are
    labelled "UNKNOWN".
    """
    if weather_condition == "CLEAR":
        if lighting_condition == "DAYLIGHT":
            return "CLEAR"
        if lighting_condition in ("DUSK", "DAWN", "DARKNESS, LIGHTED ROAD", "DARKNESS"):
            return "LOW LIGHT"
        return "UNKNOWN"
    # NOTE(review): the combined "FOG/SMOKE/HAZE" label is accepted next to
    # the individual ones in case the cleaned data keeps it unsplit -- confirm
    # against the distinct WEATHER_CONDITION values in the CSV.
    if weather_condition in ("FOG", "SMOKE", "HAZE", "FOG/SMOKE/HAZE"):
        return "POOR (FOG, SMOKE)"
    if weather_condition == "RAIN":
        return "POOR (RAIN)"
    if weather_condition == "SNOW":
        return "POOR (SNOW)"
    return "UNKNOWN"  # For unclassified cases


# newline='' lets the csv module do its own newline handling, so quoted
# fields that contain line breaks are parsed correctly.
with open('C:/Users/hp/OneDrive/Desktop/LDS24Project/cleaned_Dataset/cleaned_Crashes.csv', 'r', newline='') as crashes_file:
    reader = csv.DictReader(crashes_file)
    for row in reader:
        rd_no = row['RD_NO']

        # Process Cause Dimension: one id per distinct cause description.
        cause_desc = row['PRIM_CONTRIBUTORY_CAUSE']
        if cause_desc not in cause_dict:
            cause_dict[cause_desc] = cause_id_counter
            cause_id_counter += 1
        rd_no_to_cause_id[rd_no] = cause_dict[cause_desc]

        # Process Date Dimension
        crash_date_obj = datetime.strptime(row['CRASH_DATE'], '%m/%d/%Y %I:%M:%S %p')
        crash_date_only_str = crash_date_obj.strftime('%Y-%m-%d')
        time_of_day = crash_date_obj.strftime('%H:%M:%S')

        # NOTE(review): date_dict is keyed by calendar date only, so the time
        # stored with an entry is that of the first crash seen on that date;
        # later crashes on the same date reuse the entry unchanged.
        if crash_date_only_str not in date_dict:
            date_dict[crash_date_only_str] = (date_id_counter, time_of_day)
            date_id_counter += 1
        rd_no_to_date_id[rd_no] = date_dict[crash_date_only_str][0]

        # Process Weather Dimension (values are stripped but keep their case)
        weather_condition = row['WEATHER_CONDITION'].strip()
        lighting_condition = row['LIGHTING_CONDITION'].strip()
        visibility_condition = _visibility_condition(weather_condition, lighting_condition)

        # Create a unique key to avoid duplicate entries in DimWeather
        weather_key = (weather_condition, visibility_condition)
        if weather_key not in weather_dict:
            weather_dict[weather_key] = weather_id_counter
            weather_id_counter += 1
        rd_no_to_weather_id[rd_no] = weather_dict[weather_key]

        # Process Geography Dimension (without CITY and STATE at this stage):
        # temporarily store police_beat, latitude, longitude for joining with
        # People data later.
        rd_no_to_geography_details[rd_no] = (
            row['BEAT_OF_OCCURRENCE'],
            row['LATITUDE'],
            row['LONGITUDE'],
        )

        # Process Crash Dimension. RD_NO is part of the key, so each distinct
        # crash record gets its own Crash_ID.
        crash_key = (rd_no, row['CRASH_TYPE'], row['TRAFFIC_CONTROL_DEVICE'], lighting_condition)
        if crash_key not in crash_dict:
            crash_dict[crash_key] = crash_id_counter
            crash_id_counter += 1
        rd_no_to_crash_id[rd_no] = crash_dict[crash_key]

        # Map RD_NO to other attributes (kept as the raw CSV strings).
        rd_no_to_num_units[rd_no] = row['NUM_UNITS']
        # NOTE(review): the participant count is sourced from INJURIES_TOTAL --
        # confirm that this is the intended measure.
        rd_no_to_participant_count[rd_no] = row['INJURIES_TOTAL']
        rd_no_to_incident_severity[rd_no] = row['MOST_SEVERE_INJURY']

# Process People dataset to fetch CITY and STATE for Geography Dimension.
# newline='' lets the csv module do its own newline handling, so quoted
# fields that contain line breaks are parsed correctly.
with open('C:/Users/hp/OneDrive/Desktop/LDS24Project/cleaned_Dataset/cleaned_People.csv', 'r', newline='') as people_file:
    reader = csv.DictReader(people_file)
    for row in reader:
        rd_no = row['RD_NO']
        city = row['CITY']
        state = row['STATE']

        # People rows whose RD_NO was not seen in the Crashes pass are skipped.
        if rd_no not in rd_no_to_geography_details:
            continue

        # Retrieve temporary geography data from Crashes processing
        police_beat, latitude, longitude = rd_no_to_geography_details[rd_no]
        geography_key = (police_beat, city, state, latitude, longitude)

        if geography_key not in geography_dict:
            geography_dict[geography_key] = geography_id_counter
            geography_id_counter += 1

        # NOTE(review): this mapping is per RD_NO, so when several people of
        # one crash have different CITY/STATE values the last row read wins.
        rd_no_to_geography_id[rd_no] = geography_dict[geography_key]

# Write DimGeography table: a header row, then one row per geography_dict
# entry in insertion order.
with open('DimGeography.csv', 'w', newline='') as geography_out:
    out = csv.writer(geography_out)
    out.writerow(['Geography_ID', 'Police_Beat', 'City', 'State', 'Latitude', 'Longitude'])
    # Keys are (police_beat, city, state, latitude, longitude); the id goes
    # first, followed by the key fields in that same order.
    out.writerows(
        [geo_id, beat, city_name, state_name, lat, lon]
        for (beat, city_name, state_name, lat, lon), geo_id in geography_dict.items()
    )

# Write DimCause table: a header row, then (id, description) for every
# entry of cause_dict in insertion order.
with open('DimCause.csv', 'w', newline='') as cause_out:
    cause_writer = csv.writer(cause_out)
    cause_writer.writerow(['Cause_ID', 'Cause_Description'])
    cause_writer.writerows(
        [identifier, description]
        for description, identifier in cause_dict.items()
    )

# Write DimDate table: a header row, then one row per date_dict entry with
# the calendar parts derived from the 'YYYY-MM-DD' key.
with open('DimDate.csv', 'w', newline='') as date_out:
    date_writer = csv.writer(date_out)
    date_writer.writerow(['Date_ID', 'Date', 'Year', 'Quarter', 'Month', 'Day', 'Time_of_Day'])
    for iso_date, entry in date_dict.items():
        identifier, time_text = entry
        parsed = datetime.strptime(iso_date, '%Y-%m-%d')
        date_writer.writerow([
            identifier,
            iso_date,
            parsed.year,
            (parsed.month + 2) // 3,  # months 1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4
            parsed.month,
            parsed.day,
            time_text,
        ])

# Write DimWeather table: a header row, then one row per
# (weather, visibility) pair held in weather_dict, in insertion order.
with open('DimWeather.csv', 'w', newline='') as weather_out:
    weather_writer = csv.writer(weather_out)
    weather_writer.writerow(['Weather_ID', 'Weather_Condition', 'Visibility_Condition'])
    weather_writer.writerows(
        [identifier, weather, visibility]
        for (weather, visibility), identifier in weather_dict.items()
    )

# Write DimCrash table: a header row, then one row per crash_dict entry in
# insertion order.
with open('DimCrash.csv', 'w', newline='') as crash_out:
    crash_writer = csv.writer(crash_out)
    crash_writer.writerow(['Crash_ID', 'Crash_Type', 'Traffic_Control_Device', 'Lighting_Condition'])
    # The RD_NO that leads each key is not written to the table.
    crash_writer.writerows(
        [identifier, kind, control_device, lighting]
        for (_rd_no, kind, control_device, lighting), identifier in crash_dict.items()
    )



# Write DimVehicle table while streaming the Vehicles dataset.
# Both files use newline='' so the csv module controls newline handling
# (required for correct parsing of quoted fields that contain line breaks).
with open('DimVehicle.csv', 'w', newline='') as f, \
     open('C:/Users/hp/OneDrive/Desktop/LDS24Project/cleaned_Dataset/cleaned_Vehicles.csv', 'r', newline='') as vehicles_file:
    writer = csv.writer(f)
    writer.writerow(['Vehicle_ID', 'Make', 'Model', 'Year', 'Defects', 'Vehicle_Type'])

    reader = csv.DictReader(vehicles_file)
    for row in reader:
        vehicle_id = row['VEHICLE_ID']
        # Only the first row seen for a source VEHICLE_ID is written; repeats
        # are skipped.
        # NOTE(review): a blank VEHICLE_ID is treated like any other key and
        # gets its own row -- confirm that this is intended.
        if vehicle_id in vehicle_dict:
            continue

        # vehicle_dict maps the source VEHICLE_ID to the surrogate id that is
        # written to the table.
        vehicle_dict[vehicle_id] = vehicle_id_counter
        writer.writerow([
            vehicle_id_counter,
            row['MAKE'], row['MODEL'], row['VEHICLE_YEAR'],
            row['VEHICLE_DEFECT'], row['VEHICLE_TYPE'],
        ])
        vehicle_id_counter += 1

# Process People dataset and write DimPerson and FactCrash tables
def _age_group(age):
    """Return the Age_Group label for a raw AGE field.

    Blank, non-numeric, NaN, infinite and negative values yield 'UNKNOWN';
    otherwise the value is truncated to an int and bucketed as
    0-17 CHILD, 18-35 YOUNG ADULT, 36-64 ADULT, 65+ SENIOR.
    """
    try:
        age_value = float(age)
    except (TypeError, ValueError):
        # Empty string, None or any other text that is not a number.
        return 'UNKNOWN'
    if not math.isfinite(age_value) or age_value < 0:
        return 'UNKNOWN'

    age_int = int(age_value)
    if age_int <= 17:
        return 'CHILD'
    if age_int <= 35:
        return 'YOUNG ADULT'
    if age_int <= 64:
        return 'ADULT'
    return 'SENIOR'


# All three files use newline='' so the csv module controls newline handling.
with open('DimPerson.csv', 'w', newline='') as dim_person_file, \
     open('FactCrash.csv', 'w', newline='') as fact_crash_file, \
     open('C:/Users/hp/OneDrive/Desktop/LDS24Project/cleaned_Dataset/cleaned_People.csv', 'r', newline='') as people_file:

    person_writer = csv.writer(dim_person_file)
    fact_writer = csv.writer(fact_crash_file)

    person_writer.writerow(['Person_ID', 'Age', 'Age_Group', 'Role', 'Injury_Classification'])
    fact_writer.writerow([
        'Crash_Detail_ID', 'Damage_Amount', 'num_units', 'Incident_Severity',
        'Participant_Count', 'Weather_ID', 'Vehicle_ID', 'Date_ID',
        'Person_ID', 'Geography_ID', 'Cause_ID', 'Crash_ID'
    ])

    reader = csv.DictReader(people_file)
    for row in reader:
        person_id = row['PERSON_ID']

        # Process Person Dimension: each PERSON_ID is written once; the raw
        # AGE text is kept as-is next to the derived group.
        if person_id not in person_id_set:
            age = row['AGE']
            person_writer.writerow([
                person_id, age, _age_group(age),
                row['PERSON_TYPE'], row['INJURY_CLASSIFICATION'],
            ])
            person_id_set.add(person_id)

        # Fact Table Entry: one row per People row, repeated PERSON_IDs included.
        crash_detail_id = fact_crash_id_counter
        fact_crash_id_counter += 1

        # Retrieve values for FactCrash columns using RD_NO. A crash that was
        # not seen in the Crashes pass gets '0' for the two counts and an
        # empty string for every other lookup.
        damage_amount = row['DAMAGE']
        rd_no = row['RD_NO']
        num_units = rd_no_to_num_units.get(rd_no, '0')
        participant_count = rd_no_to_participant_count.get(rd_no, '0')
        weather_id = rd_no_to_weather_id.get(rd_no, '')
        date_id = rd_no_to_date_id.get(rd_no, '')
        geography_id = rd_no_to_geography_id.get(rd_no, '')
        cause_id = rd_no_to_cause_id.get(rd_no, '')
        crash_id = rd_no_to_crash_id.get(rd_no, '')
        incident_severity = rd_no_to_incident_severity.get(rd_no, '')
        # Translate the source VEHICLE_ID to the DimVehicle surrogate id.
        vehicle_id = vehicle_dict.get(row['VEHICLE_ID'], '')

        fact_writer.writerow([
            crash_detail_id, damage_amount, num_units, incident_severity,
            participant_count, weather_id, vehicle_id, date_id,
            person_id, geography_id, cause_id, crash_id
        ])
