# Project EDA Starter Code

In [None]:
# Run this before any other code cell
# This downloads the data files into the same directory where you have saved this notebook

import urllib.request
from pathlib import Path
import os
# Directory the data files are saved into: the current working directory
path = Path()

# Dictionary of file names and download links
files = {'outage_data.parquet':'https://storage.googleapis.com/aipi_datasets/outage_data.parquet'}

# Download each file that is not already present in the directory
for name, url in files.items():
    filename = path / name
    if filename.exists():
        continue
    # Download to a temporary sibling file and rename it only once the
    # transfer has finished, so an interrupted download never leaves a
    # truncated file that later runs would mistake for a complete one
    partial = filename.with_name(filename.name + '.part')
    try:
        urllib.request.urlretrieve(url, partial)
        partial.replace(filename)
    finally:
        # No-op after a successful rename; removes the leftover after a failure
        if partial.exists():
            partial.unlink()

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the outage dataset from the parquet file into a Pandas dataframe
df = pd.read_parquet('./outage_data.parquet', engine='pyarrow')

# Remove duplicate entries in 2019 by dropping every row whose SimStartDate
# is after 2019-01-01 and whose event_type is 'thunderstorm'
starts_after_2019 = df['SimStartDate'] > '2019-01-01'
is_thunderstorm = df['event_type'] == 'thunderstorm'
df = df.loc[~(starts_after_2019 & is_thunderstorm)]

# Display summary statistics for the filtered dataframe
df.describe()

In [None]:
# Write the dataframe's column (feature) names to features.csv
feature_names = df.columns.to_frame()
feature_names.to_csv('features.csv', index=False)

In [None]:
# Count the number of rows recorded for each (lat, lon) grid cell and list
# the cells from the most rows to the fewest
# Note kept from the original analysis: 488 grid cells and 154 outage events
# (the outage-event figure is not computed by this cell)
rows_per_cell = df.loc[:, ['lat', 'lon']].groupby(['lat', 'lon']).size()
cell_counts = rows_per_cell.reset_index(name='count')
cell_counts.sort_values(by='count', ascending=False)

In [None]:
# Total the outage counts for each event (SimStartDate, event_type) and rank
# the events from the most outages to the fewest
event_columns = df.loc[:, ['SimStartDate', 'event_type', 'outage_count']]
outages_per_event = event_columns.groupby(['SimStartDate', 'event_type'])['outage_count'].sum()
ranked_events = outages_per_event.sort_values(ascending=False)
# Turn the totals back into a dataframe indexed by SimStartDate
grouped = ranked_events.reset_index(name='outage_count').set_index('SimStartDate')
grouped

In [None]:
# Total the outage counts for each event (SimStartDate, event_type), ordered
# by the (SimStartDate, event_type) index
event_columns = df.loc[:, ['SimStartDate', 'event_type', 'outage_count']]
outages_per_event = event_columns.groupby(['SimStartDate', 'event_type'])['outage_count'].sum()
grouped = outages_per_event.sort_index().reset_index(name='outage_count')
# Save the per-event totals to outage_count.csv
grouped.to_csv('outage_count.csv')