# Data preparation notebook

## Download csv files from data source.
The data is fetched from NCDC weather portal's `ftp` endpoint. Data will be downloaded as zipped csv files, extracted into a known folder.

In [2]:
import requests
import ftplib

Establish connection to ftp site

In [7]:
ftp = ftplib.FTP('ftp.ncdc.noaa.gov')
ftp.login()



Change directory to download dir

In [11]:
ftp.cwd('/pub/data/swdi/database-csv/v2/')
ftp.pwd()

'/pub/data/swdi/database-csv/v2'

Get the list of files names. Filter only those files that start with 'hail' in their file name

In [31]:
all_files= ftp.nlst()
hail_files = [i for i in all_files if i.startswith('hail')]
hail_files

['hail-2013.csv.gz',
 'hail-2015.csv.gz',
 'hail-201704.csv.gz',
 'hail-2014.csv.gz',
 'hail-201705.csv.gz',
 'hail-201701.csv.gz',
 'hail-1995.csv.gz',
 'hail-1996.csv.gz',
 'hail-1997.csv.gz',
 'hail-1998.csv.gz',
 'hail-1999.csv.gz',
 'hail-2000.csv.gz',
 'hail-2001.csv.gz',
 'hail-2002.csv.gz',
 'hail-2003.csv.gz',
 'hail-2004.csv.gz',
 'hail-2005.csv.gz',
 'hail-2008.csv.gz',
 'hail-2007.csv.gz',
 'hail-2006.csv.gz',
 'hail-2009.csv.gz',
 'hail-2010.csv.gz',
 'hail-2011.csv.gz',
 'hail-2012.csv.gz',
 'hail-201702.csv.gz',
 'hail-201703.csv.gz',
 'hail-2016.csv.gz']

Download the files to data dir `E:\gis_data\analytics\georgia_hail\raw`

In [32]:
for filename in hail_files:
    print("Downloading " + filename, end=" | ")
    with open(r'E:\GIS_Data\Analytics\Georgia_hailstones\raw\\' 
              + filename + ".csv.gz", "wb") as file_handle:
        ftp.retrbinary("RETR " + filename, file_handle.write)
    print(" finished")

Downloading hail-2013.csv.gz |  finished
Downloading hail-2015.csv.gz |  finished
Downloading hail-201704.csv.gz |  finished
Downloading hail-2014.csv.gz |  finished
Downloading hail-201705.csv.gz |  finished
Downloading hail-201701.csv.gz |  finished
Downloading hail-1995.csv.gz |  finished
Downloading hail-1996.csv.gz |  finished
Downloading hail-1997.csv.gz |  finished
Downloading hail-1998.csv.gz |  finished
Downloading hail-1999.csv.gz |  finished
Downloading hail-2000.csv.gz |  finished
Downloading hail-2001.csv.gz |  finished
Downloading hail-2002.csv.gz |  finished
Downloading hail-2003.csv.gz |  finished
Downloading hail-2004.csv.gz |  finished
Downloading hail-2005.csv.gz |  finished
Downloading hail-2008.csv.gz |  finished
Downloading hail-2007.csv.gz |  finished
Downloading hail-2006.csv.gz |  finished
Downloading hail-2009.csv.gz |  finished
Downloading hail-2010.csv.gz |  finished
Downloading hail-2011.csv.gz |  finished
Downloading hail-2012.csv.gz |  finished
Downloadin

## Extract the downloaded csv files

In [43]:
from glob import glob
import os
import pathlib
import gzip

In [34]:
file_list = glob(r"E:\GIS_Data\Analytics\Georgia_hailstones\raw\*.csv.gz")
file_list

['E:\\GIS_Data\\Analytics\\Georgia_hailstones\\raw\\hail-1995.csv.gz.csv.gz',
 'E:\\GIS_Data\\Analytics\\Georgia_hailstones\\raw\\hail-1996.csv.gz.csv.gz',
 'E:\\GIS_Data\\Analytics\\Georgia_hailstones\\raw\\hail-1997.csv.gz.csv.gz',
 'E:\\GIS_Data\\Analytics\\Georgia_hailstones\\raw\\hail-1998.csv.gz.csv.gz',
 'E:\\GIS_Data\\Analytics\\Georgia_hailstones\\raw\\hail-1999.csv.gz.csv.gz',
 'E:\\GIS_Data\\Analytics\\Georgia_hailstones\\raw\\hail-2000.csv.gz.csv.gz',
 'E:\\GIS_Data\\Analytics\\Georgia_hailstones\\raw\\hail-2001.csv.gz.csv.gz',
 'E:\\GIS_Data\\Analytics\\Georgia_hailstones\\raw\\hail-2002.csv.gz.csv.gz',
 'E:\\GIS_Data\\Analytics\\Georgia_hailstones\\raw\\hail-2003.csv.gz.csv.gz',
 'E:\\GIS_Data\\Analytics\\Georgia_hailstones\\raw\\hail-2004.csv.gz.csv.gz',
 'E:\\GIS_Data\\Analytics\\Georgia_hailstones\\raw\\hail-2005.csv.gz.csv.gz',
 'E:\\GIS_Data\\Analytics\\Georgia_hailstones\\raw\\hail-2006.csv.gz.csv.gz',
 'E:\\GIS_Data\\Analytics\\Georgia_hailstones\\raw\\hail-2007.cs

In [44]:
p = pathlib.Path(file_list[0])
p.name

'hail-1995.csv.gz.csv.gz'

In [47]:
p.stem

'hail-1995.csv.gz.csv'

In [50]:
for current_file in file_list:
    path_file = pathlib.Path(current_file)
    print("Extracting " + path_file.name, end=" | ")
    
    with gzip.open(current_file, 'rb') as archive:
        file_contents = archive.read()
        print("contents read ", end= " | ")

        #write to disk
        with open(r'E:\GIS_Data\Analytics\Georgia_hailstones\raw\extracted\\' + 
                  path_file.stem, 'wb') as output_file:
            output_file.write(file_contents)
       
    print(" finished")

Extracting hail-1995.csv.gz.csv.gz | contents read  |  finished
Extracting hail-1996.csv.gz.csv.gz | contents read  |  finished
Extracting hail-1997.csv.gz.csv.gz | contents read  |  finished
Extracting hail-1998.csv.gz.csv.gz | contents read  |  finished
Extracting hail-1999.csv.gz.csv.gz | contents read  |  finished
Extracting hail-2000.csv.gz.csv.gz | contents read  |  finished
Extracting hail-2001.csv.gz.csv.gz | contents read  |  finished
Extracting hail-2002.csv.gz.csv.gz | contents read  |  finished
Extracting hail-2003.csv.gz.csv.gz | contents read  |  finished
Extracting hail-2004.csv.gz.csv.gz | contents read  |  finished
Extracting hail-2005.csv.gz.csv.gz | contents read  |  finished
Extracting hail-2006.csv.gz.csv.gz | contents read  |  finished
Extracting hail-2007.csv.gz.csv.gz | contents read  |  finished
Extracting hail-2008.csv.gz.csv.gz | contents read  |  finished
Extracting hail-2009.csv.gz.csv.gz | contents read  |  finished
Extracting hail-2010.csv.gz.csv.gz | con