### Script to extract locally copied Aerosol data into csv for further analysis

### IMPORTANT : Run this only after FTPDownload_Aerosol_NOAA.ipynb

This script extract data from multiple gzip files, cleans and summarises data. Stores in csv files

In [68]:
#Import dependencies
import gzip
import os
import pandas as pd
import numpy as np
from calendar import monthrange

In [74]:
# Iniiialise dataframe and any declarations

getItems = ['Station GAW-ID:', 'Station state/province:','Measurement latitude:','Measurement longitude:']

monthly_avg_AER = pd.DataFrame(columns = ['StationID', 'Location','latitude','longitude',\
                                         'Year','Month', 'Avg.Conc'])


In [70]:
# path to the gzip files
path = "..\data\rawdata\_aerosol"

# get the list of stations to loop through
aer_stations = []
for roots,dirs,files in os.walk(path):
    if(dirs):
        aer_stations = dirs

aer_stations

['_bnd', '_brw', '_mlo', '_smo', '_thd']

In [75]:
#Loop through stations to extract data
for s in aer_stations:
    print(f"Extracting and Loading data for station - {s} ")
    
    for root, dirs, files in os.walk(os.path.join(path,s)):
        for name in files:
            if(name.rsplit('.',1)[1] == 'gz'):

                file_content = []
                #open gz file in rt mode - opens in read & text mode
                with gzip.open(os.path.join(root,name), 'rt') as f:
                    file_content = f.read().splitlines()

                if(file_content):
                    ### Collect Station Details
                    Station_Info = {'Station GAW-ID:':['None'], 'Station state/province:':['None'],\
                                    'Measurement latitude:':['None'],'Measurement longitude:':['None']}          

                    for itm in getItems:
                        Station_Info[itm] = [file_content[i].rsplit(" ",1)[1] for i, s in enumerate(file_content) if itm in s]

                    for key,val in Station_Info.items():
                        if(not Station_Info[key]):
                            Station_Info[key] = ['None']
                        
                                       
                    ### Collect Data
                    dataStrt = int(file_content[0].split(" ")[0])
                    yr =  int(file_content[6].split(" ")[0])
                    cols = [v for v in file_content[dataStrt-1].split(" ") if v != ''] 

                    station_data  = []

                    for i, row in enumerate(file_content[dataStrt:]):
                        station_data.append([v for v in row.split(" ") if v != ''])

                    
                    #temp dataframe to store the station data
                    df = pd.DataFrame(station_data, columns = cols)

                    ### CLean Data
                    #drop cols not required
                    df = df[['start_time','end_time','st_y','ed_y','conc']]

                    df = df.apply(pd.to_numeric)

                    df = df.replace(99999.9, value = np.nan)

                    #Get num of days per year
                    days_in_mnth = []

                    for i in range(1,13,1):
                        days_in_mnth.append(monthrange(yr, i)[1]*24)

                    days_in_mnth = np.cumsum(days_in_mnth)

                    ### Get Monthly avg
                    df_monthly_avg = pd.DataFrame(columns = ['StationID', 'Location','latitude','longitude',\
                                                 'Year','Month', 'Avg.Conc'])
                    strt = 0

                    for i, val in enumerate(days_in_mnth):
                        df_monthly_avg.loc[i] = [Station_Info['Station GAW-ID:'][0], Station_Info['Station state/province:'][0],\
                                                 Station_Info['Measurement latitude:'][0],Station_Info['Measurement longitude:'][0],\
                                                 yr, i+1, df[strt:val]['conc'].mean()]
                        strt = val+1

                    ## append with master DF
                    monthly_avg_AER = monthly_avg_AER.append(df_monthly_avg, sort=False)

print("Extraction done and Data Load to dataframe")
print(f"Total data loaded is {len(monthly_avg_AER)}")

Extracting and Loading data for station - _bnd 
Extracting and Loading data for station - _brw 
Extracting and Loading data for station - _mlo 
Extracting and Loading data for station - _smo 
Extracting and Loading data for station - _thd 
Extraction done and Data Load to dataframe
Total data loaded is 1944


In [77]:
monthly_avg_AER.head()

Unnamed: 0,StationID,Location,latitude,longitude,Year,Month,Avg.Conc
0,BND,Illinois,40.05,-88.36667,1994,1,7416.095385
1,BND,Illinois,40.05,-88.36667,1994,2,8250.276304
2,BND,Illinois,40.05,-88.36667,1994,3,7657.565079
3,BND,Illinois,40.05,-88.36667,1994,4,
4,BND,Illinois,40.05,-88.36667,1994,5,


In [79]:
# Collate only data upto 2017
monthly_avg_AER = monthly_avg_AER.loc[(monthly_avg_AER['Year'] >= 1975 and monthly_avg_AER['Year'] <= 2017),:]

print(f"Data starts from year {monthly_avg_AER['Year'].min()} and ends in {monthly_avg_AER['Year'].max()}")

Data starts from year 1974 and ends in 2017


In [80]:
# Reshape Data with Month along the column instead of rows
monthly_avg_AER = monthly_avg_AER.pivot_table(index = ['StationID', 'Location','latitude','longitude','Year'], \
                           columns = 'Month', values = 'Avg.Conc')

monthly_avg_AER.reset_index(inplace = True)

monthly_avg_AER.head()

Month,StationID,Location,latitude,longitude,Year,1,2,3,4,5,6,7,8,9,10,11,12
0,BND,Illinois,40.05,-88.36667,1994,7416.095385,8250.276304,7657.565079,,,,,,,,,
1,BND,Illinois,40.05,-88.36667,1995,5968.775814,7162.291803,8252.129868,6663.806811,6226.199859,4933.423659,6183.971968,4861.151709,8760.603648,6857.581215,5655.19415,6026.326203
2,BND,Illinois,40.05,-88.36667,1996,,,,1100.308333,3446.747376,3127.360201,4073.380619,6261.03207,6619.884828,6095.886916,,3659.170427
3,BND,Illinois,40.05,-88.36667,1997,5061.243383,3513.422356,4041.113459,2966.039888,2012.236145,,3886.328283,5925.26087,7167.643339,9868.816059,5873.094715,4452.612195
4,BND,Illinois,40.05,-88.36667,1998,4977.414286,2695.786942,1385.488854,2085.668524,3734.474488,7184.377101,6224.556716,5529.677486,7570.577287,6596.904367,5956.719236,3211.30647


In [81]:

# Change col names to month names
mnth_names = {1:'Jan',2:'Feb',3:'Mar',4:'Apr',5:'May',6:'Jun',7:'Jul',8:'Aug',9:'Sep',10:'Oct',11:'Nov', 12:'Dec'}

monthly_avg_AER.rename(columns = mnth_names, inplace = True)
monthly_avg_AER.rename_axis("", axis = 1, inplace = True)

monthly_avg_AER.head()

Unnamed: 0,StationID,Location,latitude,longitude,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
0,BND,Illinois,40.05,-88.36667,1994,7416.095385,8250.276304,7657.565079,,,,,,,,,
1,BND,Illinois,40.05,-88.36667,1995,5968.775814,7162.291803,8252.129868,6663.806811,6226.199859,4933.423659,6183.971968,4861.151709,8760.603648,6857.581215,5655.19415,6026.326203
2,BND,Illinois,40.05,-88.36667,1996,,,,1100.308333,3446.747376,3127.360201,4073.380619,6261.03207,6619.884828,6095.886916,,3659.170427
3,BND,Illinois,40.05,-88.36667,1997,5061.243383,3513.422356,4041.113459,2966.039888,2012.236145,,3886.328283,5925.26087,7167.643339,9868.816059,5873.094715,4452.612195
4,BND,Illinois,40.05,-88.36667,1998,4977.414286,2695.786942,1385.488854,2085.668524,3734.474488,7184.377101,6224.556716,5529.677486,7570.577287,6596.904367,5956.719236,3211.30647


In [82]:
# convert Concentration values to numeric and round off at 2nd decimal place
monthly_avg_AER.iloc[:, 2:].apply(pd.to_numeric)
monthly_avg_AER = monthly_avg_AER.round(decimals = 2)

monthly_avg_AER.head()

Unnamed: 0,StationID,Location,latitude,longitude,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
0,BND,Illinois,40.05,-88.36667,1994,7416.1,8250.28,7657.57,,,,,,,,,
1,BND,Illinois,40.05,-88.36667,1995,5968.78,7162.29,8252.13,6663.81,6226.2,4933.42,6183.97,4861.15,8760.6,6857.58,5655.19,6026.33
2,BND,Illinois,40.05,-88.36667,1996,,,,1100.31,3446.75,3127.36,4073.38,6261.03,6619.88,6095.89,,3659.17
3,BND,Illinois,40.05,-88.36667,1997,5061.24,3513.42,4041.11,2966.04,2012.24,,3886.33,5925.26,7167.64,9868.82,5873.09,4452.61
4,BND,Illinois,40.05,-88.36667,1998,4977.41,2695.79,1385.49,2085.67,3734.47,7184.38,6224.56,5529.68,7570.58,6596.9,5956.72,3211.31


In [83]:
#Obtain row-wise avg to replace NAN
rowAvg = monthly_avg_AER[['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']].mean(axis = 1)
rowAvg

# Replace NAN with row-wise mean
monthly_avg_AER = monthly_avg_AER.apply(lambda r: r.fillna(rowAvg[r.index]))
monthly_avg_AER = monthly_avg_AER.round(decimals = 2)

#Check if NAN exists
print(f"There are {monthly_avg_AER.isnull().values.sum()} null values")

monthly_avg_AER.head()


0


Unnamed: 0,StationID,Location,latitude,longitude,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
0,BND,Illinois,40.05,-88.36667,1994,7416.1,8250.28,7657.57,7774.65,7774.65,7774.65,7774.65,7774.65,7774.65,7774.65,7774.65,7774.65
1,BND,Illinois,40.05,-88.36667,1995,5968.78,7162.29,8252.13,6663.81,6226.2,4933.42,6183.97,4861.15,8760.6,6857.58,5655.19,6026.33
2,BND,Illinois,40.05,-88.36667,1996,4297.97,4297.97,4297.97,1100.31,3446.75,3127.36,4073.38,6261.03,6619.88,6095.89,4297.97,3659.17
3,BND,Illinois,40.05,-88.36667,1997,5061.24,3513.42,4041.11,2966.04,2012.24,4978.89,3886.33,5925.26,7167.64,9868.82,5873.09,4452.61
4,BND,Illinois,40.05,-88.36667,1998,4977.41,2695.79,1385.49,2085.67,3734.47,7184.38,6224.56,5529.68,7570.58,6596.9,5956.72,3211.31


In [87]:
#roll up data to global level by yearly and monthly
global_avg_AER = monthly_avg_AER[['Year','Jan','Feb','Mar','Apr','May','Jun','Jul','Aug',\
                                  'Sep','Oct','Nov','Dec']].groupby('Year').agg(np.mean)

global_avg_AER = global_avg_AER.round(decimals = 2)

global_avg_AER.head(20)

Unnamed: 0_level_0,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1975,795.03,795.03,795.03,451.73,795.03,580.06,1179.1,711.68,818.22,685.29,1304.93,629.23
1976,294.08,603.32,553.45,687.52,923.89,594.8,503.14,309.01,680.04,617.39,453.96,420.8
1977,351.42,360.2,477.35,370.58,249.07,267.82,252.7,343.96,466.99,372.39,359.08,343.99
1978,365.6,304.98,369.18,316.16,311.91,328.1,292.04,471.29,278.78,211.77,289.08,291.86
1979,339.98,454.37,428.58,361.95,272.81,345.56,504.45,358.89,287.38,288.07,276.47,269.1
1980,286.7,265.14,280.23,232.85,193.63,456.52,292.32,318.44,292.24,270.09,237.37,433.11
1981,414.78,394.1,326.46,320.69,342.3,401.25,637.47,532.01,420.42,402.81,363.78,521.2
1982,464.9,409.2,401.59,280.6,244.69,402.03,393.3,363.9,354.81,233.98,243.73,280.15
1983,221.37,267.56,334.12,387.8,386.2,277.29,341.75,395.37,388.78,346.38,232.03,230.69
1984,343.67,821.52,638.17,203.08,294.59,354.69,855.39,1264.49,322.51,273.27,237.65,224.45


In [89]:
#write raw Aerosol data and summary Aerosol data to csv files
monthly_avg_AER.to_csv("..\data\AER_RawData.csv", index = False)  ## Entire Raw DAta with stations

global_avg_AER.to_csv("..\data\AER_GlobalSummary.csv", index = False) ## Grouped by year and reported monthly at global lvl