In [301]:
import pandas as pd
import datetime as dt

In [302]:
# Import weather data file for city of Austin, TX - October 2012 to October 2019
weather = pd.read_csv("weather.csv")

In [303]:
weather.head(1)

Unnamed: 0,dt,dt_iso,city_id,city_name,lat,lon,temp,temp_min,temp_max,pressure,...,rain_today,snow_1h,snow_3h,snow_24h,snow_today,clouds_all,weather_id,weather_main,weather_description,weather_icon
0,1349096400,2012-10-01 13:00:00 +0000 UTC,4671654,,,,288.9,287.15,290.37,1012,...,,,,,,1,741,Fog,fog,50d


In [304]:
# Drop columns that dont have data we want. 
# weather_main and weather_description can be accesses using table_codes dataframe >> weather_id

columns = ['dt', 'city_id', 'weather_icon', 'sea_level', 'grnd_level', 
           'snow_1h', 'snow_3h', 'snow_24h', 'rain_1h', 'rain_3h', 'rain_24h',
          'weather_main', 'weather_description']

weather = weather.drop(columns, axis=1)
weather.head(1)

Unnamed: 0,dt_iso,city_name,lat,lon,temp,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,rain_today,snow_today,clouds_all,weather_id
0,2012-10-01 13:00:00 +0000 UTC,,,,288.9,287.15,290.37,1012,93,2,290,,,1,741


In [305]:
# Rename columns for units clarity

rename = {
        "dt_iso" : "date", 
        "city_name" : "city", 
        "temp" : "temp_K", 
        "temp_min" : "temp_min_K",
        "temp_max" : "temp_max_K", 
        "pressure" : "pres_hPa", 
        "humidity" : "humid_%", 
        "wind_speed" : "wind_mph",
        "rain_today" : "rain_mm", 
        "snow_today" : "snow_mm",
        "clouds_all" : "cloud_%"
}

weather = weather.rename(rename, axis=1)
weather.head(1)

Unnamed: 0,date,city,lat,lon,temp_K,temp_min_K,temp_max_K,pres_hPa,humid_%,wind_mph,wind_deg,rain_mm,snow_mm,cloud_%,weather_id
0,2012-10-01 13:00:00 +0000 UTC,,,,288.9,287.15,290.37,1012,93,2,290,,,1,741


In [306]:
# Format date

weather['date'] = pd.to_datetime(weather['date'], format = '%Y-%m-%d %H:%M:%S +%f %Z').dt.date
weather.head(1)

Unnamed: 0,date,city,lat,lon,temp_K,temp_min_K,temp_max_K,pres_hPa,humid_%,wind_mph,wind_deg,rain_mm,snow_mm,cloud_%,weather_id
0,2012-10-01,,,,288.9,287.15,290.37,1012,93,2,290,,,1,741


In [309]:
# Convert Units >> hPa = 100 Pa, Pa / 101325 = atm
# rain_in
# snow_in

weather['temp_F'] = ((weather['temp_K'] - 273.15) * (9/5)) + 32
weather['temp_min_F'] = ((weather['temp_min_K'] - 273.15) * (9/5)) + 32
weather['temp_max_F'] = ((weather['temp_max_K'] - 273.15) * (9/5)) + 32
weather['pres_atm'] = (weather['pres_hPa'] * 100) / 101325
weather['rain_in'] = weather['rain_mm'] / 25.4
weather['snow_in'] = weather['snow_mm'] / 25.4

weather.head(1)

Unnamed: 0,date,city,lat,lon,temp_K,temp_min_K,temp_max_K,pres_hPa,humid_%,wind_mph,...,rain_mm,snow_mm,cloud_%,weather_id,temp_F,temp_min_F,temp_max_F,pres_atm,rain_in,snow_in
0,2012-10-01,,,,288.9,287.15,290.37,1012,93,2,...,,,1,741,60.35,57.2,62.996,0.998766,,


In [285]:
# Insert Austin data not included in table
weather['city'] = 'Austin'
weather['lat'] = 30.2672
weather['lon'] = 97.7431
weather.head(1)

Unnamed: 0,date,city,lat,lon,temp_K,temp_min,temp_max_K,pres_hPa,humid_%,wind_mph,wind_deg,rain_mm,snow_mm,cloud_%,weather_id
0,2012-10-01,Austin,30.2672,97.7431,288.9,287.15,290.37,1012,93,2,290,,,1,741


In [135]:
# CSV file has weather condition codes that correspond to descriptions. 
# Pull data from website into tables using Pandas.
# Same for units used in CSV.

In [137]:
# Weather Condition Codes

url_codes = 'https://openweathermap.org/weather-conditions'
table_codes = pd.read_html(url_codes)

# First table is icons, removed
table_codes.pop(0)

# Number of tables retrieved from website
len(table_codes)

7

In [138]:
table_codes[0].head(1)

Unnamed: 0,ID,Main,Description,Icon
0,200,Thunderstorm,thunderstorm with light rain,11d


In [139]:
table_codes[1].head(1)

Unnamed: 0,ID,Main,Description,Icon
0,300,Drizzle,light intensity drizzle,09d


In [140]:
# column labels are different than first two tables
table_codes[2].head(1)

Unnamed: 0,0,1,2,3
0,ID,Main,Description,Icon


In [146]:
# Column titles do not match, need to munge data to make it all fit in one table

# columns with titles into one df
df1 = pd.concat([table_codes[0], table_codes[1], table_codes[5], table_codes[6]])

# columns with no titles into one df, and then use labels from df above
df2 = pd.concat([table_codes[2], table_codes[3], table_codes[4]])
df2.columns = ['ID', 'Main', 'Description', 'Icon']

# squish it all together on same column names
table_codes_all = pd.concat([df1, df2])
table_codes_all.sort_values('Icon', ascending=False).head(5)

Unnamed: 0,ID,Main,Description,Icon
0,ID,Main,Description,Icon
0,ID,Main,Description,Icon
0,ID,Main,Description,Icon
10,781,Tornado,tornado,50d
4,731,Dust,sand/ dust whirls,50d


In [230]:
### I cannot figure out how to delete the rows with the headers in them ###

# Need to delete random column titles in rows
table_codes_all[~table_codes_all.Icon.str.startswith('Icon')]
table_codes_all.sort_values('Icon', ascending=False).head(5)

Unnamed: 0,ID,Main,Description,Icon
0,ID,Main,Description,Icon
0,ID,Main,Description,Icon
0,ID,Main,Description,Icon
10,781,Tornado,tornado,50d
4,731,Dust,sand/ dust whirls,50d


In [150]:
# Units Tables

url_units = 'http://openweathermap.org/weather-data'
table_units = pd.read_html(url_units)

In [151]:
# Number of tables retrieved from website
len(table_units)

3

In [229]:
# table_units[0] = Parameters of API respond for current and historical weather
table_units[0].head(5)

Unnamed: 0,Parameter,Description,Standard,Metric,Imperial
0,id,City identification,-,-,-
1,dt,Data receiving time,"unix, UTC","unix, UTC","unix, UTC"
2,name,City name,-,-,-
3,coord,coord,coord,coord,coord
4,lat,"City geo location, latitude",-,-,-
