In [None]:
import pandas as pd
import requests
import json
import os
import datetime

pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)
pd.set_option('display.width', 999)


In [None]:
from dotenv import load_dotenv
load_dotenv()

api_key = os.getenv('WEATHER_API_KEY')
print(api_key)
endpoint = 'https://api.oikolab.com/weather'

We're dropping any data that doesn't have a longitude or latitude.
We also dropped any data with a subjectivity score of 0 because those posts are likely to be less fueled by emotion and mood.

In [None]:
files = os.listdir('data/dutch_tweets/')
df = pd.DataFrame()

for file in files:
    dfTemp = pd.read_json(f'data/dutch_tweets/{file}')
    df = pd.concat([df, dfTemp])
df.dropna(subset=['longitude', 'latitude'], inplace=True)
df['longitude'] = df['longitude'].round(1)
df['latitude'] = df['latitude'].round(1)
df = df[df['subjective_pattern'] != 0]

# Use apply with a lambda function for concatenation
df['lat_long'] = df.apply(lambda row: str(row['latitude']) + ', ' + str(row['longitude']), axis=1)

In [None]:
def unix_ms_to_datetime(date):
    if(type(date) == 'timestamp'):
        return date.tz_localize(None)
    elif isinstance(date, str) and ':' in date: 
        return pd.to_datetime(date).tz_localize(None)
    else:
        return pd.to_datetime(date, unit='ms').tz_localize(None)

df['created_at'] = df['created_at'].apply(unix_ms_to_datetime)

Also, places with the latitude and longitude of 52.5, 5.7 are locations marked as "The Netherlands" this means that these datapoints are useless to us because the location is too general.

In [None]:
amsterdam_lat_long = '52.4, 4.9'
rotterdam_lat_long = '51.9, 4.5'
hague_lat_long = '52.1, 4.3'

amsterdam = df[df['lat_long'] == '52.4, 4.9']
rotterdam = df[df['lat_long'] == '51.9, 4.5']
hague = df[df['lat_long'] == '52.1, 4.3']

We decided to use datapoints from amsterdam, rotterdam, and the hague

Available Data appears to be from Jan 22 2020 to September 25 2020, a range of 248 days

### Gather Weather data for the three cities

In [None]:
startdate = df['created_at'].min()
enddate = df['created_at'].max()
print('Start:', startdate)
print('End:', enddate)
date = datetime.datetime(startdate.year, startdate.month, startdate.day, 4, )
display(date)

def get_morning(day):
    return datetime.datetime(day.year, day.month, day.day, 10, 0, 0)

def get_evening(day):
    return datetime.datetime(day.year, day.month, day.day, 16, 0, 0)

plusOneDay = datetime.timedelta(days=1)

times = []

for i in range(248):
    times.append(get_morning(date))
    times.append(get_evening(date))

    date = date + plusOneDay

# Remove the first morning and the last evening because they are oustide of the range of our data
times.pop(0)
times.pop()

### Testing API Calls

In [None]:
# parameters = surface_solar_radiation relative_humidity and surface_pressure (not sure on humidity)

run = False 
if(run == True):
    plusHour = datetime.timedelta(hours=1)
    r = requests.get(endpoint,
        params={
            'param': 'surface_pressure', 
            'start': startdate, 
            'end': startdate + plusHour, 
            'lat': 52.4, 
            'lon': 4.9,
        },
        headers={'api-key': api_key}
    )

    data = json.loads(r.json()['data'])

    display(data)

    weather = pd.DataFrame([data['data'][0]],
        columns=['coordinates', 'model (name)', 'model elevation', 'utc_offset', 'surface_pressure (Pa)'])
    display(weather)
    display(r)

In [None]:
import json
r = requests.get(endpoint,
    params={
        'param': 'surface_solar_radiation', 
        'start': startdate - plusHour, # Subtract an hour to put this dutch time at UTC
        'end': startdate, # End it an hour later
        'lat': 51.9, 
        'lon': 4.5,
    },
    headers={'api-key': api_key}
)

# (WARNING) Actually Gathering Weather Data. Don't waste api calls

In [9]:
datas = []
amsterdam_lat_long = '52.4, 4.9'
rotterdam_lat_long = '51.9, 4.5'
hague_lat_long = '52.1, 4.3'
plusHour = datetime.timedelta(hours=1)
for time in times:
    print(time)
    r = requests.get(endpoint,
        params={
            'param': 'surface_pressure', 
            'start': startdate - plusHour, # Subtract an hour to put this dutch time at UTC
            'end': startdate, # End it an hour later
            'lat': 51.9, 
            'lon': 4.5,
        },
        headers={'api-key': api_key}
    )
    data = json.loads(r.json()['data'])
    data = data['data'][0]
    data.insert(0, time)
    datas.append(data)



2020-01-22 16:00:00
2020-01-23 10:00:00
2020-01-23 16:00:00
2020-01-24 10:00:00
2020-01-24 16:00:00
2020-01-25 10:00:00
2020-01-25 16:00:00
2020-01-26 10:00:00
2020-01-26 16:00:00
2020-01-27 10:00:00
2020-01-27 16:00:00
2020-01-28 10:00:00
2020-01-28 16:00:00
2020-01-29 10:00:00
2020-01-29 16:00:00
2020-01-30 10:00:00
2020-01-30 16:00:00
2020-01-31 10:00:00
2020-01-31 16:00:00
2020-02-01 10:00:00
2020-02-01 16:00:00
2020-02-02 10:00:00
2020-02-02 16:00:00
2020-02-03 10:00:00
2020-02-03 16:00:00
2020-02-04 10:00:00
2020-02-04 16:00:00
2020-02-05 10:00:00
2020-02-05 16:00:00
2020-02-06 10:00:00
2020-02-06 16:00:00
2020-02-07 10:00:00
2020-02-07 16:00:00
2020-02-08 10:00:00
2020-02-08 16:00:00
2020-02-09 10:00:00
2020-02-09 16:00:00
2020-02-10 10:00:00
2020-02-10 16:00:00
2020-02-11 10:00:00
2020-02-11 16:00:00
2020-02-12 10:00:00
2020-02-12 16:00:00
2020-02-13 10:00:00
2020-02-13 16:00:00
2020-02-14 10:00:00
2020-02-14 16:00:00
2020-02-15 10:00:00
2020-02-15 16:00:00
2020-02-16 10:00:00


In [10]:

display(datas)
weather = pd.DataFrame(datas,
    columns=['date', 'coordinates', 'model (name)', 'model elevation', 'utc_offset', 'surface_pressure (Pa)'])
weather.to_csv('data/weather/rotterdam_pressure.csv')

[[datetime.datetime(2020, 1, 22, 16, 0),
  '(51.9, 4.5)',
  'era5',
  0.93,
  1.0,
  103607.6],
 [datetime.datetime(2020, 1, 23, 10, 0),
  '(51.9, 4.5)',
  'era5',
  0.93,
  1.0,
  103607.6],
 [datetime.datetime(2020, 1, 23, 16, 0),
  '(51.9, 4.5)',
  'era5',
  0.93,
  1.0,
  103607.6],
 [datetime.datetime(2020, 1, 24, 10, 0),
  '(51.9, 4.5)',
  'era5',
  0.93,
  1.0,
  103607.6],
 [datetime.datetime(2020, 1, 24, 16, 0),
  '(51.9, 4.5)',
  'era5',
  0.93,
  1.0,
  103607.6],
 [datetime.datetime(2020, 1, 25, 10, 0),
  '(51.9, 4.5)',
  'era5',
  0.93,
  1.0,
  103607.6],
 [datetime.datetime(2020, 1, 25, 16, 0),
  '(51.9, 4.5)',
  'era5',
  0.93,
  1.0,
  103607.6],
 [datetime.datetime(2020, 1, 26, 10, 0),
  '(51.9, 4.5)',
  'era5',
  0.93,
  1.0,
  103607.6],
 [datetime.datetime(2020, 1, 26, 16, 0),
  '(51.9, 4.5)',
  'era5',
  0.93,
  1.0,
  103607.6],
 [datetime.datetime(2020, 1, 27, 10, 0),
  '(51.9, 4.5)',
  'era5',
  0.93,
  1.0,
  103607.6],
 [datetime.datetime(2020, 1, 27, 16, 0),

In [13]:
datas = []
amsterdam_lat_long = '52.4, 4.9'
rotterdam_lat_long = '51.9, 4.5'
hague_lat_long = '52.1, 4.3'
plusHour = datetime.timedelta(hours=1)
for time in times:
    print(time)
    r = requests.get(endpoint,
        params={
            'param': 'surface_solar_radiation', 
            'start': startdate - plusHour, # Subtract an hour to put this dutch time at UTC
            'end': startdate, # End it an hour later
            'lat': 51.9, 
            'lon': 4.5,
        },
        headers={'api-key': api_key}
    )
    data = json.loads(r.json()['data'])
    print(data)
    data = data['data'][0]
    data.insert(0, time)
    datas.append(data)



2020-01-22 16:00:00
{'columns': ['coordinates (lat,lon)', 'model (name)', 'model elevation (surface)', 'utc_offset (hrs)', 'surface_solar_radiation (W/m^2)'], 'index': [1579708800], 'data': [['(51.9, 4.5)', 'era5', 0.93, 1.0, 12.1]]}
2020-01-23 10:00:00
{'columns': ['coordinates (lat,lon)', 'model (name)', 'model elevation (surface)', 'utc_offset (hrs)', 'surface_solar_radiation (W/m^2)'], 'index': [1579708800], 'data': [['(51.9, 4.5)', 'era5', 0.93, 1.0, 12.1]]}
2020-01-23 16:00:00
{'columns': ['coordinates (lat,lon)', 'model (name)', 'model elevation (surface)', 'utc_offset (hrs)', 'surface_solar_radiation (W/m^2)'], 'index': [1579708800], 'data': [['(51.9, 4.5)', 'era5', 0.93, 1.0, 12.1]]}
2020-01-24 10:00:00
{'columns': ['coordinates (lat,lon)', 'model (name)', 'model elevation (surface)', 'utc_offset (hrs)', 'surface_solar_radiation (W/m^2)'], 'index': [1579708800], 'data': [['(51.9, 4.5)', 'era5', 0.93, 1.0, 12.1]]}
2020-01-24 16:00:00
{'columns': ['coordinates (lat,lon)', 'model

KeyboardInterrupt: 

In [12]:

display(datas)
weather = pd.DataFrame(datas,
    columns=['date', 'coordinates', 'model (name)', 'model elevation', 'utc_offset', 'surface_solar_radiation'])
weather.to_csv('data/weather/rotterdam_radiation.csv')

[[datetime.datetime(2020, 1, 22, 16, 0),
  '(51.9, 4.5)',
  'era5',
  0.93,
  1.0,
  12.1],
 [datetime.datetime(2020, 1, 23, 10, 0),
  '(51.9, 4.5)',
  'era5',
  0.93,
  1.0,
  12.1],
 [datetime.datetime(2020, 1, 23, 16, 0),
  '(51.9, 4.5)',
  'era5',
  0.93,
  1.0,
  12.1],
 [datetime.datetime(2020, 1, 24, 10, 0),
  '(51.9, 4.5)',
  'era5',
  0.93,
  1.0,
  12.1],
 [datetime.datetime(2020, 1, 24, 16, 0),
  '(51.9, 4.5)',
  'era5',
  0.93,
  1.0,
  12.1],
 [datetime.datetime(2020, 1, 25, 10, 0),
  '(51.9, 4.5)',
  'era5',
  0.93,
  1.0,
  12.1],
 [datetime.datetime(2020, 1, 25, 16, 0),
  '(51.9, 4.5)',
  'era5',
  0.93,
  1.0,
  12.1],
 [datetime.datetime(2020, 1, 26, 10, 0),
  '(51.9, 4.5)',
  'era5',
  0.93,
  1.0,
  12.1],
 [datetime.datetime(2020, 1, 26, 16, 0),
  '(51.9, 4.5)',
  'era5',
  0.93,
  1.0,
  12.1],
 [datetime.datetime(2020, 1, 27, 10, 0),
  '(51.9, 4.5)',
  'era5',
  0.93,
  1.0,
  12.1],
 [datetime.datetime(2020, 1, 27, 16, 0),
  '(51.9, 4.5)',
  'era5',
  0.93,
  1.