# Import packages

In [1]:
import pandas as pd
import numpy as np

from functions import *

# Load historical data

## air quality data

In [15]:
# Load the raw air-quality export for the Hjortnes station.
df_air_quality = pd.read_csv('oslo_air-quality(hjortnes).csv')

# so2 and o3 have no readings for the recent years, and co is nearly constant,
# so none of the three is kept as a feature.
df_air_quality = df_air_quality.drop(columns=[' so2', ' o3', ' co'])

# The CSV headers carry a leading space; strip it so the names can be used
# as feature-group keys.
df_air_quality = df_air_quality.rename(columns={' pm25': 'pm25', ' pm10': 'pm10', ' no2': 'no2'})

# The dates in the CSV are written as YYYY/MM/DD; parse them to datetime64 first.
df_air_quality['date'] = pd.to_datetime(df_air_quality['date'], format='%Y/%m/%d')

# Keep only the period covered by the weather dataset (2020-05-01 .. 2023-01-10).
# The comparison is done on datetime64 values: comparing datetime.date objects
# with pd.Timestamp is deprecated in pandas 1.x and raises in pandas >= 2.0.
period_start = pd.to_datetime('2020-05-01')
period_end = pd.to_datetime('2023-01-10')
in_period = (df_air_quality['date'] >= period_start) & (df_air_quality['date'] <= period_end)

# .copy() makes the filtered frame independent, so the column assignments
# below do not trigger SettingWithCopyWarning.
df_air_quality = df_air_quality.loc[in_period].copy()

# timestamp_2_time still receives datetime.date objects, exactly as before.
df_air_quality['date'] = df_air_quality['date'].dt.date.apply(timestamp_2_time)

# Transform the downloaded string readings to floats; a single blank (' ')
# marks a missing reading. Working on the column itself keeps the row index,
# whereas building a fresh pd.Series (0..n-1 index) and assigning it to the
# filtered frame would align on labels and silently turn unmatched rows into NaN.
for pollutant in ('pm25', 'pm10', 'no2'):
    readings = df_air_quality[pollutant]
    df_air_quality[pollutant] = readings.mask(readings == ' ').astype(float)

# fill in the empty value with median
def impute_nan(df,column,mode):
    """Fill the missing values of ``df[column]`` with ``mode``, in place.

    Args:
        df: DataFrame to modify; the column is overwritten on this object.
        column: Label of the column whose missing entries are filled.
        mode: Replacement value. Despite the name, the calls in this
            notebook pass the column median.

    Returns:
        None. The result is written back into ``df``.
    """
    filled = df[column].fillna(mode)
    df[column] = filled
# Fill the missing readings of every pollutant with that pollutant's median.
pollutants = ['pm25', 'pm10', 'no2']
for pollutant in pollutants:
    impute_nan(df_air_quality, pollutant, df_air_quality[pollutant].median())

# Add the aqi column: for each day, the largest of the three pollutant values.
# The row-wise max is computed in one vectorised call instead of an
# iterrows() loop with per-row .loc writes.
df_air_quality['aqi'] = df_air_quality[pollutants].max(axis=1)

df_air_quality.head()
# quick null check while developing: df_air_quality.isnull().sum()



Unnamed: 0,date,pm25,pm10,no2,aqi
0,1672588800000,32.0,21.0,27.0,32.0
1,1672675200000,55.0,6.0,7.0,55.0
2,1672761600000,15.0,12.0,9.0,15.0
3,1672848000000,28.0,7.0,10.0,28.0
4,1672934400000,24.0,18.0,14.0,24.0


## weather data

In [3]:
# The weather export covers 2020/05/01 to 2023/01/10.
df_weather = pd.read_csv('oslo_weather.csv')
df_weather['datetime'] = df_weather['datetime'].apply(timestamp_2_time)

# Columns that are irrelevant or redundant as model inputs.
unused_columns = [
    'name', 'stations', 'description', 'icon',
    'precipprob', 'preciptype',
    'feelslikemax', 'feelslikemin', 'feelslike',
    'solarradiation', 'solarenergy',
    'sunrise', 'sunset', 'severerisk',
    'sealevelpressure', 'moonphase',
]
df_weather = df_weather.drop(columns=unused_columns)

# Encode the textual 'conditions' value as an integer: each label is replaced
# by its position in this list (0..15). Labels not listed are left untouched.
condition_labels = [
    'Rain',
    'Clear',
    'Snow',
    'Partially cloudy',
    'Overcast',
    'Snow, Partially cloudy',
    'Rain, Partially cloudy',
    'Rain, Overcast',
    'Snow, Overcast',
    'Snow, Freezing Drizzle/Freezing Rain, Overcast',
    'Snow, Rain',
    'Snow, Rain, Freezing Drizzle/Freezing Rain, Ice, Overcast',
    'Snow, Rain, Freezing Drizzle/Freezing Rain, Overcast',
    'Snow, Rain, Ice, Overcast',
    'Snow, Rain, Overcast',
    'Snow, Rain, Partially cloudy',
]
condition_codes = list(range(len(condition_labels)))
df_weather['conditions'] = df_weather['conditions'].replace(condition_labels, condition_codes)

# Use the same key column name as the air-quality frame.
df_weather = df_weather.rename(columns={'datetime': 'date'})

# print(df_weather.isnull().sum())
def impute_nan(df,column,mode):
    """Fill the missing values of ``df[column]`` with ``mode``, in place.

    Same helper as the one defined in the air-quality cell; it is repeated
    here so this cell can run on its own.

    Args:
        df: DataFrame to modify; the column is overwritten on this object.
        column: Label of the column whose missing entries are filled.
        mode: Replacement value. Despite the name, the call below passes
            the column median.

    Returns:
        None. The result is written back into ``df``.
    """
    filled = df[column].fillna(mode)
    df[column] = filled
# Fill the gaps in 'windgust' with the median of the column.
windgust_median = df_weather['windgust'].median()
impute_nan(df_weather, 'windgust', windgust_median)

df_weather.head(3)

Unnamed: 0,date,tempmax,tempmin,temp,dew,humidity,precip,precipcover,snow,snowdepth,windgust,windspeed,winddir,cloudcover,visibility,uvindex,conditions
0,1588262400000,11.3,-0.2,5.0,2.3,84.3,10.851,66.67,1.4,2.8,34.4,25.5,53.4,90.5,29.5,6,14
1,1588348800000,14.1,3.0,8.2,3.1,72.2,0.438,20.83,3.8,2.1,32.0,17.6,228.4,60.4,46.6,9,6
2,1588435200000,12.9,2.9,7.4,0.6,64.2,4.637,33.33,1.7,0.8,32.4,20.7,17.2,36.6,44.2,7,6


# Connect to Hopsworks Feature Store

In [4]:
import hopsworks

# Log in to Hopsworks; the cell output reports the project that was joined.
# NOTE(review): how credentials are supplied (prompt, env var, cached key) is
# not visible in this notebook — confirm before running unattended.
project = hopsworks.login()

# Feature-store handle for the project; the feature-group cells below use it.
fs = project.get_feature_store() 

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/5316
Connected. Call `.close()` to terminate connection gracefully.




# Create feature groups

## air quality data

In [16]:
# Fetch the air-quality feature group, creating it if needed. Each row is
# keyed by 'date', which also serves as the event time.
air_quality_fg = fs.get_or_create_feature_group(
    name="oslo_air_quality_fg",
    version=1,
    description="Air Quality characteristics of each day",
    primary_key=["date"],
    event_time="date",
    online_enabled=True,
)

# Per the recorded cell output, insert() uploads the rows and launches an
# offline backfill job for the feature group.
air_quality_fg.insert(df_air_quality)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/5316/fs/5236/fg/15770


Uploading Dataframe: 0.00% |          | Rows 0/962 | Elapsed Time: 00:00 | Remaining Time: ?

Launching offline feature group backfill job...
Backfill Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/5316/jobs/named/oslo_air_quality_fg_1_offline_fg_backfill/executions


(<hsfs.core.job.Job at 0x19eb9569bb0>, None)

## weather data

In [6]:
# Fetch the weather feature group, creating it if needed. Each row is keyed
# by 'date', which also serves as the event time.
weather_fg = fs.get_or_create_feature_group(
    name="oslo_weather_fg",
    version=1,
    description="Weather characteristics of each day",
    primary_key=["date"],
    event_time="date",
    online_enabled=True,
)

# Per the recorded cell output, insert() uploads the rows and launches an
# offline backfill job for the feature group.
weather_fg.insert(df_weather)

Uploading Dataframe: 0.00% |          | Rows 0/985 | Elapsed Time: 00:00 | Remaining Time: ?

Launching offline feature group backfill job...
Backfill Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/5316/jobs/named/oslo_weather_fg_1_offline_fg_backfill/executions


(<hsfs.core.job.Job at 0x19eb950dfa0>, None)