# Notebook cell In [178]
import pandas as pd

# Load the merged dataset. The file is ';'-separated; floats are parsed with
# the round-trip converter and the file is read in one pass
# (low_memory=False) so column dtypes are inferred from the whole file.
df = pd.read_csv(
    "data/merged_dataset.csv",
    sep=";",
    float_precision="round_trip",
    low_memory=False,
)

# Columns selected by name.
base_columns = [
    # row identity
    "city",
    "day_datetime",
    "hour_datetime",
    "hour_datetimeEpoch",
    # hourly weather readings
    "hour_temp",
    "hour_precip",
    "hour_windspeed",
    "hour_winddir",
    "hour_cloudcover",
    "hour_conditions",
    # alarm marker (null when there is no alarm in that hour)
    "event_start_hour",
]

# Columns selected by position: indices 60..159 of the input file.
# NOTE(review): this depends on the column order of merged_dataset.csv;
# presumably these are the text-vector features - confirm against the schema.
vector_columns = df.columns[60:160].tolist()
total_columns = [*base_columns, *vector_columns]

# Working frame restricted to the selected columns.
procDF = df.loc[:, total_columns]

# Keep a single row per (city, day, hour); the first occurrence wins.
procDF = procDF.drop_duplicates(
    subset=["city", "day_datetime", "hour_datetime"],
    keep="first",
)

# is_alarm: True exactly where event_start_hour is present, i.e. a boolean
# shorthand for "there is an alarm in this row". The column is placed just
# before the last existing column.
procDF.insert(
    procDF.shape[1] - 1,
    "is_alarm",
    procDF["event_start_hour"].notna(),
)

# Feature 1: global_alarm_count
# For every row, the number of rows sharing the same hour_datetimeEpoch that
# have a non-null event_start_hour, i.e. in how many cities an alarm is
# active at that moment.
event_counts = procDF.groupby("hour_datetimeEpoch")["event_start_hour"].count()
procDF["global_alarm_count"] = procDF["hour_datetimeEpoch"].map(event_counts)

# Feature 2: events_last_24h
# Per city, the number of alarm rows among the 24 most recent rows
# (the current row included).
#
# * The window is positional, so rows are ordered by hour_datetimeEpoch
#   inside a small helper frame first; this is a no-op when the input is
#   already chronological and leaves the row order of procDF untouched.
# * min_periods=1 makes the first 23 rows of each city report the alarms seen
#   so far. With the default min_periods those rows come out as NaN on
#   pandas >= 2.0 and were then silently turned into 0.
# * Summing a 0/1 flag instead of counting event_start_hour keeps the result
#   independent of that column's dtype.
# NOTE(review): assumes one row per city and hour without gaps; with missing
# hours the 24-row window spans more than 24 hours.
alarm_history = pd.DataFrame(
    {
        "city": procDF["city"],
        "hour_datetimeEpoch": procDF["hour_datetimeEpoch"],
        "alarm": procDF["event_start_hour"].notna().astype("int"),
    }
).sort_values("hour_datetimeEpoch", kind="mergesort")  # mergesort is stable

groupedByHrs = (
    alarm_history.groupby("city")["alarm"].rolling(window=24, min_periods=1).sum()
)

# Drop the city level so the result is keyed by the original row labels,
# then align it back onto procDF. Rows that took part in no group (e.g. a
# missing city) get 0.
procDF["events_last_24h"] = (
    groupedByHrs.reset_index(level=0, drop=True)
    .reindex(procDF.index)
    .fillna(0)
    .astype("int")
)

# Replace string columns by integer codes numbered in order of first
# appearance (first distinct value -> 0, next -> 1, ...).
#
# pd.factorize yields exactly that numbering and encodes missing values as
# -1. The previous astype('category') + reorder_categories(col.unique())
# sequence produced the same codes but raised ValueError as soon as a column
# contained a missing value, because NaN shows up in unique() without being
# a category.
categorical_columns = ["city", "day_datetime", "hour_datetime", "hour_conditions"]
for column_name in categorical_columns:
    procDF[column_name] = pd.factorize(procDF[column_name])[0]

# Missing precipitation readings are stored as 0.
procDF["hour_precip"] = procDF["hour_precip"].fillna(0)

# These two columns were only needed to derive the features above; they are
# not part of the exported table.
procDF = procDF.drop(columns=["event_start_hour", "hour_datetimeEpoch"])

# Write the processed table without the index column.
procDF.to_csv("data/data.csv", index=False)
