In [2]:
# Display matplotlib figures inline in the notebook output.
%matplotlib inline

import os
from pathlib import Path

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

# Both data folders hang off ``<cwd>/../data``; build that common prefix once.
_DATA_ROOT = Path(os.getcwd()) / os.pardir / "data"

# Raw inputs as delivered, and the cleaned outputs this notebook produces.
DATA_RAW = _DATA_ROOT / "raw" / "anomaly"
DATA_FINAL = _DATA_ROOT / "final" / "anomaly"

# Series1 (Train)

This is all of the consumption data for the meters in the buildings.

 - Remove 'normal_abnormal_tag' column
 - Write the result as a properly comma-separated file

In [3]:
# Load every raw Series_Given1 file (semicolon-separated) into one frame.
#
# glob() yields paths in arbitrary, filesystem-dependent order, so the paths
# are sorted first: this makes the concatenation order, and therefore the
# row labels carried into the final frame, reproducible between runs.
given1_files = sorted((DATA_RAW / 'Series_Given1').glob('*.csv'))

dfs = [
    pd.read_csv(f, parse_dates=['Timestamp'], sep=';')
    for f in given1_files
]

# NOTE(review): concat keeps each file's own row labels, so the index is not
# unique if more than one file is read -- confirm that is acceptable.
given1 = pd.concat(dfs).sort_values(['meter_id', 'Timestamp'])

print(given1.shape)
given1.head()

(43668606, 4)


Unnamed: 0,meter_id,Timestamp,Values,normal_abnormal_tag
2532,2,2015-06-11 00:00:00,2035.0,
2543,2,2015-06-11 00:15:00,2074.0,
2544,2,2015-06-11 00:30:00,2062.0,
2525,2,2015-06-11 00:45:00,2025.0,
2534,2,2015-06-11 01:00:00,2034.0,


In [4]:
# Load the consumption challenge's public training set so that its site ids
# can be compared against the anomaly meters (overlap check).
consumption_train_csv = Path("../data/final/consumption/public/train.csv")
consumption = pd.read_csv(consumption_train_csv, index_col=0)
consumption.head(n=5)

  mask |= (ar1 == a)


Unnamed: 0_level_0,SiteId,Timestamp,Value
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
744519,1,2014-09-03 00:00:00,909655.5
7627564,1,2014-09-04 00:00:00,1748273.0
7034705,1,2014-09-05 00:00:00,
5995486,1,2014-09-06 00:00:00,
7326510,1,2014-09-07 00:00:00,


In [5]:
# Distinct site ids present in the consumption training data.
sites = consumption['SiteId'].unique()

In [6]:
meters = given1.meter_id.unique()

# String meter ids are reduced to the integer before the first underscore;
# non-string ids are compared as they are. Keep those that also occur among
# the consumption SiteIds.
in_both = {
    candidate
    for candidate in (
        int(m.split("_")[0]) if isinstance(m, str) else m
        for m in meters
    )
    if candidate in sites
}

in_both

{2, 38, 234}

In [7]:
# Strip the label column from the training data before it is exported.
# Re-binding the name (rather than inplace=True) together with
# errors='ignore' keeps the cell re-runnable: a second execution no longer
# raises KeyError because the column is already gone.
given1 = given1.drop(['normal_abnormal_tag'], axis=1, errors='ignore')

# Guard: the label column must not be present in what gets written out.
assert 'normal_abnormal_tag' not in given1.columns

In [8]:
# Export the training data with pandas' default (comma) separator.
train_csv = DATA_FINAL / "public" / "train.csv"
given1.to_csv(train_csv)

In [9]:
# Preview the training frame after the label column has been removed.
given1.head(n=5)

Unnamed: 0,meter_id,Timestamp,Values
2532,2,2015-06-11 00:00:00,2035.0
2543,2,2015-06-11 00:15:00,2074.0
2544,2,2015-06-11 00:30:00,2062.0
2525,2,2015-06-11 00:45:00,2025.0
2534,2,2015-06-11 01:00:00,2034.0


# Test Set (Given2, Secret)

 - All of the Given2 data is contained in Secret, so there is no need to use Given2

In [26]:
# Load every raw Series_Secret file (semicolon-separated) into one frame.
#
# glob() yields paths in arbitrary, filesystem-dependent order. The row order
# of this frame later determines which shuffled obs_id each row receives
# (ids are assigned by position), so the paths are sorted to keep the
# concatenation order -- and the placement of any rows tied on the sort
# keys -- reproducible between runs.
secret_files = sorted((DATA_RAW / 'Series_Secret').glob('*.csv'))

dfs = [
    pd.read_csv(f, parse_dates=['Timestamp'], sep=';')
    for f in secret_files
]

secret = pd.concat(dfs).sort_values(['meter_id', 'Timestamp'])

print(secret.shape)
secret.head()

(402971, 6)


Unnamed: 0,meter_id,Timestamp,Values,site_id,normal_abnormal_tag,given
0,234_203,2013-11-03 00:00:00,,203,normal,secret
1,234_203,2013-11-03 00:30:00,,203,normal,secret
2,234_203,2013-11-03 01:00:00,,203,normal,secret
3,234_203,2013-11-03 01:30:00,,203,normal,secret
4,234_203,2013-11-03 02:00:00,,203,normal,secret


In [27]:
# Number of rows per meter in the secret set.
secret['meter_id'].value_counts()

38_9686    254547
334_61      75224
234_203     73200
Name: meter_id, dtype: int64

In [28]:
# Distribution of the raw label values.
secret['normal_abnormal_tag'].value_counts()

normal                                                                                                 369778
day with abnormal consumption                                                                           25283
abnormal consumption regarding the outside temperature                                                   4745
abnormal consumption during unoccupied hours                                                             2255
day with abnormal consumption+abnormal consumption regarding the outside temperature                      672
abnormal consumption during unoccupied hours+abnormal consumption regarding the outside temperature       199
punctual anomaly                                                                                           39
Name: normal_abnormal_tag, dtype: int64

In [29]:
# Does every meter in the secret set also appear in the training data?
train_meter_ids = given1['meter_id'].unique()
secret['meter_id'].isin(train_meter_ids).all()

True

In [30]:
# Count the rows whose reading is missing.
secret['Values'].isnull().sum()

336

In [31]:
# Discard every row that has a missing value in any column.
secret = secret.dropna()

In [32]:
# Replace the index with a seeded random permutation of 0..n-1 so that the
# observation ids are shuffled but reproducible.
rng = np.random.RandomState(900)
idx = rng.permutation(np.arange(len(secret)))

secret.index = pd.Index(idx, name='obs_id')

secret.head(n=5)

Unnamed: 0_level_0,meter_id,Timestamp,Values,site_id,normal_abnormal_tag,given
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
171227,234_203,2013-11-03 19:00:00,11636.0,203,normal,secret
177020,234_203,2013-11-03 19:30:00,11106.0,203,normal,secret
165375,234_203,2013-11-03 20:00:00,10957.0,203,normal,secret
207176,234_203,2013-11-03 20:30:00,11157.0,203,normal,secret
353253,234_203,2013-11-03 21:00:00,12196.0,203,normal,secret


In [33]:
# Boolean target: True for every row whose tag is anything other than 'normal'.
secret['is_abnormal'] = secret['normal_abnormal_tag'].ne('normal')

In [34]:
# Ground truth: identifying columns plus the boolean label.
label_columns = ['meter_id', 'Timestamp', 'is_abnormal']
test = secret.loc[:, label_columns]

# Submission template: same rows and columns, every label set to False.
submission_format = test.assign(is_abnormal=False)

# One-column frame named 'given': True where the row's `given` value is not
# 'secret'.
public_subset = secret['given'].ne('secret').to_frame()

In [35]:
# Preview the ground-truth frame.
test.head(n=5)

Unnamed: 0_level_0,meter_id,Timestamp,is_abnormal
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
171227,234_203,2013-11-03 19:00:00,False
177020,234_203,2013-11-03 19:30:00,False
165375,234_203,2013-11-03 20:00:00,False
207176,234_203,2013-11-03 20:30:00,False
353253,234_203,2013-11-03 21:00:00,False


In [36]:
# Preview the submission template.
submission_format.head(n=5)

Unnamed: 0_level_0,meter_id,Timestamp,is_abnormal
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
171227,234_203,2013-11-03 19:00:00,False
177020,234_203,2013-11-03 19:30:00,False
165375,234_203,2013-11-03 20:00:00,False
207176,234_203,2013-11-03 20:30:00,False
353253,234_203,2013-11-03 21:00:00,False


In [37]:
# Preview the public-subset flags.
public_subset.head(n=5)

Unnamed: 0_level_0,given
obs_id,Unnamed: 1_level_1
171227,False
177020,False
165375,False
207176,False
353253,False


In [38]:
# The submission template goes to the public folder; the labels and the
# public-subset flags go to the private folder.
public_dir = DATA_FINAL / "public"
private_dir = DATA_FINAL / "private"

submission_format.to_csv(public_dir / "submission_format.csv")
test.to_csv(private_dir / "test.csv")
public_subset.to_csv(private_dir / "public_subset.csv")

In [39]:
# Print the HTML markup of the first rows of the submission template.
preview_html = submission_format.head().to_html()
print(preview_html)

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>meter_id</th>
      <th>Timestamp</th>
      <th>is_abnormal</th>
    </tr>
    <tr>
      <th>obs_id</th>
      <th></th>
      <th></th>
      <th></th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>171227</th>
      <td>234_203</td>
      <td>2013-11-03 19:00:00</td>
      <td>False</td>
    </tr>
    <tr>
      <th>177020</th>
      <td>234_203</td>
      <td>2013-11-03 19:30:00</td>
      <td>False</td>
    </tr>
    <tr>
      <th>165375</th>
      <td>234_203</td>
      <td>2013-11-03 20:00:00</td>
      <td>False</td>
    </tr>
    <tr>
      <th>207176</th>
      <td>234_203</td>
      <td>2013-11-03 20:30:00</td>
      <td>False</td>
    </tr>
    <tr>
      <th>353253</th>
      <td>234_203</td>
      <td>2013-11-03 21:00:00</td>
      <td>False</td>
    </tr>
  </tbody>
</table>


In [41]:
# Show the first six lines of the submission format CSV on disk.
!head -n 6 {DATA_FINAL / "public" / "submission_format.csv"}

obs_id,meter_id,Timestamp,is_abnormal
171227,234_203,2013-11-03 19:00:00,False
177020,234_203,2013-11-03 19:30:00,False
165375,234_203,2013-11-03 20:00:00,False
207176,234_203,2013-11-03 20:30:00,False
353253,234_203,2013-11-03 21:00:00,False


In [40]:
# Number of rows per meter in the submission template.
submission_format['meter_id'].value_counts()

38_9686    254547
334_61      75118
234_203     72970
Name: meter_id, dtype: int64

# Metadata

 - Write as comma-separated instead of semicolon-separated

In [17]:
# Read the semicolon-separated meter metadata, indexed by meter_id, and write
# it back out with pandas' default (comma) separator.
meter_meta_csv = DATA_RAW / 'Metadata' / 'meter-meta.csv'
meta_data = pd.read_csv(meter_meta_csv, sep=';', index_col='meter_id')

metadata_out = DATA_FINAL / "public" / "metadata.csv"
meta_data.to_csv(metadata_out)

In [18]:
# Preview the metadata frame.
meta_data.head(n=5)

Unnamed: 0_level_0,site_id,meter_description,units,surface,activity
meter_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
234_203,234_203,virtual main,Wh,5750.0,office
863,234_203,main meter,Wh,5750.0,office
869,234_203,other,Wh,5750.0,office
872,234_203,elevators,Wh,5750.0,office
875,234_203,Lighting,Wh,5750.0,office


In [23]:
# Number of metadata rows per activity value.
meta_data['activity'].value_counts()

office        108
laboratory     34
general        32
restaurant     13
Name: activity, dtype: int64

# Holidays

In [19]:
# Read the semicolon-separated holiday list, name the index 'row_id', and
# write it back out with pandas' default (comma) separator.
holidays_csv = DATA_RAW / 'Holidays' / 'holidays.csv'
holidays = pd.read_csv(holidays_csv, sep=';')

holidays.index = holidays.index.rename('row_id')

holidays.to_csv(DATA_FINAL / "public" / "holidays.csv")

In [20]:
# Preview the holidays frame.
holidays.head(n=5)

Unnamed: 0_level_0,Date,Holiday,site_id
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2016-01-01,New year,38
1,2016-03-28,Easter Monday,38
2,2016-05-01,Labour Day,38
3,2016-05-05,Ascension Thursday,38
4,2016-05-08,Victory in Europe Day,38


# Weather

 - Coalesce into a single data frame

In [21]:
# Load every raw weather file (semicolon-separated) into one frame.
#
# glob() yields paths in arbitrary, filesystem-dependent order, so the paths
# are sorted first: this makes the concatenation order, and therefore the
# row labels carried into the final frame, reproducible between runs.
weather_files = sorted((DATA_RAW / 'Weather').glob('*.csv'))

dfs = [
    pd.read_csv(f, sep=';', parse_dates=['Timestamp'])
    for f in weather_files
]

weather = pd.concat(dfs).sort_values(['site_id', 'Timestamp'])
# NOTE(review): concat keeps each file's own row labels, so 'row_id' is not
# unique if more than one file is read -- confirm that is acceptable.
weather.index.name = 'row_id'

print(weather.shape)

(391628, 4)


In [22]:
# Preview the combined weather frame.
weather.head(n=5)

Unnamed: 0_level_0,Timestamp,Temperature,Distance,site_id
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42628,2012-01-01 01:00:00,3.9,11.902932,38
42629,2012-01-01 02:00:00,4.1,11.902932,38
42630,2012-01-01 03:00:00,4.2,11.902932,38
42631,2012-01-01 04:00:00,4.1,11.902932,38
42632,2012-01-01 05:00:00,4.3,11.902932,38


In [51]:
# Export the combined weather data with pandas' default (comma) separator.
weather_csv = DATA_FINAL / "public" / "weather.csv"
weather.to_csv(weather_csv)

# Json Args

In [52]:
import json

# Keyword arguments saved next to the public CSVs: the Timestamp column is to
# be parsed as dates.
read_kwargs = {"parse_dates": ["Timestamp"]}

with open(DATA_FINAL / "public" / "read_kwargs.json", "w+") as kwargs_file:
    json.dump(read_kwargs, kwargs_file)

# Metric

In [53]:
%load_ext autoreload
import sys

sys.path.append(str(Path(os.getcwd() + "/../metric/").resolve()))

%autoreload 1
%aimport metrics

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [63]:
# The all-False template scored against itself is expected to give 0.0.
template_self_score = metrics.weighted_precision_recall(
    submission_format.values, submission_format.values
)
assert template_self_score == 0.0

# The true labels scored against themselves are expected to give 1.0.
truth_self_score = metrics.weighted_precision_recall(test.values, test.values)
assert truth_self_score == 1.0

In [66]:
# Prediction frame with every true label inverted.
perfectly_wrong = test.assign(is_abnormal=~test.is_abnormal)

# Both the inverted labels and the all-False template are expected to score
# 0.0 against the true labels.
for candidate in (perfectly_wrong, submission_format):
    assert 0.0 == metrics.weighted_precision_recall(test.values, candidate.values)

In [67]:
# Prediction frame that flags every row as abnormal.
all_abnormal = submission_format.assign(is_abnormal=True)

# Scored against itself it is expected to give 1.0.
all_abnormal_self_score = metrics.weighted_precision_recall(
    all_abnormal.values, all_abnormal.values
)
assert all_abnormal_self_score == 1.0

# Score of the all-abnormal prediction against the true labels (displayed).
metrics.weighted_precision_recall(test.values, all_abnormal.values)

0.27290418295710833