In [224]:
import numpy as np
import numpy.linalg as la
import pandas as pd
from scipy.io.arff import loadarff 
from sklearn.decomposition import PCA
import scipy

In [225]:
# Load the ARFF file. loadarff returns a (records, metadata) pair; only the
# records (element 0) are used to build the frame.
raw_data = loadarff('electricity-normalized.arff')
df = pd.DataFrame(raw_data[0])

# Binary label: 1 where `class` equals the byte string b'UP', otherwise 0.
df['target'] = np.where(df['class'] == b'UP', 1, 0)

# One-hot encode the remaining non-numeric columns after dropping `class`.
# The dtype is pinned to uint8 because pandas >= 2.0 emits bool dummies by
# default; a frame mixing bool and float columns converts to an object array,
# which the numpy.linalg calls applied to slices of `df` later cannot handle.
df = pd.get_dummies(df.drop(columns='class'), dtype=np.uint8)

In [226]:
# Summarise the encoded frame: column names, non-null counts, dtypes and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45312 entries, 0 to 45311
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   date       45312 non-null  float64
 1   period     45312 non-null  float64
 2   nswprice   45312 non-null  float64
 3   nswdemand  45312 non-null  float64
 4   vicprice   45312 non-null  float64
 5   vicdemand  45312 non-null  float64
 6   transfer   45312 non-null  float64
 7   target     45312 non-null  int64  
 8   day_b'1'   45312 non-null  uint8  
 9   day_b'2'   45312 non-null  uint8  
 10  day_b'3'   45312 non-null  uint8  
 11  day_b'4'   45312 non-null  uint8  
 12  day_b'5'   45312 non-null  uint8  
 13  day_b'6'   45312 non-null  uint8  
 14  day_b'7'   45312 non-null  uint8  
dtypes: float64(7), int64(1), uint8(7)
memory usage: 3.1 MB


**Date**: date of the measurement, between 7 May 1996 and 5 December 1998. Here normalized between 0 and 1

**Day**: day of the week (1-7)

**Period**: time of the measurement (1-48) in half hour intervals over 24 hours. Here normalized between 0 and 1

**NSWprice**: New South Wales electricity price, normalized between 0 and 1

**NSWdemand**: New South Wales electricity demand, normalized between 0 and 1

**VICprice**: Victoria electricity price, normalized between 0 and 1

**VICdemand**: Victoria electricity demand, normalized between 0 and 1

**transfer**: scheduled electricity transfer between both states, normalized between 0 and 1

In [227]:
def singular_value_gap(left, right):
    """Return the Euclidean distance between the singular values of two frames.

    Parameters
    ----------
    left, right : array-like, 2-D
        Numeric matrices whose singular-value vectors have the same length
        (here: slices of `df`, which share the same columns).

    Returns
    -------
    float
        ``sqrt(sum((s_left - s_right) ** 2))`` where ``s_*`` are the singular
        values in descending order.
    """
    # compute_uv=False skips U and VT entirely; only the spectrum is needed.
    return la.norm(la.svd(left, compute_uv=False) - la.svd(right, compute_uv=False))


# Early window vs. the very end of the timeline.
v1 = singular_value_gap(df.query('date < 0.45'), df.query('date > 0.98'))
print(v1)

# Early window vs. a wider late window.
v2 = singular_value_gap(df.query('date < 0.45'), df.query('date > 0.90'))
print(v2)

# Two adjacent windows. This is the only split whose factors are kept, because
# the following cells read VT and VT_s.
first = df.query('date < 0.60')
second = df.query('date > 0.60 and date < 0.90')
# full_matrices=False avoids allocating an (n_rows x n_rows) U for each window;
# VT is unaffected as long as a window has at least as many rows as columns.
# U / U_s therefore have one column per singular value.
U, S, VT = la.svd(first, full_matrices=False)
U_s, S_s, VT_s = la.svd(second, full_matrices=False)
v3 = la.norm(S - S_s)
print(v3)

# NOTE(review): `first` / `second` are reset to the 0.45 / 0.90 split for the
# cells below, while VT / VT_s above still describe the 0.60 / 0.60-0.90 split.
# Confirm that mixing the two splits downstream is intended.
first = df.query('date < 0.45')
second = df.query('date > 0.90')

190.37229263333418
97.65797924750096
77.65509729067612


In [228]:
# First row of VT (leading right-singular vector), shown to two decimals.
VT[0].round(decimals=2)

array([-0.24, -0.46, -0.05, -0.37, -0.  , -0.35, -0.38, -0.46, -0.12,
       -0.12, -0.12, -0.12, -0.12, -0.12, -0.11])

In [229]:
# First row of VT_s (leading right-singular vector), shown to two decimals.
np.around(VT_s[0], decimals=2)

array([-0.59, -0.36, -0.04, -0.32, -0.  , -0.31, -0.4 , -0.33, -0.09,
       -0.09, -0.1 , -0.1 , -0.1 , -0.1 , -0.1 ])

In [230]:
# Dot product of the k-th rows of VT_s and VT for the first four components.
# A magnitude near 1 means the two directions nearly coincide; the sign of a
# singular vector is arbitrary, so a negative value can still indicate alignment.
for k in range(4):
    print(np.dot(VT_s[k], VT[k]))

0.9196151271197981
0.924274411102636
-0.832632420087531
0.014504586130451452


In [231]:
# Same check as the previous cell: dot products of the first four matching
# rows of VT_s and VT.
for row_s, row in zip(VT_s[:4], VT[:4]):
    print(np.dot(row_s, row))

0.9196151271197981
0.924274411102636
-0.832632420087531
0.014504586130451452


In [189]:
# Design matrix (every column except `target` and `date`) and labels, both
# taken from the `first` window.
X = first.drop(columns=['target', 'date'])
y = first['target']

# Ordinary least-squares fit of y on X (no separate intercept column).
# lstsq solves the problem directly instead of forming inv(X.T @ X), which is
# less accurate and fails outright when X.T @ X is singular.
coef, *_ = la.lstsq(X.to_numpy(dtype=float), y.to_numpy(dtype=float), rcond=None)

# Keep the coefficients as a Series labelled by feature name so they can be
# addressed with .loc and multiplied against feature matrices.
weights = pd.Series(coef, index=X.columns)

In [190]:
# Evaluation data from the `second` window: labels first, then the feature
# columns (everything except `target` and `date`).
test_y = second.loc[:, 'target']
test_X = second.drop(['target', 'date'], axis=1)

In [191]:
# Label each coefficient with the feature column of X it belongs to, so
# individual weights can be looked up by name (e.g. weights.loc['vicdemand']).
weights.index = X.columns

In [192]:
# Correlation of every column with `target`, computed separately for each
# window and rounded to two decimals, placed side by side for comparison.
corr_compare = pd.DataFrame({
    label: frame.corr()['target'].round(2)
    for label, frame in (('first', first), ('second', second))
})
corr_compare

Unnamed: 0,first,second
date,0.01,-0.05
period,0.17,0.11
nswprice,0.58,0.54
nswdemand,0.34,0.41
vicprice,0.21,0.15
vicdemand,0.15,0.38
transfer,-0.09,-0.17
target,1.0,1.0
day_b'1',0.07,0.01
day_b'2',-0.01,0.14


In [193]:
# Hand-scale the vicdemand coefficient by 1.23 before scoring the second window.
# NOTE: this mutates `weights` in place, so re-running the cell compounds the
# factor; re-run the fitting cell first to start from the fitted coefficients.
weights.loc['vicdemand'] = weights.loc['vicdemand'] * 1.23
# Alternative adjustment, currently disabled:
# weights.loc["day_b'2'"] *= 1.23

# Linear scores for the second window using the adjusted weights.
pred_y = test_X.values @ weights

In [194]:
def _accuracy(scores, labels, threshold=0.5):
    """Return the fraction of rows whose thresholded score equals the label.

    Scores below `threshold` are mapped to class 0, all others to class 1,
    and the result is compared element-wise with `labels`.
    """
    return np.mean(np.where(scores < threshold, 0, 1) == labels)


# These values are the share of CORRECT predictions, so they are reported as
# accuracy (the corresponding error rate would be 1 - accuracy).
print(f"train accuracy: {_accuracy(X.values @ weights, y)}")
print(f"test accuracy: {_accuracy(pred_y, test_y)}")

train error: 0.8180423594615994
test error: 0.8333333333333334


### Using resampling approaches at the data drift point 

1. The case where we are doing the data analysis in real time. Retrain the model with more weight on the most recent data?
2. The case where we have SOME of the test values but not all of them. Once again, we can train on these samples with higher weights.
3. Use the PCA and K-Means approach?


**How does Sklearn implement their sample weighting?**

- I think one approach is to make a new loss function where it is disjoint? 
- Another approach for this project is to make the predicted probabilities more uncertain for drifted data and to compare the log-loss of that model with the log-loss of the regular one. The theory predicts that higher uncertainty should lead to a better loss.

In [82]:
# Reduce the first four columns (selected by position) of each window to a
# single principal component.
# NOTE(review): fit_transform re-fits `pca` on every call, so first_x and
# second_x are coordinates along two separately fitted components rather than
# a shared axis, and `pca` ends up holding the fit on `second`. If the two
# projections are meant to be compared directly, fit once and use
# pca.transform for the second window — confirm the intent.
# NOTE(review): with the column order printed by df.info(), iloc[:, :4] would
# select date, period, nswprice and nswdemand — confirm `date` should be included.
pca = PCA(n_components=1)
first_x = pca.fit_transform(first.iloc[:,:4])
second_x = pca.fit_transform(second.iloc[:,:4])
# second_x_test = pca.fit_transform(second_test.iloc[:,:4])