In [1]:
import numpy as np
import pandas as pd
import datetime
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn import linear_model, preprocessing
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_squared_error

#### Global

##### These variables affect all models

In [2]:
testPercent = 0.1 #10% training
predictorColumn = "balance"

###### Time Series and Regression don't play nicely together so we're going to essentially run two seperate ensembles models then combine them at the very end

In [3]:
#TODO: Get the values for the features from the API
transactions = pd.read_csv("Dummy Data/Transactions_2022Q1.csv")
transactions = transactions.sort_values(by="postDate")
transactions.head()

Unnamed: 0,type,id,status,description,amount,account,balance,direction,class,institution,...,enrich,transactionDate,postDate,subClass,links.self,links.account,links.institution,links.connection,subClass.title,subClass.code
349,transaction,11a6f99b-3448-41ec-8514-436ed7f7d78e,posted,WARMUN ROADHOUSE EAST ARM AU,-114.4,49544383-c19c-4c97-b8d1-5322d0faa6af,-38.1,debit,payment,AU00000,...,,,2022-01-01T00:00:00Z,,https://au-api.basiq.io/users/11103cba-4a08-43...,https://au-api.basiq.io/users/11103cba-4a08-43...,https://au-api.basiq.io/institutions/AU00000,,Supermarket and Grocery Stores,411.0
348,transaction,5a6171f6-6d33-4fb7-87ec-05e58dabc853,posted,COLES EXPRESS 6952 HALLS CREEK AU,-326.01,49544383-c19c-4c97-b8d1-5322d0faa6af,-364.11,debit,payment,AU00000,...,,,2022-01-01T00:00:00Z,,https://au-api.basiq.io/users/11103cba-4a08-43...,https://au-api.basiq.io/users/11103cba-4a08-43...,https://au-api.basiq.io/institutions/AU00000,,Fuel Retailing,400.0
347,transaction,d0d696f2-21b7-4b41-9972-af5b557383e4,posted,TARUNDA SUPERMARKET FITZROY CROS AU,-10.04,49544383-c19c-4c97-b8d1-5322d0faa6af,-374.15,debit,payment,AU00000,...,,,2022-01-01T00:00:00Z,,https://au-api.basiq.io/users/11103cba-4a08-43...,https://au-api.basiq.io/users/11103cba-4a08-43...,https://au-api.basiq.io/institutions/AU00000,,Supermarket and Grocery Stores,411.0
346,transaction,0b713af7-6aa0-4945-ae8f-4218b27f14eb,posted,CABLE BEACH CARAVAN CABLE BEACH AU,-131.0,49544383-c19c-4c97-b8d1-5322d0faa6af,-505.15,debit,payment,AU00000,...,,,2022-01-02T00:00:00Z,,https://au-api.basiq.io/users/11103cba-4a08-43...,https://au-api.basiq.io/users/11103cba-4a08-43...,https://au-api.basiq.io/institutions/AU00000,,Accommodation,440.0
345,transaction,6f8f5796-7121-49c2-8977-ba17b862595c,posted,CAFE DAMORE PTY LTD BROOME AU,-70.0,49544383-c19c-4c97-b8d1-5322d0faa6af,-575.15,debit,payment,AU00000,...,,,2022-01-02T00:00:00Z,,https://au-api.basiq.io/users/11103cba-4a08-43...,https://au-api.basiq.io/users/11103cba-4a08-43...,https://au-api.basiq.io/institutions/AU00000,,"Cafes, Restaurants and Takeaway Food Services",451.0


In [4]:
#Data cleaning

#Drop unnecessary
colsToKeep = ['amount', 'balance', 'postDate', 'subClass.title']
df_timeseries = transactions.filter(colsToKeep)

#Keep the category
df_timeseries = pd.get_dummies(df_timeseries, columns=['subClass.title'])

df_timeseries

Unnamed: 0,amount,balance,postDate,subClass.title_Accommodation,subClass.title_Allied Health Services,subClass.title_Amusement and Other Recreation Activities,subClass.title_Automotive Repair and Maintenance,subClass.title_Bakery Product Manufacturing,"subClass.title_Cafes, Restaurants and Takeaway Food Services","subClass.title_Clothing, Footwear and Personal Accessory Retailing",...,"subClass.title_Pubs, Taverns and Bars",subClass.title_Real Estate Services,subClass.title_Recreational Goods Retailing,subClass.title_Regulatory Services,subClass.title_Road Passenger Transport,subClass.title_Specialised Food Retailing,subClass.title_Supermarket and Grocery Stores,subClass.title_Travel Agency and Tour Arrangement Services,subClass.title_Unknown,"subClass.title_Water Supply, Sewerage and Drainage Services"
349,-114.40,-38.10,2022-01-01T00:00:00Z,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
348,-326.01,-364.11,2022-01-01T00:00:00Z,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
347,-10.04,-374.15,2022-01-01T00:00:00Z,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
346,-131.00,-505.15,2022-01-02T00:00:00Z,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
345,-70.00,-575.15,2022-01-02T00:00:00Z,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55,-20.95,1660.08,2022-03-30T00:00:00Z,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
51,-224.21,812.15,2022-03-31T00:00:00Z,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
50,-15.75,796.40,2022-03-31T00:00:00Z,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49,-17.25,779.15,2022-03-31T00:00:00Z,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
split_idx = int((1-testPercent) * len(df_timeseries))

train_data_timeseries = df_timeseries.iloc[:split_idx]
test_data_timeseries = df_timeseries.iloc[split_idx:]

X_train_timeseries = train_data_timeseries.drop(predictorColumn, axis=1)
X_test_timeseries = test_data_timeseries.drop(predictorColumn, axis=1)
y_train_timeseries = train_data_timeseries[[predictorColumn]]
y_test_timeseries = test_data_timeseries[[predictorColumn]]

In [6]:
#Add new models here, you can seperately find hyperparameters to best optimse them
models_timeseries = [
    SARIMAX(y_train_timeseries, order = (1, 0, 1)),
    SARIMAX(y_train_timeseries, order = (0, 1, 0)),
    SARIMAX(y_train_timeseries, order = (0, 0, 1))
]

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


In [7]:
fits_timeseries = []

for m in models_timeseries:
    fits_timeseries.append(m.fit())

In [8]:
predictions_timeseries = []

for f in fits_timeseries:
    predictions_timeseries.append(f.predict(start=len(y_train_timeseries), end=len(y_train_timeseries)+len(y_test_timeseries)-1))

  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(


In [9]:
mse_timeseries = []

for p in predictions_timeseries:
    mse_timeseries.append(mean_squared_error(y_test_timeseries, p))
    
mse_timeseries

[3464392738.7108736, 3476113595.5576305, 3563891572.821375]

In [10]:
totalMSE_timeseries = sum(mse_timeseries)
weights_timeseries = []

for mse in mse_timeseries:
    weights_timeseries.append((totalMSE_timeseries - mse)/totalMSE_timeseries)
    
weights_timeseries

[0.6701959722629506, 0.669080167535214, 0.6607238602018353]

In [11]:
def finalTimeSeriesModel():
    outcomes = []

    for f in fits_timeseries:
        outcomes.append(f.predict(start=len(y_train),
                       end=len(y_train)+len(y_test)-1))
        
    for i in range(1, len(weights_timeseries)):
        outcomes[i] *= weights_timeseries[i]

    return np.mean(outcomes)

## Regression Section

In [12]:
#Data cleaning
df_reg = transactions

#Keep the category
labelEnc = preprocessing.LabelEncoder()
df_reg['subClass.title'] = labelEnc.fit_transform(df_reg['subClass.title'])

#Drop unnecessary
numeric_columns = df_reg.select_dtypes(include='number').columns
df_reg = df_reg[numeric_columns]

#Remove NaN
df_reg = df_reg.dropna(axis=1)

df_reg

Unnamed: 0,amount,balance,subClass.title
349,-114.40,-38.10,34
348,-326.01,-364.11,11
347,-10.04,-374.15,34
346,-131.00,-505.15,0
345,-70.00,-575.15,5
...,...,...,...
55,-20.95,1660.08,6
51,-224.21,812.15,11
50,-15.75,796.40,27
49,-17.25,779.15,5


In [13]:
split_idx = int((1-testPercent) * len(df_reg))

train_data_reg = df_reg.iloc[:split_idx]
test_data_reg = df_reg.iloc[split_idx:]

X_train_reg = train_data_reg.drop(predictorColumn, axis=1)
X_test_reg = test_data_reg.drop(predictorColumn, axis=1)
y_train_reg = train_data_reg[[predictorColumn]]
y_test_reg = test_data_reg[[predictorColumn]]

In [14]:
#Add new models here, you can seperately find hyperparameters to best optimse them

models_regression = [
    linear_model.LinearRegression(),
    linear_model.LogisticRegression()
]

In [15]:
fits_reg = []
    
for m in models_regression:
    fits_reg.append(m.fit(X_train_reg, y_train_reg))

  y = column_or_1d(y, warn=True)


ValueError: Unknown label type: 'continuous'

In [None]:
predictions_reg = []

for f in fits_reg:
    predictions_reg.append(f.predict(X_train_reg, y_train_reg))

In [None]:
meanSquareErrors = []

for p in predictions:
    meanSquareErrors.append(mean_squared_error(y_test, p))
    
meanSquareErrors

In [None]:
totalMSE = sum(meanSquareErrors)
weights = []

for mse in meanSquareErrors:
    weights.append((totalMSE - mse)/totalMSE)
    
weights

In [None]:
def finalRegressionModel():
    outcomes = []

    for f in fits:
        outcomes.append(f.predict(start=len(y_train),
                       end=len(y_train)+len(y_test)-1))
        
    for i in range(1, len(weights)):
        outcomes[i] *= weights[i]

    return np.mean(outcomes)

### Final Combination

In [None]:
#TODO