In [81]:
import pandas as pd
from pathlib import Path
import seaborn as sns
import tensorflow as tf
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np

%matplotlib inline

In [82]:
fraud_reporting = pd.read_csv(Path("Resources/combined_data.csv"), thousands=',', index_col='Year')
fraud_reporting.drop(columns="Quarter", inplace=True)


In [83]:
fraud_reporting.head()

Unnamed: 0_level_0,Fraud Reporting Count,Net Operating Income(Billions),Lagged Fraud Reporting Count,Lagged Net Operating Income(Billions)
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014,82364,36.8,,
2014,95516,39.8,82364.0,36.8
2014,94084,38.1,95516.0,39.8
2014,90868,36.0,94084.0,38.1
2015,90566,39.0,90868.0,36.0


## Random reproducibility used for protyping to run multiple experiments to evaluate the model. 

In [84]:
from numpy.random import seed

seed(1)
from tensorflow import random

random.set_seed(2)

#### Feature `X` and Target `y` Data

Identified quarterly timeframe through the use of `window_data()` function, to create the features set `X` and the target vector `y`. We defined the window size '4' to represent the quarter timeframe and use the column of the Fraud Reporting Count for feature and target column; to allow the model to predict the Quartertly Number Fraud Instances and Net Income. 

In [85]:
def window_data(df, window, feature_col_number, target_col_number):
   
    X = []
    y = []
    for i in range(len(df) - window):
        features = df.iloc[i : (i + window), feature_col_number]
        target = df.iloc[(i + window), target_col_number]
        X.append(features)
        y.append(target)
    return np.array(X), np.array(y).reshape(-1, 1)

In [86]:
# Define the window size
window_size = 4

# Set the index of the feature and target columns
feature_column = [0, 1]
target_column = 0

# Create the features (X) and target (y) data using the window_data() function.
X, y = window_data(fraud_reporting, window_size, feature_column, target_column)

# Print a few sample values from X and y
print (f"X sample values:\n{X[:3]} \n")
print (f"y sample values:\n{y[:3]}")

X sample values:
[[[8.2364e+04 3.6800e+01]
  [9.5516e+04 3.9800e+01]
  [9.4084e+04 3.8100e+01]
  [9.0868e+04 3.6000e+01]]

 [[9.5516e+04 3.9800e+01]
  [9.4084e+04 3.8100e+01]
  [9.0868e+04 3.6000e+01]
  [9.0566e+04 3.9000e+01]]

 [[9.4084e+04 3.8100e+01]
  [9.0868e+04 3.6000e+01]
  [9.0566e+04 3.9000e+01]
  [9.6365e+04 4.2600e+01]]] 

y sample values:
[[ 90566]
 [ 96365]
 [101745]]


# Split the Data Between Training and Testing Sets 

In [87]:
split = int(0.7 * len(X))

X_train = X[: split]

X_test = X[split:]

y_train = y[: split]
y_test = y[split:]

# Scaled Data with MinMaxScaler 
Used the MinMaxScaler to reshape the data for numpy array and trained and test the X and y target. 

In [88]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

# Scale and reshape for X

num_instances, num_time_steps, num_features = X_train.shape
X_train_reshaped = np.reshape(X_train, newshape=(-1, num_features))
scaler.fit(X_train_reshaped)
X_train_scaled = scaler.transform(X_train_reshaped)
X_train_scaled = np.reshape(X_train_scaled, newshape=(num_instances, num_time_steps, num_features))
X_train_scaled

num_instances, num_time_steps, num_features = X_test.shape
X_test_reshaped = np.reshape(X_test, newshape=(-1, num_features))

X_test_scaled = scaler.transform(X_test_reshaped)
X_test_scaled = np.reshape(X_test_scaled, newshape=(num_instances, num_time_steps, num_features))
X_test_scaled

# Scale and reshape for y 

scaler.fit(y_train)
y_train_scaled = scaler.transform(y_train)

y_test_scaled = scaler.transform(y_test)



## Tensorflow Keras Models for Sequential for LSTEM 


In [90]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

In [119]:
# Define the LSTM RNN model.
model = Sequential()

# Initial model setup
number_units = 6
dropout_fraction = 0.2

# Layer 1
model.add(LSTM(
    units=number_units,
    return_sequences=True,
    input_shape=(X_train_scaled.shape[1], X_train_scaled.shape[2]))
    )
model.add(Dropout(dropout_fraction))

# Layer 2
model.add(LSTM(units=number_units, return_sequences=True))
model.add(Dropout(dropout_fraction))

# Layer 3
model.add(LSTM(units=number_units))
model.add(Dropout(dropout_fraction))

# Output layer
model.add(Dense(1))

# Compling th LSTM Model 
We combined the model using the adam optimizer and the MSE (mean_square_error) as the loss function. The objective is the evaluate the differences between the true and predicated values. 

In [120]:
model.compile(optimizer="adam", loss="mean_squared_error")

In [121]:
model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_25 (LSTM)              (None, 4, 6)              216       
                                                                 
 dropout_24 (Dropout)        (None, 4, 6)              0         
                                                                 
 lstm_26 (LSTM)              (None, 4, 6)              312       
                                                                 
 dropout_25 (Dropout)        (None, 4, 6)              0         
                                                                 
 lstm_27 (LSTM)              (None, 6)                 312       
                                                                 
 dropout_26 (Dropout)        (None, 6)                 0         
                                                                 
 dense_8 (Dense)             (None, 1)               

# Training the Model 
We trained the model used 15 epochs and a batch siez equal to 90. 

In [122]:
model.fit(X_train_scaled, y_train_scaled, epochs=10, shuffle=False, batch_size=19, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x24130a78e08>

In [124]:
model.evaluate(X_test_scaled, y_test_scaled, verbose=0)

1.739909052848816

In [126]:
predicted = model.predict(X_test)



In [128]:
predicted_fraud_reports = scaler.inverse_transform(predicted)
real_fraud_reports = scaler.inverse_transform(y_test.reshape(-1, 1))

In [129]:
pd.DataFrame({
    "Actual": real_fraud_reports.ravel(),
    "Predicted": predicted_fraud_reports.ravel()
}, index = fraud_reporting.index[-len(real_fraud_reports): ]) 

# Show the DataFrame's head
fraud_reporting.head()

Unnamed: 0_level_0,Fraud Reporting Count,Net Operating Income(Billions),Lagged Fraud Reporting Count,Lagged Net Operating Income(Billions)
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014,82364,36.8,,
2014,95516,39.8,82364.0,36.8
2014,94084,38.1,95516.0,39.8
2014,90868,36.0,94084.0,38.1
2015,90566,39.0,90868.0,36.0
