<a href="https://colab.research.google.com/github/BHill96/AdvancedBigDataAnalytics/blob/master/Final/FinalCode.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Download necessary packages
!pip install transformers
!pip install xelatex
# This takes a couple minutes
!apt-get install texlive texlive-xetex texlive-latex-extra pandoc

In [0]:
# Download Code
! git clone https://github.com/BHill96/AdvancedBigDataAnalytics.git

In [0]:
# Move to Final directory
cd AdvancedBigDataAnalytics/Final

In [0]:
import XLNetFed
from pandas import read_csv
import pandas as pd
import numpy as np
from google.colab import files

In [0]:
%%time
# # # # # # # # # #
# numSims # hours #
#    1    #   1   #
#    10   #   10  #
# # # # # # # # # #
DIR = 'Data/'
results = []
numSims = 1
text = read_csv(DIR+'FedTextData.csv', names=['Date','Text'])
for fileName in ['GDPC1.csv', 'MICH.csv', 'liborfinal.csv', 'UNRATENSA.csv']:
    # Load data and calculate sentiment
    if fileName == 'liborfinal.csv':
        mType = 'Daily'
    else:
        mType = 'Monthly'
    data = XLNetFed.CalcSentiment(text, read_csv(DIR+fileName), metricType=mType)
    for ml, bs in zip([64, 128, 256], [48, 24, 8]):
        # Create masks
        inpt, attMsk = XLNetFed.TextPrep(data, MAX_LEN=ml)
        for epoch in [10, 15, 20]:
            # Print simulation info
            print('File: {0}\nMAX_LEN: {1}\nbatch_size: {2}\nEpochs: {3}'.format(fileName, ml, bs, epoch))
            for i in range(0,numSims):
                print('{0} of {1}:'.format(i, numSims))
                _, _, acc = XLNetFed.Train(inpt, attMsk, list(data.Econ_Perf), batch_size=bs, epochs=epoch)
                results.append((fileName, ml, bs, epoch, acc))
    # Side affect of CalcSentiment
    text = text.drop(labels='index', axis=1)

results = pd.DataFrame(results, columns=['File','MAX_LEN','batch_size','Epochs','Accuracy'])
results = results.groupby(['File','MAX_LEN','batch_size','Epochs']).agg([np.mean, np.std]).reset_index()
results.to_csv(DIR+'SentimentTests.csv')
files.download('Data/SentimentTests.csv') 

In [0]:
import matplotlib.pyplot as plt
"""
import matplotlib as mpl
params = {'pgf.texsystem': 'xelatex'}
mpl.rcParams.update(params)
"""
import matplotlib.ticker as ticker
# mpl_toolkits.axes_grid is a deprecated alias package that newer Matplotlib
# releases no longer ship; the axisartist module provides the same SubplotHost.
# Fall back to the legacy location for old installations.
try:
    from mpl_toolkits.axisartist.parasite_axes import SubplotHost
except ImportError:
    from mpl_toolkits.axes_grid.parasite_axes import SubplotHost
from copy import deepcopy

In [0]:
# Bar labels and heights: one bar per row of the aggregated `results` frame.
x = [str(epochs) for epochs in results.Epochs]
y = results.Accuracy['mean']
# With a single simulation per configuration there is no spread to draw.
yerr = None if numSims == 1 else results.Accuracy['std']

# Group labels for the two extra x-axes drawn under the bars.
FileLabels = np.unique(results.File)
# Read the (MAX_LEN, batch_size) combinations from the rows themselves rather
# than zipping two independently sorted lists, which is only correct when
# batch size happens to decrease as MAX_LEN increases.
mlbsPairs = sorted(set(zip(results.MAX_LEN, results.batch_size)))
MLBSLabels = ['({0}, {1})'.format(ml, bs) for ml, bs in mlbsPairs]
# The same block of pair labels repeats once for every file.
MLBSLabels = MLBSLabels * len(FileLabels)

def _addGroupAxis(host, drop, tickPositions, tickLabels):
    """Attach an extra bottom axis to `host` that only shows group labels.

    The twin axis is shifted `drop` points below the host's bottom axis, its
    top spine is hidden, its major ticks (at 0 and 1) are left unlabeled, and
    `tickLabels` are written at the minor-tick `tickPositions`.
    Returns the twin axes.
    """
    twin = host.twiny()
    makeFixedAxis = twin.get_grid_helper().new_fixed_axis
    twin.axis["bottom"] = makeFixedAxis(loc="bottom", axes=twin, offset=(0, drop))
    twin.axis["top"].set_visible(False)
    twin.set_xticks([0,1])
    twin.xaxis.set_major_formatter(ticker.NullFormatter())
    twin.xaxis.set_minor_locator(ticker.FixedLocator(tickPositions))
    twin.xaxis.set_minor_formatter(ticker.FixedFormatter(tickLabels))
    return twin

fig1 = plt.figure(figsize=(15,15))
ax1 = SubplotHost(fig1, 111)
fig1.add_subplot(ax1)

# Epochs: one bar per parameter set, labeled with its epoch count
barPositions = np.arange(len(x))
ax1.bar(barPositions, y, yerr=yerr)
ax1.set_xticks(results.index)
ax1.set_xticklabels(x)
ax1.yaxis.set_label_text('Accuracy')

# MAX_LEN and Batch size: second label row, 25 points below the bars
ax2 = _addGroupAxis(ax1, -25, np.linspace(0.08, 1, len(MLBSLabels)+1), MLBSLabels)

# Files: third label row, 50 points below the bars
ax3 = _addGroupAxis(ax1, -50, np.linspace(0.16, 1.08, len(FileLabels)+1), FileLabels)

# Horizontal grid lines only, then title, save, show and download
ax1.grid(True, axis='y')
plt.title('Accuracy per File, Max String Length, Batch Size, and Epochs\n (number of tests per parameter set: {0})'.format(numSims))
plt.savefig(fname='ParameterSims.pdf')
# plt.savefig(fname='ParameterSims.pgf')
plt.show()
files.download('ParameterSims.pdf')
# files.download('ParameterSims.pgf') 