# Import Data

In [None]:
!pip install python-dotenv eland

In [None]:
import eland as ed
import csv, json, os
import tensorflow as tf
import pandas as pd
import seaborn as sns
import numpy as np
from tensorflow import transpose, linalg, tensordot, dtypes, convert_to_tensor, keras
from tensorflow.keras.datasets import mnist
from dotenv import load_dotenv
import matplotlib.pyplot as plt
from elasticsearch import Elasticsearch, NotFoundError, helpers
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load env variables

In [None]:
# Directory scanned for .csv / .json files, and the Elasticsearch index
# that receives their contents.
DATA_DIR = './data'
EL_INDEX = 'heart'

# Label column, name under which the trained model is saved, and number of
# training epochs.
column_target = 'target'
tf_model_name = 'tfmodel'
epoch_amount = 100

# [EXPERIMENTAL] Columns that are re-encoded as integer category codes.
# Add further column names to this list, e.g. 'example1', 'example2'.
columns_to_convert = ['thal']

# Read variables from a .env file into the process environment.
load_dotenv()

# Connect to ElasticSearch

In [None]:
# Create the Elasticsearch client. The password of the "elastic" user is
# taken from the ELASTIC_PASSWORD environment variable.
es = Elasticsearch(
    hosts="http://elk-and-jupyter-elasticsearch-1:9200",
    basic_auth=("elastic", os.getenv('ELASTIC_PASSWORD')),
)

# Printing the cluster info doubles as a connectivity check.
print(es.info())

# Check that the data is indexed in Elasticsearch

In [None]:
def insert_data_into_el(base_dir='/home/jovyan/'):
    """Index every ``.csv`` and ``.json`` file found in ``DATA_DIR`` into Elasticsearch.

    CSV files are loaded with ``ed.csv_to_eland`` and JSON files with
    ``helpers.bulk``; both write to the ``EL_INDEX`` index through the
    notebook-level ``es`` client. Files with any other extension are ignored.

    Files are addressed by full path, so the kernel's working directory is
    never changed. Relative paths used elsewhere in the notebook therefore
    resolve the same way whether or not this function ran, and an exception
    raised mid-import cannot leave the kernel in a different directory.

    Args:
        base_dir: Directory that a relative ``DATA_DIR`` is resolved against.
            Defaults to ``'/home/jovyan/'``.
    """
    data_dir = os.path.join(base_dir, DATA_DIR)

    # Sorted so that files are processed in the same order on every run.
    for filename in sorted(os.listdir(data_dir)):
        file_path = os.path.join(data_dir, filename)

        if filename.endswith(".csv"):
            # NOTE(review): 'replace' is applied to every CSV file, so with more
            # than one CSV in the directory presumably only the last one
            # processed remains in the index — confirm this is intended.
            ed.csv_to_eland(
                file_path,
                es_client=es,
                es_dest_index=EL_INDEX,
                es_if_exists='replace',
                es_refresh=True
            )
        elif filename.endswith(".json"):
            # Assumes the JSON file holds an iterable of documents/actions that
            # helpers.bulk accepts — TODO confirm against the data files.
            with open(file_path, 'r') as open_file:
                helpers.bulk(es, json.load(open_file), index=EL_INDEX)

    print('Done!')

# Import the data files only when the index is not already present.
if es.indices.exists(index=EL_INDEX):
    print('Data is OK')
else:
    print('Index Not found. Adding to ElasticSearch...')
    insert_data_into_el()

# Prepare dataset

In [None]:
# Open the Elasticsearch index as an eland DataFrame, then convert it to a
# pandas DataFrame for the preprocessing steps below.
df = ed.DataFrame(es_client=es, es_index_pattern=EL_INDEX)
pandas_df = ed.eland_to_pandas(df)

def convert_string_column_to_numbers_tag(column_f, frame=None):
    """Replace a column's values with integer category codes, in place.

    Each distinct value of the column is mapped to an integer code via
    ``pd.Categorical``; missing values receive the code ``-1``.

    Args:
        column_f: Name of the column to encode.
        frame: DataFrame to modify. When omitted, the notebook-level
            ``pandas_df`` is used.
    """
    if frame is None:
        frame = pandas_df
    # Take the codes from the requested column itself. They used to be read
    # from the 'thal' column no matter which column was asked for.
    frame[column_f] = pd.Categorical(frame[column_f]).codes

# Encode every configured column as integer category codes.
for column in columns_to_convert:
    convert_string_column_to_numbers_tag(column)

# Separate the label column from the features (pop removes it from pandas_df).
target = pandas_df.pop(column_target)

# Fixed seed so the train/test partition, and every score computed from it,
# is the same on each run.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    pandas_df.values, target.values, test_size=.2, random_state=SPLIT_SEED
)

dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))

# Show a few (features, target) pairs as a sanity check.
for feat, targ in dataset.take(5):
    print('Features: {}, Target: {}'.format(feat, targ))

# Build and Compile Model

In [None]:
# Shuffle with a buffer sized to the number of rows in pandas_df, then emit
# one example per batch.
train_dataset = dataset.shuffle(buffer_size=len(pandas_df)).batch(batch_size=1)

def get_compiled_model():
  """Create the binary classifier and compile it for training.

  The network is two 10-unit ReLU dense layers followed by a single dense
  unit without activation, so it emits raw logits; the loss is configured
  with ``from_logits=True`` accordingly.

  Returns:
    The compiled ``tf.keras.Sequential`` model (Adam optimizer, binary
    cross-entropy loss, accuracy metric).
  """
  model = tf.keras.Sequential()
  for _ in range(2):
    model.add(tf.keras.layers.Dense(10, activation='relu'))
  model.add(tf.keras.layers.Dense(1))

  # NOTE(review): the outputs are logits while the metric is the generic
  # 'accuracy' string — confirm which decision threshold Keras applies here.
  loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)
  model.compile(optimizer='adam', loss=loss_fn, metrics=['accuracy'])
  return model


model = get_compiled_model()

# Fit, Predict and Evaluate

In [None]:
# Train for the configured number of epochs; the returned object's
# `.history` is plotted at the end of the notebook.
history = model.fit(x=train_dataset, epochs=epoch_amount)

In [None]:
# The model's last layer has no activation and the loss uses from_logits=True,
# so predict() returns logits, not probabilities. A logit of 0 corresponds to
# a probability of 0.5, hence the class boundary is 0. Thresholding the raw
# logit at 0.5 would label samples with probability between 0.5 and ~0.62 as 0.
logits = model.predict(X_test)
y_hat = [0 if logit < 0.0 else 1 for logit in logits.ravel()]

In [None]:
# Fraction of test samples whose predicted class equals the true label.
accuracy_score(y_true=y_test, y_pred=y_hat)

# Saving the model

In [None]:
# Persist the trained model under ./model/<tf_model_name>.
path_to_model = f'./model/{tf_model_name}'
model.save(path_to_model)

# Graph time

In [None]:
# Flatten the training features into a 2D (samples x features) float array.
features = np.reshape(X_train, (X_train.shape[0], -1)).astype(np.float32)
# PCA is defined on mean-centred data; without centring, the leading
# eigenvector mostly tracks the feature means instead of the variance.
features = features - features.mean(axis=0)
x = convert_to_tensor(features, dtype=dtypes.float32)
# Eigen-decomposition of the (features x features) scatter matrix
eigenvalues, eigenvectors = linalg.eigh(tensordot(transpose(x), x, axes=1))
# Project the data onto the eigenvectors
x_pca = tensordot(x, eigenvectors, axes=1)

fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(15,6))

# Per-epoch training metrics as a pandas DataFrame
df_history = pd.DataFrame(history.history)

# left plot: eigh returns eigenvalues in non-decreasing order, so the last two
# columns of the projection are the two leading components. Points are
# coloured by their training label.
scatter = ax[0].scatter(x_pca[:, -1], x_pca[:, -2], s=5, c=y_train)
legend_plt = ax[0].legend(*scatter.legend_elements(),
                         loc="lower left", title="Target")
ax[0].add_artist(legend_plt)
ax[0].grid()
ax[0].set_title('First Two Dimensions of Projected Data After Applying PCA')

# middle plot: training loss per epoch
training_graph = sns.lineplot(data=df_history["loss"], ax=ax[1], color='orange')
training_graph.set_xlabel('Epochs')
training_graph.set_ylabel('Loss')
ax[1].grid()
ax[1].set_title('Loss Overtime')

# right plot: training accuracy per epoch
accuracy_plot = sns.lineplot(data=df_history["accuracy"], ax=ax[2], color='darkcyan')
accuracy_plot.set_xlabel('Epochs')
accuracy_plot.set_ylabel('Accuracy percentage')
ax[2].set_title('Accuracy')
ax[2].grid()
plt.show()
print('Accuracy score:', accuracy_score(y_test, y_hat))