# Load Libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path
import csv

# For getting trained word embeddings
from gensim.models import FastText

# getting rid of Tensorflow warnings
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# used for deep learning
import tensorflow as tf
import tensorflow.keras.layers as tfl

# Data Preprocessing
### Load Data
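Defines a function that loads the data file and returns `tf.data` datasets holding all tweets of each Twitter user and the user's (latitude, longitude) location, together with a combined dataset that pairs the two. Also defines a helper function that configures a dataset for performance (caching and prefetching).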

In [None]:
# Code for optimizing dataset performance
AUTOTUNE = tf.data.AUTOTUNE


def configure_dataset(dataset):
    """Return `dataset` with caching and prefetching applied.

    The dataset is cached first; prefetching is then applied to the cached
    dataset with the buffer size set to `AUTOTUNE`.
    """
    cached = dataset.cache()
    return cached.prefetch(buffer_size=AUTOTUNE)

# Load file and return datasets for text and location and their combined training dataset
def loadData(data_path):
    """Load a tab-separated user file and build tf.data datasets from it.

    The file is read with explicitly supplied column names
    ('user', 'lat', 'lon', 'text'), with quoting disabled, and with lines
    that pandas cannot parse skipped. Rows missing a latitude, longitude or
    text value are dropped before any tensor is built.

    Args:
        data_path: Location of the tab-separated file, passed to
            `pd.read_csv`.

    Returns:
        A tuple `(text_ds, location_ds, trainingData)` where `text_ds`
        yields one cleaned text string per row, `location_ds` yields the
        matching `(lat, lon)` pair, and `trainingData` zips the two as
        `(text, (lat, lon))`. Each dataset has been passed through
        `configure_dataset`.
    """
    df = pd.read_csv(
        data_path,
        delimiter='\t',
        encoding='utf-8',
        names=['user', 'lat', 'lon', 'text'],
        quoting=csv.QUOTE_NONE,
        on_bad_lines='skip',
    )

    # A line with fewer than four fields is not skipped by pandas; it is kept
    # with NaN in the missing columns. A NaN among the texts makes the column
    # mixed str/float, which does not convert to a string tensor, and a NaN
    # coordinate would turn the loss into NaN, so such rows are removed here.
    df = df.dropna(subset=['lat', 'lon', 'text'])

    # Get latitude, longitude and tweets separately
    lat = df['lat']
    lon = df['lon']
    text = df['text']

    # Strip any leading spaces/pipes, then replace the " ||| " tweet separator
    # with a single space. The separator is matched literally (regex=False),
    # so no backslash escapes are needed in the pattern.
    text = text.str.lstrip(" |")
    text = text.str.replace(" ||| ", " ", regex=False)

    # Convert all columns to tensors
    lat = tf.convert_to_tensor(lat)
    lon = tf.convert_to_tensor(lon)
    text = tf.convert_to_tensor(text)

    # Build the text dataset, the location dataset, and their zipped pairing
    text_ds = tf.data.Dataset.from_tensor_slices(text)
    location_ds = tf.data.Dataset.from_tensor_slices((lat, lon))
    trainingData = tf.data.Dataset.zip((text_ds, location_ds))

    # Configure datasets for performance
    text_ds = configure_dataset(text_ds)
    location_ds = configure_dataset(location_ds)
    trainingData = configure_dataset(trainingData)

    return text_ds, location_ds, trainingData

Tensors are then turned into datasets that can be split into batches.

In [None]:
# TODO: use training set
# Until then the development split is loaded, and the datasets built from it
# are the ones the model is fitted on further down.
data_path = Path('/kaggle/input/geoworld/user_info.dev')

# Get datasets from data file: the texts, the (lat, lon) pairs, and the two zipped together
text_ds, location_ds, trainingData = loadData(data_path)

### Text Vectorization
Now we have to convert each string of text in the data into an array of integers where every integer is an index that belongs to a word. This is done so that we can input data into our model.

In [None]:
# Length of the integer sequence produced for each string of text
MAX_SEQUENCE_LENGTH = 250

# Create TextVectorization layer and build vocabulary from dataset
# This layer takes an input array of strings of shape (batch_size) and outputs arrays of shape (batch_size, MAX_SEQUENCE_LENGTH)
vectorize_layer = tfl.TextVectorization(output_sequence_length=MAX_SEQUENCE_LENGTH)
# The text dataset is fed to adapt() in batches of 128 strings
vectorize_layer.adapt(text_ds.batch(128))

# Save vocabulary and vocabulary size (both are read by get_embeddings and by the Embedding layer)
vocab = vectorize_layer.get_vocabulary()
VOCAB_SIZE = len(vocab)

### Word Embeddings
Next, we convert each integer in our data into a unique word embedding.  
The following function gets the embeddings for all the words in our dataset's vocabulary and returns them as a matrix.

In [None]:
# Gets embeddings for words in the dataset. This assumes that TextVectorization has already built a vocabulary.
def get_embeddings():
    """Build the embedding matrix for the vectorizer's vocabulary.

    Loads the trained FastText model from disk and looks up a normalised
    vector (`norm=True`) for every entry of the module-level `vocab`. Row `i`
    of the result holds the vector for `vocab[i]`; rows for entries the model
    cannot embed are left all-zero. A summary of hits and misses is printed.

    Relies on the module-level `vocab` and `VOCAB_SIZE`, so the
    TextVectorization layer must already have built its vocabulary.

    Returns:
        A NumPy array of shape `(VOCAB_SIZE, 100)`.
    """
    ft_model = FastText.load('/kaggle/input/testing-notebook/embeddings.model')
    trained_embeddings = ft_model.wv

    embedding_dim = 100
    hits = 0
    misses = 0

    # Rows start as zeros so that any entry without a vector keeps an all-zero
    # representation.
    embedding_matrix = np.zeros((VOCAB_SIZE, embedding_dim))
    for i, word in enumerate(vocab):
        # get_vector signals "no vector available" by raising KeyError rather
        # than by returning None, so a miss has to be caught, not tested for.
        try:
            embedding_vector = trained_embeddings.get_vector(word, norm=True)
        except KeyError:
            misses += 1
            continue
        embedding_matrix[i] = embedding_vector
        hits += 1

    # NOTE(review): a FastText model trained with subword buckets can
    # synthesise a vector for tokens it never saw, presumably including the
    # vectorizer's OOV token, so `misses` may legitimately be 0 - confirm
    # whether the special tokens should be forced to zero instead.
    print("Converted %d words (%d misses)" % (hits, misses))
    return embedding_matrix

In [None]:
# The Embedding layer defined below turns an array of integers into an array of embeddings:
# it takes input of shape (batch_size, MAX_SEQUENCE_LENGTH) and outputs shape (batch_size, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)
# EMBEDDING_DIM has to agree with the width of the matrix returned by get_embeddings()
EMBEDDING_DIM = 100
# One row of pretrained embedding per vocabulary entry; used as the Embedding layer's initial weights
embedding_matrix = get_embeddings()

# Create Model

In [None]:
# Embedding layer: initialised from the pretrained embedding matrix and frozen
# (trainable=False), so the word vectors are not updated during training.
embedding_layer = tfl.Embedding(
    input_dim=VOCAB_SIZE,
    output_dim=EMBEDDING_DIM,
    input_length=MAX_SEQUENCE_LENGTH,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False,
)

# Make model
# Each example is a single string, so the input is declared with shape (1,),
# the form used in the TextVectorization documentation. The layer accepts
# rank-1 input or input whose last dimension is 1; a variable-length second
# axis (shape=(None,)) does not satisfy that and can be rejected when the
# layer is called.
inputs = tfl.Input(shape=(1,), dtype=tf.string)
x = vectorize_layer(inputs)
x = embedding_layer(x)
x = tfl.LSTM(1024, use_bias=True)(x)

# Two separate single-unit heads share the LSTM output: one per coordinate.
lat_pred = tfl.Dense(1, name='latitude')(x)
lon_pred = tfl.Dense(1, name='longitude')(x)

model = tf.keras.Model(inputs=inputs, outputs=[lat_pred, lon_pred])
model.summary()
tf.keras.utils.plot_model(model, "multi_input_and_output_model.png", show_shapes=True)

## Compile Model

In [None]:
import tensorflow.keras.backend as K

def tilted_loss(q,y,f):
    """Tilted loss between targets `y` and predictions `f` for quantile `q`.

    With e = y - f, takes the elementwise maximum of q * e and (q - 1) * e
    and averages it over the last axis.
    """
    residual = y - f
    weighted_over = q * residual
    weighted_under = (q - 1) * residual
    return K.mean(K.maximum(weighted_over, weighted_under), axis=-1)

# Learning rate starts at 0.01 and is multiplied by 0.96 every 1000 steps
# (staircase=True makes the decay happen in discrete jumps).
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    0.01, decay_steps=1000, decay_rate=0.96, staircase=True
)
optimizer = tf.keras.optimizers.Adagrad(learning_rate=lr_schedule)

# Quantile used by the tilted loss; 0.5 weights over- and under-prediction equally.
quantile = 0.5
# Both outputs are continuous coordinates, so classification accuracy carries
# no information here; mean absolute error is reported instead.
model.compile(
    loss=lambda y, f: tilted_loss(quantile, y, f),
    optimizer=optimizer,
    metrics=['mae'],
)

# Train Model

In [None]:
# Checkpointing is currently disabled; the callback definition is kept here for reference.
# cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath='cp.ckpt',
#                                                  save_weights_only=False,
#                                                  verbose=1)

# Fit on the combined (text, location) dataset in batches of 512 examples for 4 epochs
train_batches = trainingData.batch(512)
model.fit(train_batches, epochs=4)

In [None]:
# Location of the test split
data_path = Path('/kaggle/input/geoworld/user_info.test')

# Build the datasets from the test file; only the combined (text, location)
# dataset is needed for evaluation in this cell
text_ds_test, location_ds_test, testData = loadData(data_path)

# Evaluate in batches of 512 examples
test_batches = testData.batch(512)
model.evaluate(test_batches)