<a href="https://colab.research.google.com/github/COGS118A/Group012-Sp22/blob/main/COGS_118A_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Relevant Libraries

In [None]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
import string
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Import Dataset

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
ls drive/MyDrive/'Courses UCSD'/'SPRING 2022'/'COGS 118A'/'FinalProject'/'reddit_dataset'

[0m[01;34madhd[0m/  [01;34malcoholism[0m/  [01;34manxiety[0m/  [01;34mdepression[0m/


In [None]:
data_path = 'drive/MyDrive/Courses UCSD/SPRING 2022/COGS 118A/FinalProject/reddit_dataset'

## Datasets to Dataframes

Import all of the data files for each disorder and merge them into one dataframe

ADHD DATA

In [None]:
# Load the four ADHD csv files from drive
adhd_2018_df = pd.read_csv(data_path + '/adhd/adhd_2018.csv')
adhd_2019_df = pd.read_csv(data_path + '/adhd/adhd_2019.csv')
adhd_post_df = pd.read_csv(data_path + '/adhd/adhd_post.csv')
adhd_pre_df  = pd.read_csv(data_path + '/adhd/adhd_pre.csv')

# Join all data into one DataFrame. ignore_index=True gives every row a unique
# label; without it each file keeps its own 0..n-1 index, so a label lookup
# such as `adhd_dataset.loc[:, "post"][1]` matches one row per file.
adhd_dataset = pd.concat(
    [adhd_2018_df, adhd_2019_df, adhd_post_df, adhd_pre_df], ignore_index=True
)

ANXIETY DATA

In [None]:
# Load the four anxiety csv files from drive
anxiety_2018_df = pd.read_csv(data_path + '/anxiety/anxiety_2018.csv')
anxiety_2019_df = pd.read_csv(data_path + '/anxiety/anxiety_2019.csv')
anxiety_post_df = pd.read_csv(data_path + '/anxiety/anxiety_post.csv')
anxiety_pre_df  = pd.read_csv(data_path + '/anxiety/anxiety_pre.csv')

# Join all data into one DataFrame. ignore_index=True gives every row a unique
# label instead of repeating each file's own 0..n-1 index.
anxiety_dataset = pd.concat(
    [anxiety_2018_df, anxiety_2019_df, anxiety_post_df, anxiety_pre_df],
    ignore_index=True,
)

ALCOHOLISM DATA

In [None]:
# Load the four alcoholism csv files from drive
alcoholism_2018_df = pd.read_csv(data_path + '/alcoholism/alcoholism_2018.csv')
alcoholism_2019_df = pd.read_csv(data_path + '/alcoholism/alcoholism_2019.csv')
alcoholism_post_df = pd.read_csv(data_path + '/alcoholism/alcoholism_post.csv')
alcoholism_pre_df  = pd.read_csv(data_path + '/alcoholism/alcoholism_pre.csv')

# Join all data into one DataFrame. ignore_index=True gives every row a unique
# label instead of repeating each file's own 0..n-1 index.
alcoholism_dataset = pd.concat(
    [alcoholism_2018_df, alcoholism_2019_df, alcoholism_post_df, alcoholism_pre_df],
    ignore_index=True,
)

DEPRESSION DATA

In [None]:
# Load the four depression csv files from drive
depression_2018_df = pd.read_csv(data_path + "/depression/depression_2018.csv")
depression_2019_df = pd.read_csv(data_path + "/depression/depression_2019.csv")
depression_post_df = pd.read_csv(data_path + "/depression/depression_post.csv")
depression_pre_df  = pd.read_csv(data_path + "/depression/depression_pre.csv")

# Join all data into one DataFrame. ignore_index=True gives every row a unique
# label instead of repeating each file's own 0..n-1 index.
depression_dataset = pd.concat(
    [depression_2018_df, depression_2019_df, depression_post_df, depression_pre_df],
    ignore_index=True,
)

In [None]:
# Number of posts in each merged dataset
print('adhd', len(adhd_dataset))
print('anxiety', len(anxiety_dataset))
print('alcoholism', len(alcoholism_dataset))  # label previously misspelled 'alcoholim'
print('depression', len(depression_dataset))

adhd 45631
anxiety 57671
alcoholim 5911
depression 117331


### Example of a post from a dataset

Our data consists of reddit posts organized into distinct mental health support groups, no labels.

In [None]:
# Positional lookup of the second post. The merged frame may carry repeated
# index labels (one 0..n-1 run per source file), in which case the label-based
# `[1]` returns several rows instead of a single post.
adhd_post = adhd_dataset["post"].iloc[1]
adhd_post

1    Concerta not working on the first day?! Update...
1    I was doing so well... I was diagnosed back in...
1    The First Step of a multi-step task Perhaps yo...
1    ADHD &amp; Bipolar Anyone else have Bipolar Di...
Name: post, dtype: object

### Data Noise Reduction and Format Simplification

Let's isolate our noisy dataframes into a more neutral list format, with only the reddit posts as each data point.

In [None]:
# Keep only the text of every post, as plain Python lists (one per disorder).
adhd_posts = list(adhd_dataset["post"])
anxiety_posts = list(anxiety_dataset["post"])
alcoholism_posts = list(alcoholism_dataset["post"])
depression_posts = list(depression_dataset["post"])

# Data Cleaning and Preprocessing

In [None]:
#functions to remove stopwords from posts

def remove_stops(text, stops):
  """Strip stopwords, punctuation and digits from a single post.

  Args:
    text (str): Raw post text.
    stops: Container of lowercase stopwords (list, set, ...); membership is
      tested with ``in``.

  Returns:
    str: The remaining words joined by single spaces, with every ASCII
      punctuation character and every digit removed.
  """
  kept = []
  for word in text.split():
    # Compare a normalized form (lowercased, surrounding punctuation removed)
    # so that "I", "NOT" or "it." are recognised as stopwords. Inner
    # apostrophes survive, so contractions such as "don't" still match.
    key = word.lower().strip(string.punctuation)
    if key not in stops:
      kept.append(word)
  final = " ".join(kept)
  final = final.translate(str.maketrans("", "", string.punctuation))
  final = "".join([i for i in final if not i.isdigit()])
  # Removing punctuation/digits can leave empty gaps between words; re-splitting
  # collapses them and also trims leading/trailing spaces.
  return " ".join(final.split())

def clean_docs(docs):
  """Clean every post in ``docs`` with ``remove_stops``.

  Args:
    docs: Iterable of raw post strings.

  Returns:
    list: One cleaned string per input post, in the same order.
  """
  # Build the stopword collection once; a set makes each membership test O(1)
  # instead of scanning the NLTK list for every word of every post.
  stops = set(stopwords.words("english"))
  return [remove_stops(doc, stops) for doc in docs]

Clean all of our datasets

In [None]:
cleaned_adhd_docs = clean_docs(adhd_posts)
cleaned_anxiety_docs = clean_docs(anxiety_posts)
cleaned_alcoholism_docs = clean_docs(alcoholism_posts)
cleaned_depression_docs = clean_docs(depression_posts)

In [None]:
# check for any data loss: each cleaned list should contain as many entries
# as the dataset it was built from

print('adhd', len(cleaned_adhd_docs))
print('anxiety', len(cleaned_anxiety_docs))
print('alcoholism', len(cleaned_alcoholism_docs))
print('depression', len(cleaned_depression_docs))

adhd 45631
anxiety 57671
alcoholism 5911
depression 117331


## Compare Clean vs Unclean Samples

### From our ADHD Dataset

In [None]:
adhd_posts[1]

'Concerta not working on the first day?! Update!: Thank you all for your insightful and kind responses, if not for them I would have chickened out ONCE AGAIN! I took my meds (Concerta 18mg) at 3:15 and I feel absolutely NOTHING! I know some people say they feel nothing and perhaps there’s a slight change or feeling/“buzz” but I feel ABSOLUTELY NOTHING. I feel exactly the same as I did prior to taking it. I actually just took a nap otherwise I would have updated earlier with this post:)\n\nI should mention that prior to taking my meds I took some vitamins including:\nB Complex\nVitamin D3\nZinc\nOmega 3-6\n\nDoes this have any affect on the Concerta? \nAlso is there any point in continuing or should I simply get a higher dose? \n\nAs always, Any advice would be greatly appreciated:) '

In [None]:
cleaned_adhd_docs[1]

'Concerta working first day Update Thank insightful kind responses I would chickened ONCE AGAIN I took meds Concerta mg I feel absolutely NOTHING I know people say feel nothing perhaps there’s slight change feeling“buzz” I feel ABSOLUTELY NOTHING I feel exactly I prior taking it I actually took nap otherwise I would updated earlier post I mention prior taking meds I took vitamins including B Complex Vitamin D Zinc Omega Does affect Concerta Also point continuing I simply get higher dose As always Any advice would greatly appreciated'

### From our Alcoholism Dataset

In [None]:
alcoholism_posts[1]

'It’s 1:30am. So glad to be sober. I was at a New Years party tonight.  All the adults got wasted and had a good time.   I did not get wasted and I had a good time. \n\n\nTomorrow I’ll be feeling good.  Not sure my fellow partiers will.  \n\n\nSo glad to be sober. '

In [None]:
cleaned_alcoholism_docs[1]

'It’s am So glad sober I New Years party tonight All adults got wasted good time I get wasted I good time Tomorrow I’ll feeling good Not sure fellow partiers will So glad sober'

### TF-IDF

In [None]:

from sklearn.feature_extraction import text

custom_stopwords = 'drive/MyDrive/Courses UCSD/SPRING 2022/COGS 118A/FinalProject/stop_words_english.txt'

# Extra stop words (one per line) merged with scikit-learn's built-in English list.
with open(custom_stopwords, 'r') as f:
    more_stop_words = [line.strip() for line in f]
my_stop_words = text.ENGLISH_STOP_WORDS.union(more_stop_words)

# Newer scikit-learn releases validate `stop_words` and accept only 'english',
# a list or None, so the merged frozenset is passed as a list (sorted, so the
# order is the same on every run).
vectorizer = TfidfVectorizer(
                                lowercase=True,
                                max_features=300,
                                stop_words=sorted(my_stop_words))

# Fit on the cleaned ADHD posts; `vectors` is a sparse (posts x terms) matrix.
vectors = vectorizer.fit_transform(cleaned_adhd_docs)
feature_names = vectorizer.get_feature_names_out()
# toarray() returns a regular ndarray (todense() returns the legacy np.matrix type).
dense = vectors.toarray()
denselist = dense.tolist()
adhd_tfidf = pd.DataFrame(denselist, columns=feature_names)

  % sorted(inconsistent)


In [None]:
adhd_tfidf

In [None]:
# For every post keep only the vocabulary terms whose TF-IDF weight is non-zero.
all_keywords = [
    [term for term, weight in zip(feature_names, row) if weight > 0]
    for row in denselist
]

In [None]:
print(all_keywords[1])

In [None]:
## the n features with the highest summed TF-IDF score over all posts

top_n = 300
feature_totals = vectors.sum(0).getA1()
ranked_features = sorted(
    zip(feature_names, feature_totals),
    key=lambda pair: pair[1],
    reverse=True,
)
top_n_features = ranked_features[:top_n]

In [None]:
#top_n_features

In [None]:
# Print the (term, score) pair for 'suicide' if it is among the top features.
for term, score in top_n_features:
  if term == 'suicide':
    print((term, score))

In [None]:
# Seed words = the terms of the top TF-IDF features, best first. They come from
# whatever corpus the vectorizer was fitted on (cleaned_adhd_docs in the TF-IDF
# cell above).
my_seed_words = [term for term, _ in top_n_features]
print(my_seed_words)

todo: go through all studies and create a comprehensive list of all keywords

In [None]:
# TF-IDF seed words from existing study
depression_true_seed_words = ['myself', 'really', 'depression', 'hope', 'life', 'forever', 'pain', 'sad', 'live', 'mood']

In [None]:
# Cosine similarity between the two seed-word lists, treating each list as a
# bag-of-words count vector over the union of both vocabularies.
from collections import Counter

# count word occurrences
our_vals = Counter(my_seed_words)
true_vals = Counter(depression_true_seed_words)

# convert to count vectors that share one word order
words = list(our_vals.keys() | true_vals.keys())
our_vect = [our_vals.get(word, 0) for word in words]
true_vect = [true_vals.get(word, 0) for word in words]

# cosine = dot(a, b) / (|a| * |b|)
len_our = sum(av * av for av in our_vect) ** 0.5
len_true = sum(bv * bv for bv in true_vect) ** 0.5
dot = sum(av * bv for av, bv in zip(our_vect, true_vect))
# An empty word list has zero norm; report 0.0 instead of raising ZeroDivisionError.
cosine = dot / (len_our * len_true) if len_our and len_true else 0.0


In [None]:
print(cosine)

In [None]:
depression_lexicon = ['depressed', 'tired', 'anxious', 'sleep', 'insomnia', 'sad', 'meaningless','goodbye', 'pointless', 'angry', 'suicidal']

# Self-classification with Keras (Masked Language Modeling)

The following code creates labels automatically using Masked Language Modeling, with one Neural Network layer for our dataset, since reddit posts unfortunately do not come prelabeled. 

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
from dataclasses import dataclass
import re
from pprint import pprint

This is a test dummy dataset only, labels are fake

In [None]:
# make the adhd features into a tensor
adhd_features = tf.constant(cleaned_adhd_docs)

# make the adhd labels into a tensor
# these are temporary labels just to make a temporary dataset
# (drawn at random: 0 with probability 1/3, 1 with probability 2/3; no seed is
# set here, so the labels differ from run to run)
adhd_labels = tf.constant(np.random.choice([0, 1], size=(len(cleaned_adhd_docs),), p=[1./3, 2./3]))

# initialize a tensorflow dataset for text features and labels
# we will use this dataset to extract a lexicon out of all data samples
# so that we can train a neural network with it
# NOTE: this rebinds `adhd_dataset`, which until this cell referred to the
# pandas DataFrame of raw ADHD posts; cells that need the DataFrame must run
# before this one.
adhd_dataset = tf.data.Dataset.from_tensor_slices((adhd_features, adhd_labels))

In [None]:
## this just displays the first couple data points and their classification label

for text_batch, label_batch in adhd_dataset.take(3):
        print(text_batch.numpy())
        print(label_batch.numpy())

b'LethargicDepressed meds First Ill give background medical history I struggled depression years nowIm currently would bouts crying absolute consumption life every week so About months ago I diagnosed ADD started taking mg xr adderall morning noon About montg starting I noticed I longer bouts depression overall happier My mom super anti meds blames everything bad adderall take away randomly Every time I feel withdrawals bouts depression come back I feel like I want lie die Does anyone else feel way me'
1
b'Concerta working first day Update Thank insightful kind responses I would chickened ONCE AGAIN I took meds Concerta mg I feel absolutely NOTHING I know people say feel nothing perhaps there\xe2\x80\x99s slight change feeling\xe2\x80\x9cbuzz\xe2\x80\x9d I feel ABSOLUTELY NOTHING I feel exactly I prior taking it I actually took nap otherwise I would updated earlier post I mention prior taking meds I took vitamins including B Complex Vitamin D Zinc Omega Does affect Concerta Also point 

### Dataset Prep: Vocabulary and Mask Layer

In [None]:
@dataclass
class Config:
    """Hyperparameters shared by the vectorization layers and the BERT-style model.

    The attributes are annotated so that ``@dataclass`` registers them as
    fields: the defaults are unchanged, and individual values can be
    overridden at construction time, e.g. ``Config(MAX_LEN=128)``.
    """

    MAX_LEN: int = 256  # sequence length of the vectorized posts / model input
    BATCH_SIZE: int = 32
    LR: float = 0.001  # Adam learning rate
    VOCAB_SIZE: int = 30000
    EMBED_DIM: int = 128
    NUM_HEAD: int = 8  # used in bert model
    FF_DIM: int = 128  # used in bert model
    NUM_LAYERS: int = 1  # number of encoder blocks


config = Config()

In [None]:
## data cleaning from capitalization and symbols

def custom_standardization(input_data):
    """Lowercase the text, replace "<br />" with a space and strip punctuation.

    Square brackets are deliberately kept. ``string.punctuation`` contains "["
    and "]", and removing them would turn the special "[mask]" token into the
    ordinary word "mask", so it could no longer be looked up as its own
    vocabulary entry.

    Args:
        input_data: String tensor of raw texts.

    Returns:
        String tensor with the standardized texts.
    """
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, "<br />", " ")
    strippable = string.punctuation.replace("[", "").replace("]", "")
    return tf.strings.regex_replace(
        stripped_html, f"[{re.escape(strippable)}]", ""
    )

In [None]:
def get_vectorize_layer(texts, vocab_size, max_seq, special_tokens=["[MASK]"]):
    """Build Text vectorization layer

    Args:
      texts (list): List of string i.e input texts
      vocab_size (int): vocab size
      max_seq (int): Maximum sequence length.
      special_tokens (list, optional): List of special tokens appended to the
        end of the vocabulary (stored lowercased). Defaults to ['[MASK]'].

    Returns:
        layers.Layer: Return TextVectorization Keras Layer
    """

    # initialize vocabulary layer, creates a lexicon to adapt our model with
    vectorize_layer = TextVectorization(
        max_tokens=vocab_size,
        output_mode="int",
        standardize=custom_standardization,
        output_sequence_length=max_seq,
    )

    # use the entire dataset (no labels) and create a useful lexicon out of it:
    vectorize_layer.adapt(texts)

    # custom_standardization lowercases every input, so a special token can only
    # be matched if its vocabulary entry is lowercase as well.
    extra_tokens = [token.lower() for token in special_tokens]

    # Insert the special tokens in the vocabulary: skip the first two entries
    # returned by get_vocabulary() and leave room at the end for the special
    # tokens, which were previously ignored in favour of a hard-coded "[mask]".
    vocab = vectorize_layer.get_vocabulary()
    vocab = vocab[2 : vocab_size - len(extra_tokens)] + extra_tokens
    vectorize_layer.set_vocabulary(vocab)
    return vectorize_layer

## Text Vectorization with Vocabulary Layer

- The layer builds a vocabulary of all string tokens seen in the dataset, sorted by occurrence count, with ties broken by the sort order of the tokens (high to low).
- It keeps the most frequent tokens occurring in the input dataset.
- We use this vocabulary ('vocab') to encode the text that our model is trained on.

In [None]:
'''ADHD'''
# run our cleaned ADHD data through the vocab layer
adhd_vectorize_layer = get_vectorize_layer(
    cleaned_adhd_docs,
    config.VOCAB_SIZE,
    config.MAX_LEN,
    special_tokens=["[mask]"],
)

# Get mask token id for masked language model
adhd_mask_token_id = adhd_vectorize_layer(["[mask]"]).numpy()[0][0]

In [None]:
'''ANXIETY'''
# run our cleaned anxiety data through the vocab layer
anxiety_vectorize_layer = get_vectorize_layer(
    cleaned_anxiety_docs,
    config.VOCAB_SIZE,
    config.MAX_LEN,
    special_tokens=["[mask]"],
)

# Get mask token id for masked language model
anxiety_mask_token_id = anxiety_vectorize_layer(["[mask]"]).numpy()[0][0]

In [None]:
'''ALCOHOLISM'''
# run our cleaned alcoholism data through the vocab layer
alcoholism_vectorize_layer = get_vectorize_layer(
    cleaned_alcoholism_docs,
    config.VOCAB_SIZE,
    config.MAX_LEN,
    special_tokens=["[mask]"],
)

# Get mask token id for masked language model
alcoholism_mask_token_id = alcoholism_vectorize_layer(["[mask]"]).numpy()[0][0]

In [None]:
# NOT ENOUGH RAM ON COLAB

# '''DEPRESSION'''
# # run our cleaned ADHD data through the vocab layer
# depression_vectorize_layer = get_vectorize_layer(
#     cleaned_depression_docs,
#     config.VOCAB_SIZE,
#     config.MAX_LEN,
#     special_tokens=["[mask]"],
# )

# # Get mask token id for masked language model
# depression_mask_token_id = depression_vectorize_layer(["[mask]"]).numpy()[0][0]

Encoding and Self-Classification with Masked Language Modeling

Code sample from Keras Official Documentation: https://keras.io/examples/nlp/masked_language_modeling/

In [None]:
def get_masked_input_and_labels(encoded_texts, mask_token_id):
    """Create the automatic (self-supervised) labels for a masked language model.

    Args:
        encoded_texts: Integer array of token ids, as produced by the
            vectorization/vocab layer created previously.
        mask_token_id: Id of the mask token.

    Returns:
        tuple: ``(masked_inputs, y_labels, sample_weights)``, all with the shape
        of ``encoded_texts``. ``masked_inputs`` is a copy of the input with some
        positions replaced, ``y_labels`` is an unmodified copy of the input, and
        ``sample_weights`` is 1.0 at the positions selected for prediction and
        0.0 everywhere else.
    """
    shape = encoded_texts.shape

    # Select about 15% of the positions (BERT-style); ids 0, 1 and 2 are never selected.
    selected = np.random.rand(*shape) < 0.15
    selected[encoded_texts <= 2] = False

    # -1 means "ignore"; selected positions carry their original token id.
    targets = np.full(shape, -1, dtype=int)
    targets[selected] = encoded_texts[selected]

    # About 90% of the selected positions are overwritten with the mask id,
    # the remaining ~10% keep their original token.
    masked_inputs = np.copy(encoded_texts)
    to_mask = selected & (np.random.rand(*shape) < 0.90)
    masked_inputs[to_mask] = mask_token_id

    # Roughly one in nine of the masked positions gets a random token id from
    # [3, mask_token_id) instead of the mask id.
    to_random = to_mask & (np.random.rand(*shape) < 1 / 9)
    masked_inputs[to_random] = np.random.randint(3, mask_token_id, to_random.sum())

    # Weights for .fit(): only the selected positions count.
    sample_weights = np.ones(targets.shape)
    sample_weights[targets == -1] = 0

    # The targets handed to the model are the original, unmasked token ids.
    return masked_inputs, np.copy(encoded_texts), sample_weights

### CREATE AUTOMATIC LABELS

ADHD

In [None]:
# Prepare data for masked language model for the unlabeled ADHD dataset
x_all_adhd = adhd_vectorize_layer(cleaned_adhd_docs).numpy()
x_masked_adhd_train, y_masked_adhd_labels, adhd_sample_weights = get_masked_input_and_labels(
    x_all_adhd, adhd_mask_token_id)

# formulate our new self-labeled dataset
mlm_adhd_ds = tf.data.Dataset.from_tensor_slices(
    (x_masked_adhd_train, y_masked_adhd_labels, adhd_sample_weights))
mlm_adhd_ds = mlm_adhd_ds.shuffle(1000).batch(config.BATCH_SIZE)

ANXIETY

In [None]:
# Prepare data for masked language model for the unlabeled anxiety dataset
x_all_anxiety = anxiety_vectorize_layer(cleaned_anxiety_docs).numpy()
x_masked_anxiety_train, y_masked_anxiety_labels, anxiety_sample_weights = get_masked_input_and_labels(
    x_all_anxiety, anxiety_mask_token_id)

# formulate our new self-labeled dataset
mlm_anxiety_ds = tf.data.Dataset.from_tensor_slices(
    (x_masked_anxiety_train, y_masked_anxiety_labels, anxiety_sample_weights))
mlm_anxiety_ds = mlm_anxiety_ds.shuffle(1000).batch(config.BATCH_SIZE)

ALCOHOLISM

In [None]:
# Prepare data for masked language model for the unlabeled alcoholism dataset
x_all_alcoholism = alcoholism_vectorize_layer(cleaned_alcoholism_docs).numpy()
x_masked_alcoholism_train, y_masked_alcoholism_labels, alcoholism_sample_weights = get_masked_input_and_labels(
    x_all_alcoholism, alcoholism_mask_token_id)

# formulate our new self-labeled dataset
mlm_alcoholism_ds = tf.data.Dataset.from_tensor_slices(
    (x_masked_alcoholism_train, y_masked_alcoholism_labels, alcoholism_sample_weights))
mlm_alcoholism_ds = mlm_alcoholism_ds.shuffle(1000).batch(config.BATCH_SIZE)

DEPRESSION

NOTE: colab crashes for this big dataset due to lack of ram

TODO: try to run this notebook on high ram datahub

In [None]:
# # Prepare data for masked language model for the unlabeled depression dataset
# x_all_depression = depression_vectorize_layer(cleaned_depression_docs).numpy()
# x_masked_depression_train, y_masked_depression_labels, depression_sample_weights = get_masked_input_and_labels(
#     x_all_depression, depression_mask_token_id)

# # formulate our new self-labeled dataset
# mlm_depression_ds = tf.data.Dataset.from_tensor_slices(
#     (x_masked_depression_train, y_masked_depression_labels, depression_sample_weights))
# mlm_depression_ds = mlm_depression_ds.shuffle(1000).batch(config.BATCH_SIZE)

We now have a labeled ADHD dataset

In [None]:
# length of our data
print('adhd', len(mlm_adhd_ds))
print('anxiety', len(mlm_anxiety_ds))
print('alcoholism', len(mlm_alcoholism_ds))

adhd 1426
anxiety 1803
alcoholism 185


## Create BERT model (Pretraining Model) for masked language modeling

It will take token ids as inputs (including masked tokens) and it will predict the correct ids for the masked input tokens.

Code sample from Keras Official Documentation: https://keras.io/examples/nlp/masked_language_modeling/

In [None]:
## please note this bert module is from Keras Documentation
## it is included here because tensorflow or keras do not have a set of
## functions we can just use for this, we have to include them here

def bert_module(query, key, value, i):
    """One encoder block: multi-head attention followed by a feed-forward net.

    Each of the two sub-layers is followed by dropout (rate 0.1), a residual
    addition and layer normalization.

    Args:
        query, key, value: Tensors passed to the attention layer.
        i: Index of this block; only used to build the layer names
            ("encoder_{i}/...").

    Returns:
        The output tensor of the second layer normalization.
    """
    # Multi headed self-attention
    attention_output = layers.MultiHeadAttention(
        num_heads=config.NUM_HEAD,
        key_dim=config.EMBED_DIM // config.NUM_HEAD,
        name="encoder_{}/multiheadattention".format(i),
    )(query, key, value)
    attention_output = layers.Dropout(0.1, name="encoder_{}/att_dropout".format(i))(
        attention_output
    )
    # Residual connection around the attention sub-layer
    attention_output = layers.LayerNormalization(
        epsilon=1e-6, name="encoder_{}/att_layernormalization".format(i)
    )(query + attention_output)

    # Feed-forward layer
    ffn = keras.Sequential(
        [
            layers.Dense(config.FF_DIM, activation="relu"),
            layers.Dense(config.EMBED_DIM),
        ],
        name="encoder_{}/ffn".format(i),
    )
    ffn_output = ffn(attention_output)
    ffn_output = layers.Dropout(0.1, name="encoder_{}/ffn_dropout".format(i))(
        ffn_output
    )
    # Residual connection around the feed-forward sub-layer
    sequence_output = layers.LayerNormalization(
        epsilon=1e-6, name="encoder_{}/ffn_layernormalization".format(i)
    )(attention_output + ffn_output)
    return sequence_output


def get_pos_encoding_matrix(max_len, d_emb):
    """Build the sinusoidal position table of shape (max_len, d_emb).

    Row 0 is all zeros. For every later position, even columns hold
    sin(pos / 10000^(2i/d_emb)) and odd columns hold cos of the same angle,
    where i is the column index divided by two (integer division).
    """
    # One divisor per column; columns 2i and 2i+1 share the same value.
    divisors = [np.power(10000, 2 * (col // 2) / d_emb) for col in range(d_emb)]

    rows = []
    for pos in range(max_len):
        if pos == 0:
            rows.append(np.zeros(d_emb))
        else:
            rows.append([pos / divisor for divisor in divisors])
    table = np.array(rows)

    table[1:, 0::2] = np.sin(table[1:, 0::2])  # dim 2i
    table[1:, 1::2] = np.cos(table[1:, 1::2])  # dim 2i+1
    return table


# Loss with reduction disabled; the sample weights are passed in at call time
# inside MaskedLanguageModel.train_step.
loss_fn = keras.losses.SparseCategoricalCrossentropy(
    reduction=tf.keras.losses.Reduction.NONE
)
# Running mean of the loss; updated and reported by train_step.
loss_tracker = tf.keras.metrics.Mean(name="loss")


class MaskedLanguageModel(tf.keras.Model):
    """Keras model with a custom training step built on the module-level
    ``loss_fn`` and ``loss_tracker``."""

    def train_step(self, inputs):
        """Run one optimization step on a batch.

        ``inputs`` is either ``(features, labels, sample_weight)`` or
        ``(features, labels)``. Returns ``{"loss": <current mean loss>}``.
        """
        if len(inputs) == 3:
            features, labels, sample_weight = inputs
        else:
            features, labels = inputs
            sample_weight = None

        with tf.GradientTape() as tape:
            predictions = self(features, training=True)
            loss = loss_fn(labels, predictions, sample_weight=sample_weight)

        # Compute gradients
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # Compute our own metrics
        loss_tracker.update_state(loss, sample_weight=sample_weight)

        # Return a dict mapping metric names to current value
        return {"loss": loss_tracker.result()}

    @property
    def metrics(self):
        # We list our `Metric` objects here so that `reset_states()` can be
        # called automatically at the start of each epoch
        # or at the start of `evaluate()`.
        # If you don't implement this property, you have to call
        # `reset_states()` yourself at the time of your choosing.
        return [loss_tracker]


def create_masked_language_bert_model():
    """Build and compile the masked language model described by ``config``.

    The network takes ``config.MAX_LEN`` int64 token ids, adds word and
    position embeddings, applies ``config.NUM_LAYERS`` encoder blocks
    (``bert_module``) and ends in a softmax over ``config.VOCAB_SIZE`` tokens
    for every position.

    Returns:
        MaskedLanguageModel: The model, compiled with an Adam optimizer.
    """
    inputs = layers.Input((config.MAX_LEN,), dtype=tf.int64)

    word_embeddings = layers.Embedding(
        config.VOCAB_SIZE, config.EMBED_DIM, name="word_embedding"
    )(inputs)
    # Position embedding initialized from the sinusoidal table
    position_embeddings = layers.Embedding(
        input_dim=config.MAX_LEN,
        output_dim=config.EMBED_DIM,
        weights=[get_pos_encoding_matrix(config.MAX_LEN, config.EMBED_DIM)],
        name="position_embedding",
    )(tf.range(start=0, limit=config.MAX_LEN, delta=1))
    embeddings = word_embeddings + position_embeddings

    # Stack the encoder blocks; each one attends over its own input
    encoder_output = embeddings
    for i in range(config.NUM_LAYERS):
        encoder_output = bert_module(encoder_output, encoder_output, encoder_output, i)

    mlm_output = layers.Dense(config.VOCAB_SIZE, name="mlm_cls", activation="softmax")(
        encoder_output
    )
    mlm_model = MaskedLanguageModel(inputs, mlm_output, name="masked_bert_model")

    # No loss is passed to compile(): MaskedLanguageModel.train_step computes it.
    optimizer = keras.optimizers.Adam(learning_rate=config.LR)
    mlm_model.compile(optimizer=optimizer)
    return mlm_model


# Module-level lookup tables read by MaskedTextGenerator below. They are built
# from the alcoholism vocabulary, so decoding is only meaningful for token ids
# produced by alcoholism_vectorize_layer.
id2token = dict(enumerate(alcoholism_vectorize_layer.get_vocabulary()))
token2id = {y: x for x, y in id2token.items()}
mask_token_id = alcoholism_mask_token_id

# anxiety_id2token = dict(enumerate(anxiety_vectorize_layer.get_vocabulary()))
# anxiety_token2id = {y: x for x, y in anxiety_id2token.items()}

# alcoholism_id2token = dict(enumerate(alcoholism_vectorize_layer.get_vocabulary()))
# alcoholism_token2id = {y: x for x, y in alcoholism_id2token.items()}

# depression_id2token = dict(enumerate(depression_vectorize_layer.get_vocabulary()))
# depression_token2id = {y: x for x, y in depression_id2token.items()}



## optional text generator
class MaskedTextGenerator(keras.callbacks.Callback):
    """Callback that prints the model's top-k guesses for the masked token
    of a sample text at the end of every epoch.

    Args:
        sample_tokens: Vectorized sample text containing the mask token id,
            indexed as [row, position]; only the first row is decoded.
        top_k (int): Number of candidate tokens to print.
        token_lookup (dict, optional): id -> token mapping used for decoding.
            Defaults to the module-level ``id2token``.
        mask_id (int, optional): Id of the mask token. Defaults to the
            module-level ``mask_token_id``.
    """

    def __init__(self, sample_tokens, top_k=5, token_lookup=None, mask_id=None):
        super().__init__()
        self.sample_tokens = sample_tokens
        self.k = top_k
        self.token_lookup = token_lookup
        self.mask_id = mask_id

    def _tokens_by_id(self):
        # Resolved lazily so the module-level default can be (re)defined after
        # the callback has been constructed.
        return id2token if self.token_lookup is None else self.token_lookup

    def decode(self, tokens):
        """Join the tokens for the given ids, skipping id 0."""
        lookup = self._tokens_by_id()
        return " ".join([lookup[t] for t in tokens if t != 0])

    def convert_ids_to_tokens(self, id):
        """Return the token string for a single id."""
        return self._tokens_by_id()[id]

    def on_epoch_end(self, epoch, logs=None):
        # np.asarray accepts NumPy arrays as well as eager tensors, so the
        # callback no longer depends on a global `sample_tokens` tensor.
        sample = np.asarray(self.sample_tokens)
        mask_id = mask_token_id if self.mask_id is None else self.mask_id

        # Column positions of the mask token in the sample
        masked_index = np.where(sample == mask_id)[1]
        if masked_index.size == 0:
            # The sample contains no mask token, so there is nothing to predict.
            return

        prediction = self.model.predict(self.sample_tokens)
        mask_prediction = prediction[0][masked_index]

        # Ids of the k most probable tokens for the first masked position, best first
        top_indices = mask_prediction[0].argsort()[-self.k :][::-1]
        values = mask_prediction[0][top_indices]

        for i in range(len(top_indices)):
            p = top_indices[i]
            v = values[i]
            tokens = np.copy(sample[0])
            tokens[masked_index[0]] = p
            result = {
                "input_text": self.decode(sample[0]),
                "prediction": self.decode(tokens),
                "probability": v,
                "predicted mask token": self.convert_ids_to_tokens(p),
            }
            pprint(result)

In [None]:
## this callback can show us the evolution of our training
# NOTE(review): the sample is encoded with the ADHD vocabulary, while the
# id2token / mask_token_id globals used by MaskedTextGenerator are derived
# from alcoholism_vectorize_layer. Confirm the vocabularies match before
# relying on the printed predictions.
adhd_sample_tokens = adhd_vectorize_layer(["Lately I have been feeling [mask] and I do not know what to do"])
generator_callback = MaskedTextGenerator(adhd_sample_tokens.numpy())

bert_masked_adhd_model = create_masked_language_bert_model()
bert_masked_adhd_model.summary()

Model: "masked_bert_model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 256)]        0           []                               
                                                                                                  
 word_embedding (Embedding)     (None, 256, 128)     3840000     ['input_3[0][0]']                
                                                                                                  
 tf.__operators__.add_6 (TFOpLa  (None, 256, 128)    0           ['word_embedding[0][0]']         
 mbda)                                                                                            
                                                                                                  
 encoder_0/multiheadattention (  (None, 256, 128)    66048       ['tf.__operators_

In [None]:
#bert model on a much smaller dataset

## this callback can show us the evolution of our training
# NOTE: this rebinds generator_callback, replacing the ADHD callback created
# in the previous cell.
alcoholism_sample_tokens = alcoholism_vectorize_layer(["I am so happy to be [mask] now. Daily drinking was ruining my life."])
generator_callback = MaskedTextGenerator(alcoholism_sample_tokens.numpy())

bert_masked_alcoholism_model = create_masked_language_bert_model()
bert_masked_alcoholism_model.summary()

Model: "masked_bert_model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(None, 256)]        0           []                               
                                                                                                  
 word_embedding (Embedding)     (None, 256, 128)     3840000     ['input_4[0][0]']                
                                                                                                  
 tf.__operators__.add_9 (TFOpLa  (None, 256, 128)    0           ['word_embedding[0][0]']         
 mbda)                                                                                            
                                                                                                  
 encoder_0/multiheadattention (  (None, 256, 128)    66048       ['tf.__operators_

## Train and save the model

In [None]:
# unfortunately takes like 15 hours

# bert_masked_adhd_model.fit(mlm_adhd_ds, epochs=2, callbacks=[generator_callback])
# bert_masked_adhd_model.save(data_path + "/adhd/bert_mlm_adhd.h5")

train on the alcoholism dataset, much smaller

In [None]:
# unfortunately takes like 15 hours

# bert_masked_alcoholism_model.fit(mlm_alcoholism_ds, epochs=2, callbacks=[generator_callback])
# bert_masked_alcoholism_model.save(data_path + "/alcoholism/bert_mlm_alcoholism.h5")

In [None]:
for text_batch, label_batch, weights in mlm_adhd_ds.take(1):
    for i in range(1):
        print('text vector\n', text_batch.numpy()[i])
        print('label\n', label_batch.numpy()[i])
        

text vector
 [ 4472    29  9761   176    84   528  4267  7077 13939     1     1  4267
  5831  4472   456  4267   197    33   806   306   345   505  1018 18690
  5300  4472   720  1860  7740    16  1319  2176    74     8     2   800
     6  5300   305   290    42   110   200   206     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0  

### Split Our Data

In [None]:
# Number of batches (config.BATCH_SIZE posts each) in the self-labeled ADHD
# dataset. len() reads the dataset's cardinality directly instead of loading
# every batch into a list just to count them.

DATASET_SIZE = len(mlm_adhd_ds)
DATASET_SIZE

1426

In [None]:
# Train/test split of the self-labeled ADHD data, done at batch granularity
# (DATASET_SIZE is a number of batches).
#
# `mlm_adhd_ds` is built with `.shuffle(1000)`, which by default reshuffles on
# every iteration, so `take()` and `skip()` on it would each see a different
# order and the two splits would overlap. The split is therefore taken from a
# dataset that is shuffled once with a fixed seed, which keeps both sides
# disjoint and reproducible.

train_size = int(0.7 * DATASET_SIZE)
test_size = int(0.3 * DATASET_SIZE)  # informational: the test split is everything after the training batches

adhd_split_ds = (
    tf.data.Dataset.from_tensor_slices(
        (x_masked_adhd_train, y_masked_adhd_labels, adhd_sample_weights))
    .shuffle(1000, seed=0, reshuffle_each_iteration=False)
    .batch(config.BATCH_SIZE)
)

adhd_train_dataset = adhd_split_ds.take(train_size)
adhd_test_dataset = adhd_split_ds.skip(train_size)

print("train", len(adhd_train_dataset))
print("test", len(adhd_test_dataset))

train 998
test 428


In [None]:
# split tensorflow datasets into x and y arrays to use for sklearn

# Iterate over the training dataset exactly once. Iterating separately for x
# and for y can yield the batches in a different order each time when the
# dataset is shuffled, which would pair inputs with the wrong labels.
adhd_train_batches = list(adhd_train_dataset)

# each batch is (masked token ids, label token ids, sample weights)
adhd_train_x = np.array([batch[0].numpy() for batch in adhd_train_batches])
adhd_train_y = np.array([batch[1].numpy() for batch in adhd_train_batches])

adhd_train_x[0], adhd_train_y[0]

(array([[ 5279,  8827,  4267, ...,     0,     0,     0],
        [  275, 27435,  4267, ...,     0,     0,     0],
        [    2,  3000,    19, ...,     0,     0,     0],
        ...,
        [ 1766,   436,  4267, ...,  3258,   184,   206],
        [    3,   107,     3, ...,     0,     0,     0],
        [  213,    40,    49, ...,     0,     0,     0]]),
 array([[ 1503,    46,  1367, ...,     0,     0,     0],
        [    4,   444,    72, ...,     0,     0,     0],
        [   10,   361,  2219, ...,     0,     0,     0],
        ...,
        [   29,   186,   813, ...,     0,     0,     0],
        [  882,   190,   379, ...,     0,     0,     0],
        [  184,  2710, 10017, ...,     0,     0,     0]]))

In [None]:
# just view the length of our dataset, make sure its the right number of training samples

len(adhd_train_x), len(adhd_train_y)

(998, 998)

In [None]:
# PROBLEM
# we have a dataset with a very weird shape, making it difficult to put into
# any sklearn fit() function. We need to reduce the dimensionality of the data
# somehow to be able to train with it

np.array(adhd_train_x).shape, np.array(adhd_train_y).shape

((998, 32, 256), (998, 32, 256))

TODO: next step would be to just throw the training set into a sklearn algo and then
predict on the test set. The problem here is the shape of the data.

kNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# NOTE(review): the recorded run of this cell ends in a ValueError.
# adhd_train_x and adhd_train_y were shown above with shape (998, 32, 256),
# while scikit-learn estimators expect X as a 2-D (n_samples, n_features) array.
knn = KNeighborsClassifier()
knn.fit(adhd_train_x, adhd_train_y)

ValueError: ignored

Support Vector Machine

In [None]:
from sklearn.svm import SVC

# NOTE(review): `degree` only applies to the 'poly' kernel, so it has no effect here.
clf = SVC(C=1.0, kernel='linear', degree=3, gamma='auto')

# NOTE(review): the recorded run ends in a ValueError; the inputs are the
# 3-D arrays of shape (998, 32, 256) shown above, not 2-D (n_samples, n_features).
clf.fit(adhd_train_x, adhd_train_y)

ValueError: ignored

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

# NOTE(review): the recorded run ends in a ValueError; the inputs are the
# 3-D arrays of shape (998, 32, 256) shown above, not 2-D (n_samples, n_features).
lin_svm = OneVsRestClassifier(LinearSVC(random_state=0)).fit(adhd_train_x, adhd_train_y)

ValueError: ignored

# Create a small hand-labelled dataset to test with

In [None]:
''' 
  this step requires repeating the tensorflow dataset creation from earlier
  this time the labels are correct, and we are selecting a much smaller
  random subset of the data for training, which is hand-labeled
'''

# `random` was never imported and `n` was never defined, so this cell raised a
# NameError; the drawn sample was also discarded instead of being stored.
import random

# Number of posts to hand-label.
# TODO: set this to the size of the subset that is actually labelled.
n = 100

# A dedicated, seeded generator draws the same posts on every run without
# touching the global random state.
hand_label_docs = random.Random(0).sample(cleaned_adhd_docs, n)

# make the adhd features into a tensor
adhd_features = tf.constant(cleaned_adhd_docs)
adhd_true_labels = tf.constant([])